From: Visa Hankala Subject: Re: kevent(2): make EVFILT_TIMER mp-safe To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Thu, 8 May 2025 14:25:04 +0000 On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote: > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote: > > The last one kernel locked from kern/kern_event.c. > > > > Introduce 'filt_timer' opaque structure to keep the timeout(9), the > > `ft_reschedule' flag and protecting mutex(9). The timeout(9) handler > > could reschedule itself, so `ft_reschedule' is used to prevent this while we > > are canceling the timeout. The `ft_mtx' mutex(9) also protects the > > re-initialization, because it could be concurrently accessed from > > another filt_timermodify() thread. Concurrent filt_timermodify() calls cannot happen, though, because the kqueue subsystem serializes the same knote's f_attach, f_detach, f_modify, and f_process calls automatically. > > > > Note, this diff keeps the timeout handler under kernel lock. We don't > > touch kernel lock while we perform timeout_add(9) or > > timeout_del_barrier(9) on non mp-safe timeouts, so kevent(2) will not > > get stuck in kernel lock, meanwhile the timeout firing will take kernel lock > > in most cases even for mp-safe timeouts. > > > > Oops, forgot to remove "kq_ntimeouts++;". 
> > Index: sys/kern/kern_event.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_event.c,v > retrieving revision 1.201 > diff -u -p -r1.201 kern_event.c > --- sys/kern/kern_event.c 10 Feb 2025 16:45:46 -0000 1.201 > +++ sys/kern/kern_event.c 8 May 2025 00:41:41 -0000 > @@ -131,6 +131,7 @@ void filt_sigdetach(struct knote *kn); > int filt_signal(struct knote *kn, long hint); > int filt_fileattach(struct knote *kn); > void filt_timerexpire(void *knx); > +void filt_dotimerexpire(struct knote *kn); > int filt_timerattach(struct knote *kn); > void filt_timerdetach(struct knote *kn); > int filt_timermodify(struct kevent *kev, struct knote *kn); > @@ -172,7 +173,7 @@ const struct filterops file_filtops = { > }; > > const struct filterops timer_filtops = { > - .f_flags = 0, > + .f_flags = FILTEROP_MPSAFE, > .f_attach = filt_timerattach, > .f_detach = filt_timerdetach, > .f_event = NULL, > @@ -180,12 +181,19 @@ const struct filterops timer_filtops = { > .f_process = filt_timerprocess, > }; > > +/* > + * Locking: > + * I immutable after creation > + * a atomic operations > + * t ft_mtx > + */ > + > struct pool knote_pool; > struct pool kqueue_pool; > struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR); > struct rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl"); > -int kq_ntimeouts = 0; > -int kq_timeoutmax = (4 * 1024); > +int kq_ntimeouts = 0; /* [a] */ > +int kq_timeoutmax = (4 * 1024); /* [I] */ > > #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) > > @@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t > return (0); > } > > +struct filt_timer { > + struct mutex ft_mtx; > + struct timeout ft_to; /* [t] */ > + int ft_reschedule; /* [t] */ > +}; > + > static void > filt_timeradd(struct knote *kn, struct timespec *ts) > { > struct timespec expiry, now; > - struct timeout *to = kn->kn_hook; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; Please leave out these explicit 
type casts when getting ft from kn_hook. The casts are not required by the C language, and hence mainly add visual clutter (and make the already weak type checking even weaker by allowing more types as the source, should there be a mishap with the choice of the right-hand side variable). > int tticks; > > + MUTEX_ASSERT_LOCKED(&ft->ft_mtx); > + > + ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 && > + (kn->kn_sfflags & NOTE_ABSTIME) == 0); > + > if (kn->kn_sfflags & NOTE_ABSTIME) { > nanotime(&now); > if (timespeccmp(ts, &now, >)) { > timespecsub(ts, &now, &expiry); > /* XXX timeout_abs_ts with CLOCK_REALTIME */ > - timeout_add(to, tstohz(&expiry)); > + timeout_add(&ft->ft_to, tstohz(&expiry)); > } else { > /* Expire immediately. */ > - filt_timerexpire(kn); > + filt_dotimerexpire(kn); > } > return; > } > > tticks = tstohz(ts); > /* Remove extra tick from tstohz() if timeout has fired before. */ > - if (timeout_triggered(to)) > + if (timeout_triggered(&ft->ft_to)) > tticks--; > - timeout_add(to, (tticks > 0) ? tticks : 1); > + timeout_add(&ft->ft_to, (tticks > 0) ? 
tticks : 1); > } > > void > -filt_timerexpire(void *knx) > +filt_dotimerexpire(struct knote *kn) > { > struct timespec ts; > - struct knote *kn = knx; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > struct kqueue *kq = kn->kn_kq; > > + MUTEX_ASSERT_LOCKED(&ft->ft_mtx); > + > kn->kn_data++; > + > mtx_enter(&kq->kq_lock); > knote_activate(kn); > mtx_leave(&kq->kq_lock); > > - if ((kn->kn_flags & EV_ONESHOT) == 0 && > - (kn->kn_sfflags & NOTE_ABSTIME) == 0) { > + if (ft->ft_reschedule) { > (void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts); > filt_timeradd(kn, &ts); > } > } > > +void > +filt_timerexpire(void *knx) > +{ > + struct knote *kn = knx; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > + > + mtx_enter(&ft->ft_mtx); > + filt_dotimerexpire(kn); > + mtx_leave(&ft->ft_mtx); > +} > + > /* > * data contains amount of time to sleep > */ > @@ -651,23 +683,29 @@ int > filt_timerattach(struct knote *kn) > { > struct timespec ts; > - struct timeout *to; > + struct filt_timer *ft; > int error; > > error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts); > if (error != 0) > return (error); > > - if (kq_ntimeouts > kq_timeoutmax) > + if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) { > + atomic_dec_int(&kq_ntimeouts); > return (ENOMEM); > - kq_ntimeouts++; > + } > > if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) > kn->kn_flags |= EV_CLEAR; /* automatically set */ > - to = malloc(sizeof(*to), M_KEVENT, M_WAITOK); > - timeout_set(to, filt_timerexpire, kn); > - kn->kn_hook = to; > + > + ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK); > + mtx_init(&ft->ft_mtx, IPL_MPSAFE); Shouldn't the level be IPL_SOFTCLOCK? 
> + timeout_set(&ft->ft_to, filt_timerexpire, kn); > + kn->kn_hook = ft; > + > + mtx_enter(&ft->ft_mtx); > filt_timeradd(kn, &ts); > + mtx_leave(&ft->ft_mtx); > > return (0); > } > @@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn) > void > filt_timerdetach(struct knote *kn) > { > - struct timeout *to; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > > - to = (struct timeout *)kn->kn_hook; > - timeout_del_barrier(to); > - free(to, M_KEVENT, sizeof(*to)); > - kq_ntimeouts--; > + mtx_enter(&ft->ft_mtx); > + ft->ft_reschedule = 0; > + mtx_leave(&ft->ft_mtx); > + > + timeout_del_barrier(&ft->ft_to); > + free(ft, M_KEVENT, sizeof(*ft)); > + atomic_dec_int(&kq_ntimeouts); > } > > int > @@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str > { > struct timespec ts; > struct kqueue *kq = kn->kn_kq; > - struct timeout *to = kn->kn_hook; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > int error; > > error = filt_timervalidate(kev->fflags, kev->data, &ts); > @@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str > } > > /* Reset the timer. Any pending events are discarded. */ > + mtx_enter(&ft->ft_mtx); > + ft->ft_reschedule = 0; > + mtx_leave(&ft->ft_mtx); > > - timeout_del_barrier(to); > + timeout_del_barrier(&ft->ft_to); > > + mtx_enter(&ft->ft_mtx); > mtx_enter(&kq->kq_lock); > if (kn->kn_status & KN_QUEUED) > knote_dequeue(kn); > @@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str > kn->kn_data = 0; > knote_assign(kev, kn); > /* Reinit timeout to invoke tick adjustment again. 
*/ > - timeout_set(to, filt_timerexpire, kn); > + timeout_set(&ft->ft_to, filt_timerexpire, kn); > filt_timeradd(kn, &ts); > + mtx_leave(&ft->ft_mtx); > > return (0); > } > @@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str > int > filt_timerprocess(struct knote *kn, struct kevent *kev) > { > - int active, s; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > + int active; > > - s = splsoftclock(); > + mtx_enter(&ft->ft_mtx); > active = (kn->kn_data != 0); > if (active) > knote_submit(kn, kev); > - splx(s); > + mtx_leave(&ft->ft_mtx); > > return (active); > } >