From: Visa Hankala <visa@hankala.org>
Subject: Re: kevent(2): make EVFILT_TIMER mp-safe
To: Vitaliy Makkoveev <mvs@openbsd.org>
Cc: tech@openbsd.org
Date: Sun, 11 May 2025 13:31:01 +0000

On Fri, May 09, 2025 at 11:06:38AM +0300, Vitaliy Makkoveev wrote:
> On Fri, May 09, 2025 at 03:34:01PM +0000, Visa Hankala wrote:
> > On Thu, May 08, 2025 at 07:39:12PM +0300, Vitaliy Makkoveev wrote:
> > > On Thu, May 08, 2025 at 02:25:04PM +0000, Visa Hankala wrote:
> > > > On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote:
> > > > > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> > > > > > This is the last kernel-locked filter in kern/kern_event.c.
> > > > > > 
> > > > > > Introduce a 'filt_timer' opaque structure to keep the timeout(9),
> > > > > > the `ft_reschedule' flag and the protecting mutex(9). The
> > > > > > timeout(9) handler can reschedule itself, so `ft_reschedule'
> > > > > > prevents this while we are canceling the timeout. The `ft_mtx'
> > > > > > mutex(9) also protects the re-initialization, because it could be
> > > > > > accessed concurrently from another filt_timermodify() thread.
> > > > 
> > > > Concurrent filt_timermodify() calls cannot happen, though, because
> > > > the kqueue subsystem serializes the same knote's f_attach, f_detach,
> > > > f_modify, and f_process calls automatically.
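> > > > 
> > > > A minimal sketch of how that serialization works (flag and wait
> > > > channel names here are illustrative, not necessarily the exact
> > > > ones in kern_event.c): a caller marks the knote busy, and any
> > > > later caller sleeps until the previous filterops call finishes.
> > > > 
> > > > 	/* Called with kq->kq_lock held. */
> > > > 	while (kn->kn_status & KN_PROCESSING) {
> > > > 		kn->kn_status |= KN_WAITING;
> > > > 		msleep_nsec(kn, &kq->kq_lock, PSOCK, "knotec", INFSLP);
> > > > 	}
> > > > 	kn->kn_status |= KN_PROCESSING;	/* f_* calls now exclusive */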
> > > > 
> > > 
> > > Nice to hear. So I only need serialization with the running timeout handler.
> > 
> > There can be concurrency between the timeout and {f_modify,f_process}.
> > For example, the timeout can hit while the user process is reading
> > the event (f_process call). This is why you should not take shortcuts
> > with the locking. Your previous diff looked better in this respect.
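> > 
> > To make that window concrete, a hypothetical interleaving, which is
> > exactly what a shared `ft_mtx' must exclude:
> > 
> > 	/*
> > 	 * CPU0: f_process                  CPU1: softclock
> > 	 * mtx_enter(&ft->ft_mtx);
> > 	 * active = (kn->kn_data != 0);     filt_timerexpire(kn)
> > 	 * knote_submit(kn, kev);             mtx_enter(&ft->ft_mtx) blocks
> > 	 * mtx_leave(&ft->ft_mtx);            kn->kn_data++ runs only now
> > 	 */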
> > 
> > > I removed recursion from filt_timeradd()/filt_timerexpire() to reduce
> > > `ft_mtx' usage.
> > 
> > If you want to refactor the code, please do so in separate commits
> > so that it is easier to see that the logic remains correct. The gist,
> > and the trickiest part, of EVFILT_TIMER is the timeout and knote
> > handling, and this code should not have unnecessary duplication.
> > 
> > > +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
> > > +	mtx_init(&ft->ft_mtx, IPL_MPSAFE);
> > 
> > Please change this ipl argument to IPL_SOFTCLOCK because the mutex
> > is used in process context and timeout soft interrupt context.
> > IPL_MPSAFE is incorrect (it is not a proper IPL value but a hint to
> > interrupt handlers that the kernel lock is not needed).
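> > 
> > That is, the IPL passed to mtx_init() must block the highest interrupt
> > context that takes the mutex:
> > 
> > 	/* Taken from process context and from the timeout's soft
> > 	 * interrupt context, so the lock must raise to IPL_SOFTCLOCK. */
> > 	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);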
> 
> Done.

Type casts seem to have sneaked back in filt_dotimerexpire() and
filt_timerexpire(). Otherwise looks good.

OK visa@
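
For anyone following along, the filter can be exercised from userspace
through the documented kevent(2) interface; a minimal sketch:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <err.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct kevent kev;
		int kq;

		if ((kq = kqueue()) == -1)
			err(1, "kqueue");

		/* Periodic timer, ident 1, fires every 500 ms. */
		EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
			err(1, "kevent: add");

		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent: wait");
		printf("timer fired, %lld expirations\n", (long long)kev.data);
		return (0);
	}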

> Index: sys/kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.201
> diff -u -p -r1.201 kern_event.c
> --- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
> +++ sys/kern/kern_event.c	9 May 2025 16:06:18 -0000
> @@ -131,6 +131,7 @@ void	filt_sigdetach(struct knote *kn);
>  int	filt_signal(struct knote *kn, long hint);
>  int	filt_fileattach(struct knote *kn);
>  void	filt_timerexpire(void *knx);
> +void	filt_dotimerexpire(struct knote *kn);
>  int	filt_timerattach(struct knote *kn);
>  void	filt_timerdetach(struct knote *kn);
>  int	filt_timermodify(struct kevent *kev, struct knote *kn);
> @@ -172,7 +173,7 @@ const struct filterops file_filtops = {
>  };
>  
>  const struct filterops timer_filtops = {
> -	.f_flags	= 0,
> +	.f_flags	= FILTEROP_MPSAFE,
>  	.f_attach	= filt_timerattach,
>  	.f_detach	= filt_timerdetach,
>  	.f_event	= NULL,
> @@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
>  	.f_process	= filt_timerprocess,
>  };
>  
> +/*
> + * Locking:
> + *	I	immutable after creation
> + *	a	atomic operations
> + *	t	ft_mtx
> + */
> +
>  struct	pool knote_pool;
>  struct	pool kqueue_pool;
>  struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
>  struct	rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
> -int kq_ntimeouts = 0;
> -int kq_timeoutmax = (4 * 1024);
> +int kq_ntimeouts = 0;		/* [a] */
> +int kq_timeoutmax = (4 * 1024);	/* [I] */
>  
>  #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
>  
> @@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t 
>  	return (0);
>  }
>  
> +struct filt_timer {
> +	struct mutex	ft_mtx;
> +	struct timeout	ft_to;		/* [t] */
> +	int		ft_reschedule;	/* [t] */
> +};
> +
>  static void
>  filt_timeradd(struct knote *kn, struct timespec *ts)
>  {
>  	struct timespec expiry, now;
> -	struct timeout *to = kn->kn_hook;
> +	struct filt_timer *ft = kn->kn_hook;
>  	int tticks;
>  
> +	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
> +
> +	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
> +	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
> +
>  	if (kn->kn_sfflags & NOTE_ABSTIME) {
>  		nanotime(&now);
>  		if (timespeccmp(ts, &now, >)) {
>  			timespecsub(ts, &now, &expiry);
>  			/* XXX timeout_abs_ts with CLOCK_REALTIME */
> -			timeout_add(to, tstohz(&expiry));
> +			timeout_add(&ft->ft_to, tstohz(&expiry));
>  		} else {
>  			/* Expire immediately. */
> -			filt_timerexpire(kn);
> +			filt_dotimerexpire(kn);
>  		}
>  		return;
>  	}
>  
>  	tticks = tstohz(ts);
>  	/* Remove extra tick from tstohz() if timeout has fired before. */
> -	if (timeout_triggered(to))
> +	if (timeout_triggered(&ft->ft_to))
>  		tticks--;
> -	timeout_add(to, (tticks > 0) ? tticks : 1);
> +	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
>  }
>  
>  void
> -filt_timerexpire(void *knx)
> +filt_dotimerexpire(struct knote *kn)
>  {
>  	struct timespec ts;
> -	struct knote *kn = knx;
> +	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
>  	struct kqueue *kq = kn->kn_kq;
>  
> +	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
> +
>  	kn->kn_data++;
> +
>  	mtx_enter(&kq->kq_lock);
>  	knote_activate(kn);
>  	mtx_leave(&kq->kq_lock);
>  
> -	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
> -	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
> +	if (ft->ft_reschedule) {
>  		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
>  		filt_timeradd(kn, &ts);
>  	}
>  }
>  
> +void
> +filt_timerexpire(void *knx)
> +{
> +	struct knote *kn = knx;
> +	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
> +
> +	mtx_enter(&ft->ft_mtx);
> +	filt_dotimerexpire(kn);
> +	mtx_leave(&ft->ft_mtx);
> +}
> +
>  /*
>   * data contains amount of time to sleep
>   */
> @@ -651,23 +683,29 @@ int
>  filt_timerattach(struct knote *kn)
>  {
>  	struct timespec ts;
> -	struct timeout *to;
> +	struct filt_timer *ft;
>  	int error;
>  
>  	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
>  	if (error != 0)
>  		return (error);
>  
> -	if (kq_ntimeouts > kq_timeoutmax)
> +	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
> +		atomic_dec_int(&kq_ntimeouts);
>  		return (ENOMEM);
> -	kq_ntimeouts++;
> +	}
>  
>  	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
>  		kn->kn_flags |= EV_CLEAR;	/* automatically set */
> -	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
> -	timeout_set(to, filt_timerexpire, kn);
> -	kn->kn_hook = to;
> +
> +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
> +	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);
> +	timeout_set(&ft->ft_to, filt_timerexpire, kn);
> +	kn->kn_hook = ft;
> +
> +	mtx_enter(&ft->ft_mtx);
>  	filt_timeradd(kn, &ts);
> +	mtx_leave(&ft->ft_mtx);
>  
>  	return (0);
>  }
> @@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
>  void
>  filt_timerdetach(struct knote *kn)
>  {
> -	struct timeout *to;
> +	struct filt_timer *ft = kn->kn_hook;
>  
> -	to = (struct timeout *)kn->kn_hook;
> -	timeout_del_barrier(to);
> -	free(to, M_KEVENT, sizeof(*to));
> -	kq_ntimeouts--;
> +	mtx_enter(&ft->ft_mtx);
> +	ft->ft_reschedule = 0;
> +	mtx_leave(&ft->ft_mtx);
> +
> +	timeout_del_barrier(&ft->ft_to);
> +	free(ft, M_KEVENT, sizeof(*ft));
> +	atomic_dec_int(&kq_ntimeouts);
>  }
>  
>  int
> @@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
>  {
>  	struct timespec ts;
>  	struct kqueue *kq = kn->kn_kq;
> -	struct timeout *to = kn->kn_hook;
> +	struct filt_timer *ft = kn->kn_hook;
>  	int error;
>  
>  	error = filt_timervalidate(kev->fflags, kev->data, &ts);
> @@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
>  	}
>  
>  	/* Reset the timer. Any pending events are discarded. */
> +	mtx_enter(&ft->ft_mtx);
> +	ft->ft_reschedule = 0;
> +	mtx_leave(&ft->ft_mtx);
>  
> -	timeout_del_barrier(to);
> +	timeout_del_barrier(&ft->ft_to);
>  
> +	mtx_enter(&ft->ft_mtx);
>  	mtx_enter(&kq->kq_lock);
>  	if (kn->kn_status & KN_QUEUED)
>  		knote_dequeue(kn);
> @@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
>  	kn->kn_data = 0;
>  	knote_assign(kev, kn);
>  	/* Reinit timeout to invoke tick adjustment again. */
> -	timeout_set(to, filt_timerexpire, kn);
> +	timeout_set(&ft->ft_to, filt_timerexpire, kn);
>  	filt_timeradd(kn, &ts);
> +	mtx_leave(&ft->ft_mtx);
>  
>  	return (0);
>  }
> @@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
>  int
>  filt_timerprocess(struct knote *kn, struct kevent *kev)
>  {
> -	int active, s;
> +	struct filt_timer *ft = kn->kn_hook;
> +	int active;
>  
> -	s = splsoftclock();
> +	mtx_enter(&ft->ft_mtx);
>  	active = (kn->kn_data != 0);
>  	if (active)
>  		knote_submit(kn, kev);
> -	splx(s);
> +	mtx_leave(&ft->ft_mtx);
>  
>  	return (active);
>  }
>