
From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: Re: kevent(2): make EVFILT_TIMER mp-safe
To: Visa Hankala <visa@hankala.org>
Cc: tech@openbsd.org
Date: Fri, 9 May 2025 11:06:38 +0300

    On Fri, May 09, 2025 at 03:34:01PM +0000, Visa Hankala wrote:
    > On Thu, May 08, 2025 at 07:39:12PM +0300, Vitaliy Makkoveev wrote:
    > > On Thu, May 08, 2025 at 02:25:04PM +0000, Visa Hankala wrote:
    > > > On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote:
    > > > > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
    > > > > > This is the last kernel-locked filter in kern/kern_event.c.
    > > > > > 
    > > > > > Introduce a 'filt_timer' opaque structure to hold the timeout(9),
    > > > > > the `ft_reschedule' flag and the protecting mutex(9). The
    > > > > > timeout(9) handler could reschedule itself, so `ft_reschedule'
    > > > > > prevents this while we are canceling the timeout. The `ft_mtx'
    > > > > > mutex(9) also protects the re-initialization, because the
    > > > > > structure could be accessed concurrently from another
    > > > > > filt_timermodify() thread.
    > > > 
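    For reference, the structure described above, as it is defined in
    the diff at the end of this mail:

	struct filt_timer {
		struct mutex	ft_mtx;
		struct timeout	ft_to;		/* [t] */
		int		ft_reschedule;	/* [t] */
	};
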
    > > > Concurrent filt_timermodify() calls cannot happen, though, because
    > > > the kqueue subsystem serializes the same knote's f_attach, f_detach,
    > > > f_modify, and f_process calls automatically.
    > > > 
    > > 
    > > Nice to hear. So I only need serialization against a running timeout handler.
    > 
    > There can be concurrency between the timeout and {f_modify,f_process}.
    > For example, the timeout can hit while the user process is reading
    > the event (f_process call). This is why you should not take shortcuts
    > with the locking. Your previous diff looked better in this respect.
    > 
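    Right, in the diff below both paths take `ft_mtx', so a timeout
    firing while the event is being read cannot race on `kn_data'.
    Condensed from the diff (names and logic taken from it, not a
    separate implementation):

	/* timeout soft interrupt context */
	void
	filt_timerexpire(void *knx)
	{
		struct knote *kn = knx;
		struct filt_timer *ft = kn->kn_hook;

		mtx_enter(&ft->ft_mtx);
		filt_dotimerexpire(kn);	/* kn_data++, knote_activate() */
		mtx_leave(&ft->ft_mtx);
	}

	/* process context; serialized against f_modify by the kqueue
	 * subsystem, but not against the timeout */
	int
	filt_timerprocess(struct knote *kn, struct kevent *kev)
	{
		struct filt_timer *ft = kn->kn_hook;
		int active;

		mtx_enter(&ft->ft_mtx);	/* excludes a running timeout */
		active = (kn->kn_data != 0);
		if (active)
			knote_submit(kn, kev);
		mtx_leave(&ft->ft_mtx);

		return (active);
	}
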
    > > I removed recursion from filt_timeradd()/filt_timerexpire() to reduce
    > > `ft_mtx' usage.
    > 
    > If you want to refactor the code, please do so in separate commits
    > so that it is easier to see that the logic remains correct. The gist
    > and the trickiest part of EVFILT_TIMER is the timeout and knote
    > handling, and this code should not have unnecessary duplication.
    > 
    > > +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
    > > +	mtx_init(&ft->ft_mtx, IPL_MPSAFE);
    > 
    > Please change this ipl argument to IPL_SOFTCLOCK because the mutex
    > is used in process context and timeout soft interrupt context.
    > IPL_MPSAFE is incorrect (it is not a proper IPL value but a hint to
    > interrupt handlers that the kernel lock is not needed).
    
    Done.
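
    For clarity, the resulting initialization, as it appears in the
    diff below; IPL_SOFTCLOCK blocks the timeout soft interrupt while
    the mutex is held, which is exactly the exclusion needed here:

	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
	/* filt_timerexpire() runs from the timeout soft interrupt
	 * and takes this mutex, so interlock at IPL_SOFTCLOCK. */
	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);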
    
    Index: sys/kern/kern_event.c
    ===================================================================
    RCS file: /cvs/src/sys/kern/kern_event.c,v
    retrieving revision 1.201
    diff -u -p -r1.201 kern_event.c
    --- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
    +++ sys/kern/kern_event.c	9 May 2025 16:06:18 -0000
    @@ -131,6 +131,7 @@ void	filt_sigdetach(struct knote *kn);
     int	filt_signal(struct knote *kn, long hint);
     int	filt_fileattach(struct knote *kn);
     void	filt_timerexpire(void *knx);
    +void	filt_dotimerexpire(struct knote *kn);
     int	filt_timerattach(struct knote *kn);
     void	filt_timerdetach(struct knote *kn);
     int	filt_timermodify(struct kevent *kev, struct knote *kn);
    @@ -172,7 +173,7 @@ const struct filterops file_filtops = {
     };
     
     const struct filterops timer_filtops = {
    -	.f_flags	= 0,
    +	.f_flags	= FILTEROP_MPSAFE,
     	.f_attach	= filt_timerattach,
     	.f_detach	= filt_timerdetach,
     	.f_event	= NULL,
    @@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
     	.f_process	= filt_timerprocess,
     };
     
    +/*
    + * Locking:
    + *	I	immutable after creation
    + *	a	atomic operations
    + *	t	ft_mtx
    + */
    +
     struct	pool knote_pool;
     struct	pool kqueue_pool;
     struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
     struct	rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
    -int kq_ntimeouts = 0;
    -int kq_timeoutmax = (4 * 1024);
    +int kq_ntimeouts = 0;		/* [a] */
    +int kq_timeoutmax = (4 * 1024);	/* [I] */
     
     #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
     
    @@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t 
     	return (0);
     }
     
    +struct filt_timer {
    +	struct mutex	ft_mtx;
    +	struct timeout	ft_to;		/* [t] */
    +	int		ft_reschedule;	/* [t] */
    +};
    +
     static void
     filt_timeradd(struct knote *kn, struct timespec *ts)
     {
     	struct timespec expiry, now;
    -	struct timeout *to = kn->kn_hook;
    +	struct filt_timer *ft = kn->kn_hook;
     	int tticks;
     
    +	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
    +
    +	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
    +	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
    +
     	if (kn->kn_sfflags & NOTE_ABSTIME) {
     		nanotime(&now);
     		if (timespeccmp(ts, &now, >)) {
     			timespecsub(ts, &now, &expiry);
     			/* XXX timeout_abs_ts with CLOCK_REALTIME */
    -			timeout_add(to, tstohz(&expiry));
    +			timeout_add(&ft->ft_to, tstohz(&expiry));
     		} else {
     			/* Expire immediately. */
    -			filt_timerexpire(kn);
    +			filt_dotimerexpire(kn);
     		}
     		return;
     	}
     
     	tticks = tstohz(ts);
     	/* Remove extra tick from tstohz() if timeout has fired before. */
    -	if (timeout_triggered(to))
    +	if (timeout_triggered(&ft->ft_to))
     		tticks--;
    -	timeout_add(to, (tticks > 0) ? tticks : 1);
    +	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
     }
     
     void
    -filt_timerexpire(void *knx)
    +filt_dotimerexpire(struct knote *kn)
     {
     	struct timespec ts;
    -	struct knote *kn = knx;
    +	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
     	struct kqueue *kq = kn->kn_kq;
     
    +	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
    +
     	kn->kn_data++;
    +
     	mtx_enter(&kq->kq_lock);
     	knote_activate(kn);
     	mtx_leave(&kq->kq_lock);
     
    -	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
    -	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
    +	if (ft->ft_reschedule) {
     		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
     		filt_timeradd(kn, &ts);
     	}
     }
     
    +void
    +filt_timerexpire(void *knx)
    +{
    +	struct knote *kn = knx;
    +	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
    +
    +	mtx_enter(&ft->ft_mtx);
    +	filt_dotimerexpire(kn);
    +	mtx_leave(&ft->ft_mtx);
    +}
    +
     /*
      * data contains amount of time to sleep
      */
    @@ -651,23 +683,29 @@ int
     filt_timerattach(struct knote *kn)
     {
     	struct timespec ts;
    -	struct timeout *to;
    +	struct filt_timer *ft;
     	int error;
     
     	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
     	if (error != 0)
     		return (error);
     
    -	if (kq_ntimeouts > kq_timeoutmax)
    +	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
    +		atomic_dec_int(&kq_ntimeouts);
     		return (ENOMEM);
    -	kq_ntimeouts++;
    +	}
     
     	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
     		kn->kn_flags |= EV_CLEAR;	/* automatically set */
    -	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
    -	timeout_set(to, filt_timerexpire, kn);
    -	kn->kn_hook = to;
    +
    +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
    +	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);
    +	timeout_set(&ft->ft_to, filt_timerexpire, kn);
    +	kn->kn_hook = ft;
    +
    +	mtx_enter(&ft->ft_mtx);
     	filt_timeradd(kn, &ts);
    +	mtx_leave(&ft->ft_mtx);
     
     	return (0);
     }
    @@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
     void
     filt_timerdetach(struct knote *kn)
     {
    -	struct timeout *to;
    +	struct filt_timer *ft = kn->kn_hook;
     
    -	to = (struct timeout *)kn->kn_hook;
    -	timeout_del_barrier(to);
    -	free(to, M_KEVENT, sizeof(*to));
    -	kq_ntimeouts--;
    +	mtx_enter(&ft->ft_mtx);
    +	ft->ft_reschedule = 0;
    +	mtx_leave(&ft->ft_mtx);
    +
    +	timeout_del_barrier(&ft->ft_to);
    +	free(ft, M_KEVENT, sizeof(*ft));
    +	atomic_dec_int(&kq_ntimeouts);
     }
     
     int
    @@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
     {
     	struct timespec ts;
     	struct kqueue *kq = kn->kn_kq;
    -	struct timeout *to = kn->kn_hook;
    +	struct filt_timer *ft = kn->kn_hook;
     	int error;
     
     	error = filt_timervalidate(kev->fflags, kev->data, &ts);
    @@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
     	}
     
     	/* Reset the timer. Any pending events are discarded. */
    +	mtx_enter(&ft->ft_mtx);
    +	ft->ft_reschedule = 0;
    +	mtx_leave(&ft->ft_mtx);
     
    -	timeout_del_barrier(to);
    +	timeout_del_barrier(&ft->ft_to);
     
    +	mtx_enter(&ft->ft_mtx);
     	mtx_enter(&kq->kq_lock);
     	if (kn->kn_status & KN_QUEUED)
     		knote_dequeue(kn);
    @@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
     	kn->kn_data = 0;
     	knote_assign(kev, kn);
     	/* Reinit timeout to invoke tick adjustment again. */
    -	timeout_set(to, filt_timerexpire, kn);
    +	timeout_set(&ft->ft_to, filt_timerexpire, kn);
     	filt_timeradd(kn, &ts);
    +	mtx_leave(&ft->ft_mtx);
     
     	return (0);
     }
    @@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
     int
     filt_timerprocess(struct knote *kn, struct kevent *kev)
     {
    -	int active, s;
    +	struct filt_timer *ft = kn->kn_hook;
    +	int active;
     
    -	s = splsoftclock();
    +	mtx_enter(&ft->ft_mtx);
     	active = (kn->kn_data != 0);
     	if (active)
     		knote_submit(kn, kev);
    -	splx(s);
    +	mtx_leave(&ft->ft_mtx);
     
     	return (active);
     }
    
    
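    For testing, a minimal userland consumer of EVFILT_TIMER (standard
    kevent(2) usage, independent of this diff): it arms a periodic
    500 ms timer and reports expirations. With fflags set to 0 the
    data field is in milliseconds, and on return it holds the number
    of expirations since the event was last read.

	#include <sys/types.h>
	#include <sys/event.h>

	#include <err.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct kevent kev;
		int kq, n;

		if ((kq = kqueue()) == -1)
			err(1, "kqueue");

		/* Arm a periodic timer: ident 1, period 500 ms. */
		EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
			err(1, "kevent");

		for (;;) {
			n = kevent(kq, NULL, 0, &kev, 1, NULL);
			if (n == -1)
				err(1, "kevent");
			printf("timer fired, %lld expiration(s)\n",
			    (long long)kev.data);
		}
	}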