From: Visa Hankala Subject: Re: kevent(2): make EVFILT_TIMER mp-safe To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Thu, 8 May 2025 14:25:04 +0000 On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote: > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote: > > The last one kernel locked from kern/kern_event.c. > > > > Introduce 'filt_timer' opaque structure to keep the timeout(9), the > > `ft_reschedule' flag and protecting mutex(9). The timeout(9) handler > > could reschedule itself, so `ft_reschedule' is used to prevent this while we > > are canceling the timeout. The `ft_mtx' mutex(9) also protects the > > re-initialization, because it could be concurrently accessed from > > another filt_timermodify() thread. Concurrent filt_timermodify() calls cannot happen, though, because the kqueue subsystem serializes the same knote's f_attach, f_detach, f_modify, and f_process calls automatically. > > > > Note, this diff keeps the timeout handler under kernel lock. We don't > > touch kernel lock while we perform timeout_add(9) or > > timeout_del_barrier(9) on non mp-safe timeouts, so kevent(2) will not > > get stuck in kernel lock, meanwhile the timeout firing will take kernel lock > > in most cases even for mp-safe timeouts. > > > > Oops, forgot to remove "kq_ntimeouts++;". 
> > Index: sys/kern/kern_event.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_event.c,v > retrieving revision 1.201 > diff -u -p -r1.201 kern_event.c > --- sys/kern/kern_event.c 10 Feb 2025 16:45:46 -0000 1.201 > +++ sys/kern/kern_event.c 8 May 2025 00:41:41 -0000 > @@ -131,6 +131,7 @@ void filt_sigdetach(struct knote *kn); > int filt_signal(struct knote *kn, long hint); > int filt_fileattach(struct knote *kn); > void filt_timerexpire(void *knx); > +void filt_dotimerexpire(struct knote *kn); > int filt_timerattach(struct knote *kn); > void filt_timerdetach(struct knote *kn); > int filt_timermodify(struct kevent *kev, struct knote *kn); > @@ -172,7 +173,7 @@ const struct filterops file_filtops = { > }; > > const struct filterops timer_filtops = { > - .f_flags = 0, > + .f_flags = FILTEROP_MPSAFE, > .f_attach = filt_timerattach, > .f_detach = filt_timerdetach, > .f_event = NULL, > @@ -180,12 +181,19 @@ const struct filterops timer_filtops = { > .f_process = filt_timerprocess, > }; > > +/* > + * Locking: > + * I immutable after creation > + * a atomic operations > + * t ft_mtx > + */ > + > struct pool knote_pool; > struct pool kqueue_pool; > struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR); > struct rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl"); > -int kq_ntimeouts = 0; > -int kq_timeoutmax = (4 * 1024); > +int kq_ntimeouts = 0; /* [a] */ > +int kq_timeoutmax = (4 * 1024); /* [I] */ > > #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) > > @@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t > return (0); > } > > +struct filt_timer { > + struct mutex ft_mtx; > + struct timeout ft_to; /* [t] */ > + int ft_reschedule; /* [t] */ > +}; > + > static void > filt_timeradd(struct knote *kn, struct timespec *ts) > { > struct timespec expiry, now; > - struct timeout *to = kn->kn_hook; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; Please leave out these explicit 
type casts when getting ft from kn_hook. The casts are not required by the C language, and hence mainly add visual clutter (and make the already weak type checking even weaker by allowing more types as the source, should there be a mishap with the choice of the right-hand side variable). > int tticks; > > + MUTEX_ASSERT_LOCKED(&ft->ft_mtx); > + > + ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 && > + (kn->kn_sfflags & NOTE_ABSTIME) == 0); > + > if (kn->kn_sfflags & NOTE_ABSTIME) { > nanotime(&now); > if (timespeccmp(ts, &now, >)) { > timespecsub(ts, &now, &expiry); > /* XXX timeout_abs_ts with CLOCK_REALTIME */ > - timeout_add(to, tstohz(&expiry)); > + timeout_add(&ft->ft_to, tstohz(&expiry)); > } else { > /* Expire immediately. */ > - filt_timerexpire(kn); > + filt_dotimerexpire(kn); > } > return; > } > > tticks = tstohz(ts); > /* Remove extra tick from tstohz() if timeout has fired before. */ > - if (timeout_triggered(to)) > + if (timeout_triggered(&ft->ft_to)) > tticks--; > - timeout_add(to, (tticks > 0) ? tticks : 1); > + timeout_add(&ft->ft_to, (tticks > 0) ? 
tticks : 1); > } > > void > -filt_timerexpire(void *knx) > +filt_dotimerexpire(struct knote *kn) > { > struct timespec ts; > - struct knote *kn = knx; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > struct kqueue *kq = kn->kn_kq; > > + MUTEX_ASSERT_LOCKED(&ft->ft_mtx); > + > kn->kn_data++; > + > mtx_enter(&kq->kq_lock); > knote_activate(kn); > mtx_leave(&kq->kq_lock); > > - if ((kn->kn_flags & EV_ONESHOT) == 0 && > - (kn->kn_sfflags & NOTE_ABSTIME) == 0) { > + if (ft->ft_reschedule) { > (void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts); > filt_timeradd(kn, &ts); > } > } > > +void > +filt_timerexpire(void *knx) > +{ > + struct knote *kn = knx; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > + > + mtx_enter(&ft->ft_mtx); > + filt_dotimerexpire(kn); > + mtx_leave(&ft->ft_mtx); > +} > + > /* > * data contains amount of time to sleep > */ > @@ -651,23 +683,29 @@ int > filt_timerattach(struct knote *kn) > { > struct timespec ts; > - struct timeout *to; > + struct filt_timer *ft; > int error; > > error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts); > if (error != 0) > return (error); > > - if (kq_ntimeouts > kq_timeoutmax) > + if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) { > + atomic_dec_int(&kq_ntimeouts); > return (ENOMEM); > - kq_ntimeouts++; > + } > > if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) > kn->kn_flags |= EV_CLEAR; /* automatically set */ > - to = malloc(sizeof(*to), M_KEVENT, M_WAITOK); > - timeout_set(to, filt_timerexpire, kn); > - kn->kn_hook = to; > + > + ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK); > + mtx_init(&ft->ft_mtx, IPL_MPSAFE); Shouldn't the level be IPL_SOFTCLOCK? 
> + timeout_set(&ft->ft_to, filt_timerexpire, kn); > + kn->kn_hook = ft; > + > + mtx_enter(&ft->ft_mtx); > filt_timeradd(kn, &ts); > + mtx_leave(&ft->ft_mtx); > > return (0); > } > @@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn) > void > filt_timerdetach(struct knote *kn) > { > - struct timeout *to; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > > - to = (struct timeout *)kn->kn_hook; > - timeout_del_barrier(to); > - free(to, M_KEVENT, sizeof(*to)); > - kq_ntimeouts--; > + mtx_enter(&ft->ft_mtx); > + ft->ft_reschedule = 0; > + mtx_leave(&ft->ft_mtx); > + > + timeout_del_barrier(&ft->ft_to); > + free(ft, M_KEVENT, sizeof(*ft)); > + atomic_dec_int(&kq_ntimeouts); > } > > int > @@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str > { > struct timespec ts; > struct kqueue *kq = kn->kn_kq; > - struct timeout *to = kn->kn_hook; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > int error; > > error = filt_timervalidate(kev->fflags, kev->data, &ts); > @@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str > } > > /* Reset the timer. Any pending events are discarded. */ > + mtx_enter(&ft->ft_mtx); > + ft->ft_reschedule = 0; > + mtx_leave(&ft->ft_mtx); > > - timeout_del_barrier(to); > + timeout_del_barrier(&ft->ft_to); > > + mtx_enter(&ft->ft_mtx); > mtx_enter(&kq->kq_lock); > if (kn->kn_status & KN_QUEUED) > knote_dequeue(kn); > @@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str > kn->kn_data = 0; > knote_assign(kev, kn); > /* Reinit timeout to invoke tick adjustment again. 
*/ > - timeout_set(to, filt_timerexpire, kn); > + timeout_set(&ft->ft_to, filt_timerexpire, kn); > filt_timeradd(kn, &ts); > + mtx_leave(&ft->ft_mtx); > > return (0); > } > @@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str > int > filt_timerprocess(struct knote *kn, struct kevent *kev) > { > - int active, s; > + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook; > + int active; > > - s = splsoftclock(); > + mtx_enter(&ft->ft_mtx); > active = (kn->kn_data != 0); > if (active) > knote_submit(kn, kev); > - splx(s); > + mtx_leave(&ft->ft_mtx); > > return (active); > } >