From: Visa Hankala
Subject: Re: kevent(2): make EVFILT_TIMER mp-safe
To: Vitaliy Makkoveev
Cc: tech@openbsd.org
Date: Sun, 11 May 2025 13:31:01 +0000

On Fri, May 09, 2025 at 11:06:38AM +0300, Vitaliy Makkoveev wrote:
> On Fri, May 09, 2025 at 03:34:01PM +0000, Visa Hankala wrote:
> > On Thu, May 08, 2025 at 07:39:12PM +0300, Vitaliy Makkoveev wrote:
> > > On Thu, May 08, 2025 at 02:25:04PM +0000, Visa Hankala wrote:
> > > > On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote:
> > > > > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> > > > > > The last kernel-locked one from kern/kern_event.c.
> > > > > >
> > > > > > Introduce an opaque 'filt_timer' structure to hold the
> > > > > > timeout(9), the `ft_reschedule' flag and the protecting
> > > > > > mutex(9). The timeout(9) handler could reschedule itself, so
> > > > > > `ft_reschedule' is used to prevent this while we are cancelling
> > > > > > the timeout. The `ft_mtx' mutex(9) also protects the
> > > > > > re-initialization, because it could be accessed concurrently
> > > > > > from another filt_timermodify() thread.
> > > >
> > > > Concurrent filt_timermodify() calls cannot happen, though, because
> > > > the kqueue subsystem serializes the same knote's f_attach,
> > > > f_detach, f_modify, and f_process calls automatically.
> > >
> > > Nice to hear. So I need serialization only with the running timeout
> > > handler.
> >
> > There can be concurrency between the timeout and {f_modify,f_process}.
> > For example, the timeout can hit while the user process is reading
> > the event (f_process call). This is why you should not take shortcuts
> > with the locking. Your previous diff looked better in this respect.
> >
> > > I removed recursion from filt_timeradd()/filt_timerexpire() to
> > > reduce `ft_mtx' usage.
> >
> > If you want to refactor the code, please do so in separate commits
> > so that it is easier to see that the logic remains correct. The gist
> > and the trickiest part of EVFILT_TIMER is the timeout and knote
> > handling, and this code should not have unnecessary duplication.
> >
> > > +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
> > > +	mtx_init(&ft->ft_mtx, IPL_MPSAFE);
> >
> > Please change this ipl argument to IPL_SOFTCLOCK because the mutex
> > is used in process context and in timeout soft interrupt context.
> > IPL_MPSAFE is incorrect (it is not a proper IPL value but a hint to
> > interrupt handlers that the kernel lock is not needed).
>
> Done.

Type casts seem to have sneaked back into filt_dotimerexpire() and
filt_timerexpire(). Otherwise looks good.
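For reference, kn_hook is a void pointer, so those two assignments do
not need the explicit (struct filt_timer *) casts and can read like
the rest of the diff:

	struct filt_timer *ft = kn->kn_hook;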
OK visa@

> Index: sys/kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.201
> diff -u -p -r1.201 kern_event.c
> --- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
> +++ sys/kern/kern_event.c	9 May 2025 16:06:18 -0000
> @@ -131,6 +131,7 @@ void	filt_sigdetach(struct knote *kn);
>  int	filt_signal(struct knote *kn, long hint);
>  int	filt_fileattach(struct knote *kn);
>  void	filt_timerexpire(void *knx);
> +void	filt_dotimerexpire(struct knote *kn);
>  int	filt_timerattach(struct knote *kn);
>  void	filt_timerdetach(struct knote *kn);
>  int	filt_timermodify(struct kevent *kev, struct knote *kn);
> @@ -172,7 +173,7 @@ const struct filterops file_filtops = {
>  };
>
>  const struct filterops timer_filtops = {
> -	.f_flags	= 0,
> +	.f_flags	= FILTEROP_MPSAFE,
>  	.f_attach	= filt_timerattach,
>  	.f_detach	= filt_timerdetach,
>  	.f_event	= NULL,
> @@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
>  	.f_process	= filt_timerprocess,
>  };
>
> +/*
> + * Locking:
> + *	I	immutable after creation
> + *	a	atomic operations
> + *	t	ft_mtx
> + */
> +
>  struct pool knote_pool;
>  struct pool kqueue_pool;
>  struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
>  struct rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
> -int kq_ntimeouts = 0;
> -int kq_timeoutmax = (4 * 1024);
> +int kq_ntimeouts = 0;			/* [a] */
> +int kq_timeoutmax = (4 * 1024);		/* [I] */
>
>  #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
>
> @@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t
>  	return (0);
>  }
>
> +struct filt_timer {
> +	struct mutex	ft_mtx;
> +	struct timeout	ft_to;		/* [t] */
> +	int		ft_reschedule;	/* [t] */
> +};
> +
>  static void
>  filt_timeradd(struct knote *kn, struct timespec *ts)
>  {
>  	struct timespec expiry, now;
> -	struct timeout *to = kn->kn_hook;
> +	struct filt_timer *ft = kn->kn_hook;
>  	int tticks;
>
> +	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
> +
> +	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
> +	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
> +
>  	if (kn->kn_sfflags & NOTE_ABSTIME) {
>  		nanotime(&now);
>  		if (timespeccmp(ts, &now, >)) {
>  			timespecsub(ts, &now, &expiry);
>  			/* XXX timeout_abs_ts with CLOCK_REALTIME */
> -			timeout_add(to, tstohz(&expiry));
> +			timeout_add(&ft->ft_to, tstohz(&expiry));
>  		} else {
>  			/* Expire immediately. */
> -			filt_timerexpire(kn);
> +			filt_dotimerexpire(kn);
>  		}
>  		return;
>  	}
>
>  	tticks = tstohz(ts);
>  	/* Remove extra tick from tstohz() if timeout has fired before. */
> -	if (timeout_triggered(to))
> +	if (timeout_triggered(&ft->ft_to))
>  		tticks--;
> -	timeout_add(to, (tticks > 0) ? tticks : 1);
> +	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
>  }
>
>  void
> -filt_timerexpire(void *knx)
> +filt_dotimerexpire(struct knote *kn)
>  {
>  	struct timespec ts;
> -	struct knote *kn = knx;
> +	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
>  	struct kqueue *kq = kn->kn_kq;
>
> +	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
> +
>  	kn->kn_data++;
> +
>  	mtx_enter(&kq->kq_lock);
>  	knote_activate(kn);
>  	mtx_leave(&kq->kq_lock);
>
> -	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
> -	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
> +	if (ft->ft_reschedule) {
>  		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
>  		filt_timeradd(kn, &ts);
>  	}
>  }
>
> +void
> +filt_timerexpire(void *knx)
> +{
> +	struct knote *kn = knx;
> +	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
> +
> +	mtx_enter(&ft->ft_mtx);
> +	filt_dotimerexpire(kn);
> +	mtx_leave(&ft->ft_mtx);
> +}
> +
>  /*
>   * data contains amount of time to sleep
>   */
> @@ -651,23 +683,29 @@ int
>  filt_timerattach(struct knote *kn)
>  {
>  	struct timespec ts;
> -	struct timeout *to;
> +	struct filt_timer *ft;
>  	int error;
>
>  	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
>  	if (error != 0)
>  		return (error);
>
> -	if (kq_ntimeouts > kq_timeoutmax)
> +	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
> +		atomic_dec_int(&kq_ntimeouts);
>  		return (ENOMEM);
> -	kq_ntimeouts++;
> +	}
>
>  	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
>  		kn->kn_flags |= EV_CLEAR;	/* automatically set */
> -	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
> -	timeout_set(to, filt_timerexpire, kn);
> -	kn->kn_hook = to;
> +
> +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
> +	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);
> +	timeout_set(&ft->ft_to, filt_timerexpire, kn);
> +	kn->kn_hook = ft;
> +
> +	mtx_enter(&ft->ft_mtx);
>  	filt_timeradd(kn, &ts);
> +	mtx_leave(&ft->ft_mtx);
>
>  	return (0);
>  }
> @@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
>  void
>  filt_timerdetach(struct knote *kn)
>  {
> -	struct timeout *to;
> +	struct filt_timer *ft = kn->kn_hook;
>
> -	to = (struct timeout *)kn->kn_hook;
> -	timeout_del_barrier(to);
> -	free(to, M_KEVENT, sizeof(*to));
> -	kq_ntimeouts--;
> +	mtx_enter(&ft->ft_mtx);
> +	ft->ft_reschedule = 0;
> +	mtx_leave(&ft->ft_mtx);
> +
> +	timeout_del_barrier(&ft->ft_to);
> +	free(ft, M_KEVENT, sizeof(*ft));
> +	atomic_dec_int(&kq_ntimeouts);
>  }
>
>  int
> @@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
>  {
>  	struct timespec ts;
>  	struct kqueue *kq = kn->kn_kq;
> -	struct timeout *to = kn->kn_hook;
> +	struct filt_timer *ft = kn->kn_hook;
>  	int error;
>
>  	error = filt_timervalidate(kev->fflags, kev->data, &ts);
> @@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
>  	}
>
>  	/* Reset the timer. Any pending events are discarded. */
> +	mtx_enter(&ft->ft_mtx);
> +	ft->ft_reschedule = 0;
> +	mtx_leave(&ft->ft_mtx);
>
> -	timeout_del_barrier(to);
> +	timeout_del_barrier(&ft->ft_to);
>
> +	mtx_enter(&ft->ft_mtx);
>  	mtx_enter(&kq->kq_lock);
>  	if (kn->kn_status & KN_QUEUED)
>  		knote_dequeue(kn);
> @@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
>  	kn->kn_data = 0;
>  	knote_assign(kev, kn);
>  	/* Reinit timeout to invoke tick adjustment again. */
> -	timeout_set(to, filt_timerexpire, kn);
> +	timeout_set(&ft->ft_to, filt_timerexpire, kn);
>  	filt_timeradd(kn, &ts);
> +	mtx_leave(&ft->ft_mtx);
>
>  	return (0);
>  }
> @@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
>  int
>  filt_timerprocess(struct knote *kn, struct kevent *kev)
>  {
> -	int active, s;
> +	struct filt_timer *ft = kn->kn_hook;
> +	int active;
>
> -	s = splsoftclock();
> +	mtx_enter(&ft->ft_mtx);
>  	active = (kn->kn_data != 0);
>  	if (active)
>  		knote_submit(kn, kev);
> -	splx(s);
> +	mtx_leave(&ft->ft_mtx);
>
>  	return (active);
>  }
>
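As an aside, a minimal userland sketch for exercising the timer filter
(plain kevent(2) usage, nothing specific to this diff; the identifier
and the 100 ms period are arbitrary choices):

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>

	#include <err.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct kevent ev;
		int i, kq;

		if ((kq = kqueue()) == -1)
			err(1, "kqueue");

		/*
		 * Periodic timer, fires every 100 ms. EV_CLEAR is set
		 * automatically for relative timers.
		 */
		EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD, 0, 100, NULL);
		if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
			err(1, "kevent: register");

		for (i = 0; i < 5; i++) {
			if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
				err(1, "kevent: wait");
			/* data counts expirations since the last read. */
			printf("timer fired, data=%lld\n",
			    (long long)ev.data);
		}
		return (0);
	}

With the diff applied, each expiration runs filt_timerexpire() under
ft_mtx, and the f_process path takes the same mutex instead of
splsoftclock(), so reading the event is safe against a concurrently
firing timeout without the kernel lock.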