From: Vitaliy Makkoveev
Subject: Re: kevent(2): make EVFILT_TIMER mp-safe
To: Visa Hankala
Cc: tech@openbsd.org
Date: Fri, 9 May 2025 11:06:38 +0300

On Fri, May 09, 2025 at 03:34:01PM +0000, Visa Hankala wrote:
> On Thu, May 08, 2025 at 07:39:12PM +0300, Vitaliy Makkoveev wrote:
> > On Thu, May 08, 2025 at 02:25:04PM +0000, Visa Hankala wrote:
> > > On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote:
> > > > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> > > > > This is the last kernel-locked filter in kern/kern_event.c.
> > > > >
> > > > > Introduce an opaque 'filt_timer' structure to hold the timeout(9),
> > > > > the `ft_reschedule' flag and the protecting mutex(9). The
> > > > > timeout(9) handler could reschedule itself, so `ft_reschedule' is
> > > > > used to prevent this while we are canceling the timeout. The
> > > > > `ft_mtx' mutex(9) also protects the re-initialization, because it
> > > > > could be accessed concurrently from another filt_timermodify()
> > > > > thread.
> > >
> > > Concurrent filt_timermodify() calls cannot happen, though, because
> > > the kqueue subsystem serializes the same knote's f_attach, f_detach,
> > > f_modify, and f_process calls automatically.
> >
> > Nice to hear. So I only need serialization with the running timeout
> > handler.
>
> There can be concurrency between the timeout and {f_modify,f_process}.
> For example, the timeout can hit while the user process is reading
> the event (f_process call). This is why you should not take shortcuts
> with the locking. Your previous diff looked better in this respect.
>
> > I removed recursion from filt_timeradd()/filt_timerexpire() to reduce
> > `ft_mtx' usage.
>
> If you want to refactor the code, please do so in separate commits
> so that it is easier to see that the logic remains correct. The gist
> and the trickiest part of EVFILT_TIMER is the timeout and knote
> handling, and this code should not have unnecessary duplication.
>
> > +	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
> > +	mtx_init(&ft->ft_mtx, IPL_MPSAFE);
>
> Please change this ipl argument to IPL_SOFTCLOCK because the mutex
> is used in process context and timeout soft interrupt context.
> IPL_MPSAFE is incorrect (it is not a proper IPL value but a hint to
> interrupt handlers that the kernel lock is not needed).

Done.
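For readers skimming the thread, the core of the change is the locking
pattern below, shown as a minimal standalone sketch with hypothetical
names (example_timer, example_expire and friends are illustrations, not
part of the diff). Shared state between process context and a timeout(9)
handler sits behind a mutex(9) initialized at IPL_SOFTCLOCK, and teardown
disarms self-rescheduling under the mutex before calling
timeout_del_barrier(9) with the mutex released, since the barrier may
sleep waiting for a handler that takes the same mutex:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/timeout.h>

struct example_timer {
	struct mutex	et_mtx;		/* taken in both contexts */
	struct timeout	et_to;
	int		et_armed;	/* protected by et_mtx */
};

/* Runs in softclock soft interrupt context. */
void
example_expire(void *arg)
{
	struct example_timer *et = arg;

	mtx_enter(&et->et_mtx);
	if (et->et_armed)
		timeout_add(&et->et_to, 1);	/* reschedule while armed */
	mtx_leave(&et->et_mtx);
}

/* Runs in process context. */
void
example_start(struct example_timer *et)
{
	mtx_init(&et->et_mtx, IPL_SOFTCLOCK);	/* not IPL_MPSAFE */
	timeout_set(&et->et_to, example_expire, et);

	mtx_enter(&et->et_mtx);
	et->et_armed = 1;
	timeout_add(&et->et_to, 1);
	mtx_leave(&et->et_mtx);
}

/* Runs in process context. */
void
example_stop(struct example_timer *et)
{
	/* Stop self-rescheduling first... */
	mtx_enter(&et->et_mtx);
	et->et_armed = 0;
	mtx_leave(&et->et_mtx);
	/* ...then wait out a running handler, without holding et_mtx. */
	timeout_del_barrier(&et->et_to);
}

The diff below applies the same disarm-then-barrier ordering in
filt_timerdetach() and filt_timermodify().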
Index: sys/kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.201
diff -u -p -r1.201 kern_event.c
--- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
+++ sys/kern/kern_event.c	9 May 2025 16:06:18 -0000
@@ -131,6 +131,7 @@ void	filt_sigdetach(struct knote *kn);
 int	filt_signal(struct knote *kn, long hint);
 int	filt_fileattach(struct knote *kn);
 void	filt_timerexpire(void *knx);
+void	filt_dotimerexpire(struct knote *kn);
 int	filt_timerattach(struct knote *kn);
 void	filt_timerdetach(struct knote *kn);
 int	filt_timermodify(struct kevent *kev, struct knote *kn);
@@ -172,7 +173,7 @@ const struct filterops file_filtops = {
 };
 
 const struct filterops timer_filtops = {
-	.f_flags	= 0,
+	.f_flags	= FILTEROP_MPSAFE,
 	.f_attach	= filt_timerattach,
 	.f_detach	= filt_timerdetach,
 	.f_event	= NULL,
@@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
 	.f_process	= filt_timerprocess,
 };
 
+/*
+ * Locking:
+ *	I	immutable after creation
+ *	a	atomic operations
+ *	t	ft_mtx
+ */
+
 struct	pool knote_pool;
 struct	pool kqueue_pool;
 struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct	rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
-int kq_ntimeouts = 0;
-int kq_timeoutmax = (4 * 1024);
+int kq_ntimeouts = 0;			/* [a] */
+int kq_timeoutmax = (4 * 1024);		/* [I] */
 
 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
 
@@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t
 	return (0);
 }
 
+struct filt_timer {
+	struct mutex	ft_mtx;
+	struct timeout	ft_to;		/* [t] */
+	int		ft_reschedule;	/* [t] */
+};
+
 static void
 filt_timeradd(struct knote *kn, struct timespec *ts)
 {
 	struct timespec expiry, now;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = kn->kn_hook;
 	int tticks;
 
+	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
+	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
+	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
+
 	if (kn->kn_sfflags & NOTE_ABSTIME) {
 		nanotime(&now);
 		if (timespeccmp(ts, &now, >)) {
 			timespecsub(ts, &now, &expiry);
 			/* XXX timeout_abs_ts with CLOCK_REALTIME */
-			timeout_add(to, tstohz(&expiry));
+			timeout_add(&ft->ft_to, tstohz(&expiry));
 		} else {
 			/* Expire immediately. */
-			filt_timerexpire(kn);
+			filt_dotimerexpire(kn);
 		}
 		return;
 	}
 
 	tticks = tstohz(ts);
 	/* Remove extra tick from tstohz() if timeout has fired before. */
-	if (timeout_triggered(to))
+	if (timeout_triggered(&ft->ft_to))
 		tticks--;
-	timeout_add(to, (tticks > 0) ? tticks : 1);
+	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
 }
 
 void
-filt_timerexpire(void *knx)
+filt_dotimerexpire(struct knote *kn)
 {
 	struct timespec ts;
-	struct knote *kn = knx;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	struct kqueue *kq = kn->kn_kq;
 
+	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
 	kn->kn_data++;
+
 	mtx_enter(&kq->kq_lock);
 	knote_activate(kn);
 	mtx_leave(&kq->kq_lock);
 
-	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
+	if (ft->ft_reschedule) {
 		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 		filt_timeradd(kn, &ts);
 	}
 }
 
+void
+filt_timerexpire(void *knx)
+{
+	struct knote *kn = knx;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+
+	mtx_enter(&ft->ft_mtx);
+	filt_dotimerexpire(kn);
+	mtx_leave(&ft->ft_mtx);
+}
+
 /*
  * data contains amount of time to sleep
  */
@@ -651,23 +683,29 @@ int
 filt_timerattach(struct knote *kn)
 {
 	struct timespec ts;
-	struct timeout *to;
+	struct filt_timer *ft;
 	int error;
 
 	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 	if (error != 0)
 		return (error);
 
-	if (kq_ntimeouts > kq_timeoutmax)
+	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
+		atomic_dec_int(&kq_ntimeouts);
 		return (ENOMEM);
-	kq_ntimeouts++;
+	}
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
-	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
-	timeout_set(to, filt_timerexpire, kn);
-	kn->kn_hook = to;
+
+	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
+	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
+	kn->kn_hook = ft;
+
+	mtx_enter(&ft->ft_mtx);
 	filt_timeradd(kn, &ts);
+	mtx_leave(&ft->ft_mtx);
 
 	return (0);
 }
@@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
 void
 filt_timerdetach(struct knote *kn)
 {
-	struct timeout *to;
+	struct filt_timer *ft = kn->kn_hook;
 
-	to = (struct timeout *)kn->kn_hook;
-	timeout_del_barrier(to);
-	free(to, M_KEVENT, sizeof(*to));
-	kq_ntimeouts--;
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
+
+	timeout_del_barrier(&ft->ft_to);
+	free(ft, M_KEVENT, sizeof(*ft));
+	atomic_dec_int(&kq_ntimeouts);
 }
 
 int
@@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
 {
 	struct timespec ts;
 	struct kqueue *kq = kn->kn_kq;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = kn->kn_hook;
 	int error;
 
 	error = filt_timervalidate(kev->fflags, kev->data, &ts);
@@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
 	}
 
 	/* Reset the timer. Any pending events are discarded. */
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
 
-	timeout_del_barrier(to);
+	timeout_del_barrier(&ft->ft_to);
 
+	mtx_enter(&ft->ft_mtx);
 	mtx_enter(&kq->kq_lock);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
@@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
 	kn->kn_data = 0;
 	knote_assign(kev, kn);
 	/* Reinit timeout to invoke tick adjustment again. */
-	timeout_set(to, filt_timerexpire, kn);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
 	filt_timeradd(kn, &ts);
+	mtx_leave(&ft->ft_mtx);
 
 	return (0);
 }
@@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
 int
 filt_timerprocess(struct knote *kn, struct kevent *kev)
 {
-	int active, s;
+	struct filt_timer *ft = kn->kn_hook;
+	int active;
 
-	s = splsoftclock();
+	mtx_enter(&ft->ft_mtx);
 	active = (kn->kn_data != 0);
 	if (active)
 		knote_submit(kn, kev);
-	splx(s);
+	mtx_leave(&ft->ft_mtx);
 
 	return (active);
 }
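For context on what this filter serves, a minimal userland consumer of
EVFILT_TIMER, sketched from the documented kevent(2) interface (the
identifier 1 and the 500 ms interval are arbitrary):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Arm a periodic timer; data is in milliseconds by default. */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: add");

	for (;;) {
		n = kevent(kq, NULL, 0, &kev, 1, NULL);
		if (n == -1)
			err(1, "kevent: wait");
		/* kev.data counts expirations since the last read. */
		printf("timer fired, %lld expiration(s)\n",
		    (long long)kev.data);
	}
}

With NOTE_ABSTIME in fflags, data is an absolute expiry time instead of
a period; that is the case the diff expires immediately via
filt_dotimerexpire() when the deadline has already passed.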