From: Vitaliy Makkoveev
Subject: kevent(2): make EVFILT_TIMER mp-safe
To: Visa Hankala, tech@openbsd.org
Date: Thu, 8 May 2025 03:32:22 +0300

This is the last kernel-locked filter in kern/kern_event.c.

Introduce an opaque 'filt_timer' structure to hold the timeout(9), the
`ft_reschedule' flag and the protecting mutex(9). The timeout(9) handler
can reschedule itself, so `ft_reschedule' is used to prevent that while
we are cancelling the timeout. The `ft_mtx' mutex(9) also protects the
re-initialization of the timeout, since it could be accessed concurrently
from another filt_timermodify() thread.

Note that this diff keeps the timeout handler under the kernel lock. We
don't touch the kernel lock while performing timeout_add(9) or
timeout_del_barrier(9) on non mp-safe timeouts, so kevent(2) will not get
stuck on the kernel lock; meanwhile, the firing of the timeout will take
the kernel lock in most cases even for mp-safe timeouts.

Index: sys/kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.201
diff -u -p -r1.201 kern_event.c
--- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
+++ sys/kern/kern_event.c	8 May 2025 00:06:42 -0000
@@ -131,6 +131,7 @@ void	filt_sigdetach(struct knote *kn);
 int	filt_signal(struct knote *kn, long hint);
 int	filt_fileattach(struct knote *kn);
 void	filt_timerexpire(void *knx);
+void	filt_dotimerexpire(struct knote *kn);
 int	filt_timerattach(struct knote *kn);
 void	filt_timerdetach(struct knote *kn);
 int	filt_timermodify(struct kevent *kev, struct knote *kn);
@@ -172,7 +173,7 @@ const struct filterops file_filtops = {
 };
 
 const struct filterops timer_filtops = {
-	.f_flags	= 0,
+	.f_flags	= FILTEROP_MPSAFE,
 	.f_attach	= filt_timerattach,
 	.f_detach	= filt_timerdetach,
 	.f_event	= NULL,
@@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
 	.f_process	= filt_timerprocess,
 };
 
+/*
+ * Locking:
+ *	I	immutable after creation
+ *	a	atomic operations
+ *	t	ft_mtx
+ */
+
 struct	pool knote_pool;
 struct	pool kqueue_pool;
 struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct	rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
-int kq_ntimeouts = 0;
-int kq_timeoutmax = (4 * 1024);
+int kq_ntimeouts = 0;			/* [a] */
+int kq_timeoutmax = (4 * 1024);		/* [I] */
 
 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
@@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t
 	return (0);
 }
 
+struct filt_timer {
+	struct mutex ft_mtx;
+	struct timeout ft_to;		/* [t] */
+	int ft_reschedule;		/* [t] */
+};
+
 static void
 filt_timeradd(struct knote *kn, struct timespec *ts)
 {
 	struct timespec expiry, now;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	int tticks;
 
+	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
+	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
+	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
+
 	if (kn->kn_sfflags & NOTE_ABSTIME) {
 		nanotime(&now);
 		if (timespeccmp(ts, &now, >)) {
 			timespecsub(ts, &now, &expiry);
 			/* XXX timeout_abs_ts with CLOCK_REALTIME */
-			timeout_add(to, tstohz(&expiry));
+			timeout_add(&ft->ft_to, tstohz(&expiry));
 		} else {
 			/* Expire immediately. */
-			filt_timerexpire(kn);
+			filt_dotimerexpire(kn);
 		}
 		return;
 	}
 
 	tticks = tstohz(ts);
 	/* Remove extra tick from tstohz() if timeout has fired before. */
-	if (timeout_triggered(to))
+	if (timeout_triggered(&ft->ft_to))
 		tticks--;
-	timeout_add(to, (tticks > 0) ? tticks : 1);
+	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
 }
 
 void
-filt_timerexpire(void *knx)
+filt_dotimerexpire(struct knote *kn)
 {
 	struct timespec ts;
-	struct knote *kn = knx;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	struct kqueue *kq = kn->kn_kq;
 
+	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
 	kn->kn_data++;
+
 	mtx_enter(&kq->kq_lock);
 	knote_activate(kn);
 	mtx_leave(&kq->kq_lock);
 
-	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
+	if (ft->ft_reschedule) {
 		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 		filt_timeradd(kn, &ts);
 	}
 }
 
+void
+filt_timerexpire(void *knx)
+{
+	struct knote *kn = knx;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+
+	mtx_enter(&ft->ft_mtx);
+	filt_dotimerexpire(kn);
+	mtx_leave(&ft->ft_mtx);
+}
+
 /*
  * data contains amount of time to sleep
  */
@@ -651,23 +683,30 @@ int
 filt_timerattach(struct knote *kn)
 {
 	struct timespec ts;
-	struct timeout *to;
+	struct filt_timer *ft;
 	int error;
 
 	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 	if (error != 0)
 		return (error);
 
-	if (kq_ntimeouts > kq_timeoutmax)
+	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
+		atomic_dec_int(&kq_ntimeouts);
 		return (ENOMEM);
-	kq_ntimeouts++;
+	}
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
-	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
-	timeout_set(to, filt_timerexpire, kn);
-	kn->kn_hook = to;
+
+	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
+	mtx_init(&ft->ft_mtx, IPL_SOFTCLOCK);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
+	kn->kn_hook = ft;
+
+	mtx_enter(&ft->ft_mtx);
 	filt_timeradd(kn, &ts);
+	mtx_leave(&ft->ft_mtx);
 
 	return (0);
 }
@@ -675,12 +714,15 @@ filt_timerattach(struct knote *kn)
 void
 filt_timerdetach(struct knote *kn)
 {
-	struct timeout *to;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 
-	to = (struct timeout *)kn->kn_hook;
-	timeout_del_barrier(to);
-	free(to, M_KEVENT, sizeof(*to));
-	kq_ntimeouts--;
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
+
+	timeout_del_barrier(&ft->ft_to);
+	free(ft, M_KEVENT, sizeof(*ft));
+	atomic_dec_int(&kq_ntimeouts);
 }
 
 int
@@ -688,7 +730,7 @@ filt_timermodify(struct kevent *kev, str
 {
 	struct timespec ts;
 	struct kqueue *kq = kn->kn_kq;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	int error;
 
 	error = filt_timervalidate(kev->fflags, kev->data, &ts);
@@ -699,9 +741,13 @@ filt_timermodify(struct kevent *kev, str
 	}
 
 	/* Reset the timer. Any pending events are discarded. */
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
 
-	timeout_del_barrier(to);
+	timeout_del_barrier(&ft->ft_to);
 
+	mtx_enter(&ft->ft_mtx);
 	mtx_enter(&kq->kq_lock);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
@@ -711,8 +757,9 @@ filt_timermodify(struct kevent *kev, str
 	kn->kn_data = 0;
 	knote_assign(kev, kn);
 	/* Reinit timeout to invoke tick adjustment again. */
-	timeout_set(to, filt_timerexpire, kn);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
 	filt_timeradd(kn, &ts);
+	mtx_leave(&ft->ft_mtx);
 
 	return (0);
 }
@@ -720,13 +767,14 @@ filt_timermodify(struct kevent *kev, str
 int
 filt_timerprocess(struct knote *kn, struct kevent *kev)
 {
-	int active, s;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+	int active;
 
-	s = splsoftclock();
+	mtx_enter(&ft->ft_mtx);
 	active = (kn->kn_data != 0);
 	if (active)
 		knote_submit(kn, kev);
-	splx(s);
+	mtx_leave(&ft->ft_mtx);
 
 	return (active);
 }
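
P.S. Since the safety of the teardown is the subtle part here, a minimal
standalone sketch of the pattern the diff relies on may help review. The
`example_*' names and the one-second period are invented for illustration;
the timeout(9) and mutex(9) calls are the real interfaces.

/*
 * Sketch: safely cancelling a self-rescheduling timeout(9).
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/timeout.h>

struct example_timer {
	struct mutex et_mtx;
	struct timeout et_to;		/* protected by et_mtx */
	int et_reschedule;		/* protected by et_mtx */
};

/* Timeout handler: delivers an event, then re-arms unless cancelled. */
void
example_expire(void *arg)
{
	struct example_timer *et = arg;

	mtx_enter(&et->et_mtx);
	/* ... deliver the event here ... */
	if (et->et_reschedule)
		timeout_add(&et->et_to, hz);	/* re-arm */
	mtx_leave(&et->et_mtx);
}

void
example_start(struct example_timer *et)
{
	mtx_init(&et->et_mtx, IPL_SOFTCLOCK);
	timeout_set(&et->et_to, example_expire, et);

	mtx_enter(&et->et_mtx);
	et->et_reschedule = 1;
	timeout_add(&et->et_to, hz);
	mtx_leave(&et->et_mtx);
}

void
example_cancel(struct example_timer *et)
{
	/* Step 1: forbid rescheduling, serialized with the handler. */
	mtx_enter(&et->et_mtx);
	et->et_reschedule = 0;
	mtx_leave(&et->et_mtx);

	/*
	 * Step 2: delete the timeout and wait out a handler that may
	 * already be running.  Such a handler either re-armed before
	 * step 1, in which case the barrier deletes the fresh timeout,
	 * or it sees et_reschedule == 0 and does not re-arm.  Either
	 * way the timeout is idle when we return, so it may be freed.
	 */
	timeout_del_barrier(&et->et_to);
}

filt_timerdetach() and filt_timermodify() above perform exactly these two
steps before freeing or re-initializing the timeout; timeout_del_barrier(9)
alone would not be enough, because a concurrently running handler could
re-arm the timeout after the barrier had removed the pending one.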