Download raw body.
kevent(2): make EVFILT_TIMER mp-safe
On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote:
> On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> > The last one kernel locked from kern/kern_event.c.
> >
> > Introduce 'filt_timer' opaque structure to keep the timeout(9), the
> > `ft_reschedule' flag and protecting mutex(9). The timeout(9) handler
> > could reschedule itself, so `ft_reschedule' is used to prevent this
> > while we are canceling the timeout. The `ft_mtx' mutex(9) also protects the
> > re-initialization, because it could be concurrently accessed from
> > another filt_timermodify() thread.
Concurrent filt_timermodify() calls cannot happen, though, because
the kqueue subsystem serializes the same knote's f_attach, f_detach,
f_modify, and f_process calls automatically.
> >
> > Note, this diff keeps the timeout handler under kernel lock. We don't
> > touch kernel lock while we perform timeout_add(9) or
> > timeout_del_barrier(9) on non-mp-safe timeouts, so kevent(2) will not
> > get stuck in the kernel lock, meanwhile the timeout firing will take kernel lock
> > in most cases even for mp-safe timeouts.
> >
>
> Oops, forgot to remove "kq_ntimeouts++;".
>
> Index: sys/kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.201
> diff -u -p -r1.201 kern_event.c
> --- sys/kern/kern_event.c 10 Feb 2025 16:45:46 -0000 1.201
> +++ sys/kern/kern_event.c 8 May 2025 00:41:41 -0000
> @@ -131,6 +131,7 @@ void filt_sigdetach(struct knote *kn);
> int filt_signal(struct knote *kn, long hint);
> int filt_fileattach(struct knote *kn);
> void filt_timerexpire(void *knx);
> +void filt_dotimerexpire(struct knote *kn);
> int filt_timerattach(struct knote *kn);
> void filt_timerdetach(struct knote *kn);
> int filt_timermodify(struct kevent *kev, struct knote *kn);
> @@ -172,7 +173,7 @@ const struct filterops file_filtops = {
> };
>
> const struct filterops timer_filtops = {
> - .f_flags = 0,
> + .f_flags = FILTEROP_MPSAFE,
> .f_attach = filt_timerattach,
> .f_detach = filt_timerdetach,
> .f_event = NULL,
> @@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
> .f_process = filt_timerprocess,
> };
>
> +/*
> + * Locking:
> + * I immutable after creation
> + * a atomic operations
> + * t ft_mtx
> + */
> +
> struct pool knote_pool;
> struct pool kqueue_pool;
> struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
> struct rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
> -int kq_ntimeouts = 0;
> -int kq_timeoutmax = (4 * 1024);
> +int kq_ntimeouts = 0; /* [a] */
> +int kq_timeoutmax = (4 * 1024); /* [I] */
>
> #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
>
> @@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t
> return (0);
> }
>
> +struct filt_timer {
> + struct mutex ft_mtx;
> + struct timeout ft_to; /* [t] */
> + int ft_reschedule; /* [t] */
> +};
> +
> static void
> filt_timeradd(struct knote *kn, struct timespec *ts)
> {
> struct timespec expiry, now;
> - struct timeout *to = kn->kn_hook;
> + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
Please leave out these explicit type casts when getting ft from kn_hook.
The casts are not required by the C language, and hence mainly add
visual clutter (and make the already weak type checking even weaker by
allowing more types as the source, should there be a mishap with the
choice of the right-hand side variable).
> int tticks;
>
> + MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
> +
> + ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
> + (kn->kn_sfflags & NOTE_ABSTIME) == 0);
> +
> if (kn->kn_sfflags & NOTE_ABSTIME) {
> nanotime(&now);
> if (timespeccmp(ts, &now, >)) {
> timespecsub(ts, &now, &expiry);
> /* XXX timeout_abs_ts with CLOCK_REALTIME */
> - timeout_add(to, tstohz(&expiry));
> + timeout_add(&ft->ft_to, tstohz(&expiry));
> } else {
> /* Expire immediately. */
> - filt_timerexpire(kn);
> + filt_dotimerexpire(kn);
> }
> return;
> }
>
> tticks = tstohz(ts);
> /* Remove extra tick from tstohz() if timeout has fired before. */
> - if (timeout_triggered(to))
> + if (timeout_triggered(&ft->ft_to))
> tticks--;
> - timeout_add(to, (tticks > 0) ? tticks : 1);
> + timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
> }
>
> void
> -filt_timerexpire(void *knx)
> +filt_dotimerexpire(struct knote *kn)
> {
> struct timespec ts;
> - struct knote *kn = knx;
> + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
> struct kqueue *kq = kn->kn_kq;
>
> + MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
> +
> kn->kn_data++;
> +
> mtx_enter(&kq->kq_lock);
> knote_activate(kn);
> mtx_leave(&kq->kq_lock);
>
> - if ((kn->kn_flags & EV_ONESHOT) == 0 &&
> - (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
> + if (ft->ft_reschedule) {
> (void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
> filt_timeradd(kn, &ts);
> }
> }
>
> +void
> +filt_timerexpire(void *knx)
> +{
> + struct knote *kn = knx;
> + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
> +
> + mtx_enter(&ft->ft_mtx);
> + filt_dotimerexpire(kn);
> + mtx_leave(&ft->ft_mtx);
> +}
> +
> /*
> * data contains amount of time to sleep
> */
> @@ -651,23 +683,29 @@ int
> filt_timerattach(struct knote *kn)
> {
> struct timespec ts;
> - struct timeout *to;
> + struct filt_timer *ft;
> int error;
>
> error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
> if (error != 0)
> return (error);
>
> - if (kq_ntimeouts > kq_timeoutmax)
> + if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
> + atomic_dec_int(&kq_ntimeouts);
> return (ENOMEM);
> - kq_ntimeouts++;
> + }
>
> if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
> kn->kn_flags |= EV_CLEAR; /* automatically set */
> - to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
> - timeout_set(to, filt_timerexpire, kn);
> - kn->kn_hook = to;
> +
> + ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
> + mtx_init(&ft->ft_mtx, IPL_MPSAFE);
Shouldn't the level be IPL_SOFTCLOCK?
> + timeout_set(&ft->ft_to, filt_timerexpire, kn);
> + kn->kn_hook = ft;
> +
> + mtx_enter(&ft->ft_mtx);
> filt_timeradd(kn, &ts);
> + mtx_leave(&ft->ft_mtx);
>
> return (0);
> }
> @@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
> void
> filt_timerdetach(struct knote *kn)
> {
> - struct timeout *to;
> + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
>
> - to = (struct timeout *)kn->kn_hook;
> - timeout_del_barrier(to);
> - free(to, M_KEVENT, sizeof(*to));
> - kq_ntimeouts--;
> + mtx_enter(&ft->ft_mtx);
> + ft->ft_reschedule = 0;
> + mtx_leave(&ft->ft_mtx);
> +
> + timeout_del_barrier(&ft->ft_to);
> + free(ft, M_KEVENT, sizeof(*ft));
> + atomic_dec_int(&kq_ntimeouts);
> }
>
> int
> @@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
> {
> struct timespec ts;
> struct kqueue *kq = kn->kn_kq;
> - struct timeout *to = kn->kn_hook;
> + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
> int error;
>
> error = filt_timervalidate(kev->fflags, kev->data, &ts);
> @@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
> }
>
> /* Reset the timer. Any pending events are discarded. */
> + mtx_enter(&ft->ft_mtx);
> + ft->ft_reschedule = 0;
> + mtx_leave(&ft->ft_mtx);
>
> - timeout_del_barrier(to);
> + timeout_del_barrier(&ft->ft_to);
>
> + mtx_enter(&ft->ft_mtx);
> mtx_enter(&kq->kq_lock);
> if (kn->kn_status & KN_QUEUED)
> knote_dequeue(kn);
> @@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
> kn->kn_data = 0;
> knote_assign(kev, kn);
> /* Reinit timeout to invoke tick adjustment again. */
> - timeout_set(to, filt_timerexpire, kn);
> + timeout_set(&ft->ft_to, filt_timerexpire, kn);
> filt_timeradd(kn, &ts);
> + mtx_leave(&ft->ft_mtx);
>
> return (0);
> }
> @@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
> int
> filt_timerprocess(struct knote *kn, struct kevent *kev)
> {
> - int active, s;
> + struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
> + int active;
>
> - s = splsoftclock();
> + mtx_enter(&ft->ft_mtx);
> active = (kn->kn_data != 0);
> if (active)
> knote_submit(kn, kev);
> - splx(s);
> + mtx_leave(&ft->ft_mtx);
>
> return (active);
> }
>
kevent(2): make EVFILT_TIMER mp-safe