kevent(2): make EVFILT_TIMER mp-safe
On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> The last kernel-locked filter from kern/kern_event.c.
>
> Introduce the 'filt_timer' opaque structure to hold the timeout(9), the
> `ft_reschedule' flag and the protecting mutex(9). The timeout(9) handler
> could reschedule itself, so `ft_reschedule' prevents this while we are
> canceling the timeout. The `ft_mtx' mutex(9) also protects the
> re-initialization, because the timeout could be accessed concurrently
> from another filt_timermodify() thread.
>
> Note, this diff keeps the timeout handler under the kernel lock. We
> don't take the kernel lock while performing timeout_add(9) or
> timeout_del_barrier(9) on non-mp-safe timeouts, so kevent(2) will not
> get stuck on the kernel lock; meanwhile, the firing timeout will take
> the kernel lock in most cases even for mp-safe timeouts.
>
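Both cancellation paths below, filt_timerdetach() and filt_timermodify(),
follow the same two steps. Sketched outside the diff for clarity, with
`ft' standing for the knote's kn_hook:

	mtx_enter(&ft->ft_mtx);
	ft->ft_reschedule = 0;	/* a handler entering later sees 0 */
	mtx_leave(&ft->ft_mtx);

	/*
	 * A re-arm that already happened gets cancelled here, and
	 * timeout_del_barrier(9) also waits for a currently running
	 * handler to finish.
	 */
	timeout_del_barrier(&ft->ft_to);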
Oops, forgot to remove "kq_ntimeouts++;".
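With the kernel lock no longer serializing filt_timerattach(), the old
check followed by "kq_ntimeouts++;" would be a check-then-act race, so
the counter becomes an atomic reservation: bump first, back off on
overflow.

	/* Reserve a slot first; undo the increment if over the limit. */
	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
		atomic_dec_int(&kq_ntimeouts);
		return (ENOMEM);
	}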
Index: sys/kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.201
diff -u -p -r1.201 kern_event.c
--- sys/kern/kern_event.c 10 Feb 2025 16:45:46 -0000 1.201
+++ sys/kern/kern_event.c 8 May 2025 00:41:41 -0000
@@ -131,6 +131,7 @@ void filt_sigdetach(struct knote *kn);
int filt_signal(struct knote *kn, long hint);
int filt_fileattach(struct knote *kn);
void filt_timerexpire(void *knx);
+void filt_dotimerexpire(struct knote *kn);
int filt_timerattach(struct knote *kn);
void filt_timerdetach(struct knote *kn);
int filt_timermodify(struct kevent *kev, struct knote *kn);
@@ -172,7 +173,7 @@ const struct filterops file_filtops = {
};
const struct filterops timer_filtops = {
- .f_flags = 0,
+ .f_flags = FILTEROP_MPSAFE,
.f_attach = filt_timerattach,
.f_detach = filt_timerdetach,
.f_event = NULL,
@@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
.f_process = filt_timerprocess,
};
+/*
+ * Locking:
+ * I immutable after creation
+ * a atomic operations
+ * t ft_mtx
+ */
+
struct pool knote_pool;
struct pool kqueue_pool;
struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
struct rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
-int kq_ntimeouts = 0;
-int kq_timeoutmax = (4 * 1024);
+int kq_ntimeouts = 0; /* [a] */
+int kq_timeoutmax = (4 * 1024); /* [I] */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
@@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t
return (0);
}
+struct filt_timer {
+ struct mutex ft_mtx;
+ struct timeout ft_to; /* [t] */
+ int ft_reschedule; /* [t] */
+};
+
static void
filt_timeradd(struct knote *kn, struct timespec *ts)
{
struct timespec expiry, now;
- struct timeout *to = kn->kn_hook;
+ struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
int tticks;
+ MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
+ ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
+ (kn->kn_sfflags & NOTE_ABSTIME) == 0);
+
if (kn->kn_sfflags & NOTE_ABSTIME) {
nanotime(&now);
if (timespeccmp(ts, &now, >)) {
timespecsub(ts, &now, &expiry);
/* XXX timeout_abs_ts with CLOCK_REALTIME */
- timeout_add(to, tstohz(&expiry));
+ timeout_add(&ft->ft_to, tstohz(&expiry));
} else {
/* Expire immediately. */
- filt_timerexpire(kn);
+ filt_dotimerexpire(kn);
}
return;
}
tticks = tstohz(ts);
/* Remove extra tick from tstohz() if timeout has fired before. */
- if (timeout_triggered(to))
+ if (timeout_triggered(&ft->ft_to))
tticks--;
- timeout_add(to, (tticks > 0) ? tticks : 1);
+ timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
}
void
-filt_timerexpire(void *knx)
+filt_dotimerexpire(struct knote *kn)
{
struct timespec ts;
- struct knote *kn = knx;
+ struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
struct kqueue *kq = kn->kn_kq;
+ MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
kn->kn_data++;
+
mtx_enter(&kq->kq_lock);
knote_activate(kn);
mtx_leave(&kq->kq_lock);
- if ((kn->kn_flags & EV_ONESHOT) == 0 &&
- (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
+ if (ft->ft_reschedule) {
(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
filt_timeradd(kn, &ts);
}
}
+void
+filt_timerexpire(void *knx)
+{
+ struct knote *kn = knx;
+ struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+
+ mtx_enter(&ft->ft_mtx);
+ filt_dotimerexpire(kn);
+ mtx_leave(&ft->ft_mtx);
+}
+
/*
* data contains amount of time to sleep
*/
@@ -651,23 +683,29 @@ int
filt_timerattach(struct knote *kn)
{
struct timespec ts;
- struct timeout *to;
+ struct filt_timer *ft;
int error;
error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
if (error != 0)
return (error);
- if (kq_ntimeouts > kq_timeoutmax)
+ if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
+ atomic_dec_int(&kq_ntimeouts);
return (ENOMEM);
- kq_ntimeouts++;
+ }
if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
kn->kn_flags |= EV_CLEAR; /* automatically set */
- to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
- timeout_set(to, filt_timerexpire, kn);
- kn->kn_hook = to;
+
+ ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
+ mtx_init(&ft->ft_mtx, IPL_MPSAFE);
+ timeout_set(&ft->ft_to, filt_timerexpire, kn);
+ kn->kn_hook = ft;
+
+ mtx_enter(&ft->ft_mtx);
filt_timeradd(kn, &ts);
+ mtx_leave(&ft->ft_mtx);
return (0);
}
@@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
void
filt_timerdetach(struct knote *kn)
{
- struct timeout *to;
+ struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
- to = (struct timeout *)kn->kn_hook;
- timeout_del_barrier(to);
- free(to, M_KEVENT, sizeof(*to));
- kq_ntimeouts--;
+ mtx_enter(&ft->ft_mtx);
+ ft->ft_reschedule = 0;
+ mtx_leave(&ft->ft_mtx);
+
+ timeout_del_barrier(&ft->ft_to);
+ free(ft, M_KEVENT, sizeof(*ft));
+ atomic_dec_int(&kq_ntimeouts);
}
int
@@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
{
struct timespec ts;
struct kqueue *kq = kn->kn_kq;
- struct timeout *to = kn->kn_hook;
+ struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
int error;
error = filt_timervalidate(kev->fflags, kev->data, &ts);
@@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
}
/* Reset the timer. Any pending events are discarded. */
+ mtx_enter(&ft->ft_mtx);
+ ft->ft_reschedule = 0;
+ mtx_leave(&ft->ft_mtx);
- timeout_del_barrier(to);
+ timeout_del_barrier(&ft->ft_to);
+ mtx_enter(&ft->ft_mtx);
mtx_enter(&kq->kq_lock);
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
@@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
kn->kn_data = 0;
knote_assign(kev, kn);
/* Reinit timeout to invoke tick adjustment again. */
- timeout_set(to, filt_timerexpire, kn);
+ timeout_set(&ft->ft_to, filt_timerexpire, kn);
filt_timeradd(kn, &ts);
+ mtx_leave(&ft->ft_mtx);
return (0);
}
@@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
int
filt_timerprocess(struct knote *kn, struct kevent *kev)
{
- int active, s;
+ struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+ int active;
- s = splsoftclock();
+ mtx_enter(&ft->ft_mtx);
active = (kn->kn_data != 0);
if (active)
knote_submit(kn, kev);
- splx(s);
+ mtx_leave(&ft->ft_mtx);
return (active);
}
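
For reference, a minimal userland program along these lines can be used
to exercise the periodic path; the 100ms period and the five iterations
are arbitrary (EVFILT_TIMER interprets data as milliseconds by default):

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>

	#include <err.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct kevent ev;
		int i, kq;

		if ((kq = kqueue()) == -1)
			err(1, "kqueue");

		/* Register a periodic 100ms timer. */
		EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD, 0, 100, NULL);
		if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
			err(1, "kevent: EV_ADD");

		for (i = 0; i < 5; i++) {
			if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
				err(1, "kevent: wait");
			/* data counts expirations since the last read. */
			printf("fired, data=%lld\n", (long long)ev.data);
		}
		return (0);
	}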