From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: Re: kevent(2): make EVFILT_TIMER mp-safe
To: Visa Hankala <visa@openbsd.org>, tech@openbsd.org
Date: Thu, 8 May 2025 03:43:13 +0300

On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> This is the last kernel-locked filter in kern/kern_event.c.
> 
> Introduce an opaque 'filt_timer' structure to hold the timeout(9), the
> `ft_reschedule' flag and the protecting mutex(9). The timeout(9)
> handler can reschedule itself, so `ft_reschedule' is cleared to
> prevent this while we are canceling the timeout. The `ft_mtx' mutex(9)
> also protects the re-initialization, because the timeout could be
> accessed concurrently by another filt_timermodify() thread.
> 
> Note that this diff keeps the timeout handler under the kernel lock.
> We don't touch the kernel lock while performing timeout_add(9) or
> timeout_del_barrier(9) on non-mp-safe timeouts, so kevent(2) will not
> get stuck on the kernel lock; meanwhile, the firing timeout will take
> the kernel lock in most cases even for mp-safe timeouts.
> 

Oops, forgot to remove "kq_ntimeouts++;".
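
Since the interesting part is the ordering, here is the cancellation
pattern from filt_timerdetach()/filt_timermodify() condensed into a
hypothetical helper (filt_timer_cancel() is not in the diff; struct
filt_timer is the one introduced below). Clearing `ft_reschedule' under
`ft_mtx' first means a concurrently firing filt_timerexpire() may still
activate the knote once, but can no longer re-arm the timeout, so the
following timeout_del_barrier(9) is guaranteed to terminate:

    #include <sys/mutex.h>
    #include <sys/timeout.h>

    struct filt_timer {
            struct mutex    ft_mtx;
            struct timeout  ft_to;          /* [t] */
            int             ft_reschedule;  /* [t] */
    };

    /*
     * Sketch only, not part of the diff: stop the timer so that it is
     * neither pending nor running on return.
     */
    static void
    filt_timer_cancel(struct filt_timer *ft)
    {
            /* Forbid re-arming; the handler checks this under ft_mtx. */
            mtx_enter(&ft->ft_mtx);
            ft->ft_reschedule = 0;
            mtx_leave(&ft->ft_mtx);

            /*
             * ft_mtx must not be held across the barrier:
             * timeout_del_barrier(9) may sleep, and a running
             * filt_timerexpire() needs ft_mtx to finish. This is why
             * filt_timermodify() below releases ft_mtx before the
             * barrier and re-enters it only for the re-initialization.
             */
            timeout_del_barrier(&ft->ft_to);
    }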

Index: sys/kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.201
diff -u -p -r1.201 kern_event.c
--- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
+++ sys/kern/kern_event.c	8 May 2025 00:41:41 -0000
@@ -131,6 +131,7 @@ void	filt_sigdetach(struct knote *kn);
 int	filt_signal(struct knote *kn, long hint);
 int	filt_fileattach(struct knote *kn);
 void	filt_timerexpire(void *knx);
+void	filt_dotimerexpire(struct knote *kn);
 int	filt_timerattach(struct knote *kn);
 void	filt_timerdetach(struct knote *kn);
 int	filt_timermodify(struct kevent *kev, struct knote *kn);
@@ -172,7 +173,7 @@ const struct filterops file_filtops = {
 };
 
 const struct filterops timer_filtops = {
-	.f_flags	= 0,
+	.f_flags	= FILTEROP_MPSAFE,
 	.f_attach	= filt_timerattach,
 	.f_detach	= filt_timerdetach,
 	.f_event	= NULL,
@@ -180,12 +181,19 @@ const struct filterops timer_filtops = {
 	.f_process	= filt_timerprocess,
 };
 
+/*
+ * Locking:
+ *	I	immutable after creation
+ *	a	atomic operations
+ *	t	ft_mtx
+ */
+
 struct	pool knote_pool;
 struct	pool kqueue_pool;
 struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct	rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
-int kq_ntimeouts = 0;
-int kq_timeoutmax = (4 * 1024);
+int kq_ntimeouts = 0;		/* [a] */
+int kq_timeoutmax = (4 * 1024);	/* [I] */
 
 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
 
@@ -598,52 +606,76 @@ filt_timervalidate(int sfflags, int64_t 
 	return (0);
 }
 
+struct filt_timer {
+	struct mutex	ft_mtx;
+	struct timeout	ft_to;		/* [t] */
+	int		ft_reschedule;	/* [t] */
+};
+
 static void
 filt_timeradd(struct knote *kn, struct timespec *ts)
 {
 	struct timespec expiry, now;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	int tticks;
 
+	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
+	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
+	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
+
 	if (kn->kn_sfflags & NOTE_ABSTIME) {
 		nanotime(&now);
 		if (timespeccmp(ts, &now, >)) {
 			timespecsub(ts, &now, &expiry);
 			/* XXX timeout_abs_ts with CLOCK_REALTIME */
-			timeout_add(to, tstohz(&expiry));
+			timeout_add(&ft->ft_to, tstohz(&expiry));
 		} else {
 			/* Expire immediately. */
-			filt_timerexpire(kn);
+			filt_dotimerexpire(kn);
 		}
 		return;
 	}
 
 	tticks = tstohz(ts);
 	/* Remove extra tick from tstohz() if timeout has fired before. */
-	if (timeout_triggered(to))
+	if (timeout_triggered(&ft->ft_to))
 		tticks--;
-	timeout_add(to, (tticks > 0) ? tticks : 1);
+	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
 }
 
 void
-filt_timerexpire(void *knx)
+filt_dotimerexpire(struct knote *kn)
 {
 	struct timespec ts;
-	struct knote *kn = knx;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	struct kqueue *kq = kn->kn_kq;
 
+	MUTEX_ASSERT_LOCKED(&ft->ft_mtx);
+
 	kn->kn_data++;
+
 	mtx_enter(&kq->kq_lock);
 	knote_activate(kn);
 	mtx_leave(&kq->kq_lock);
 
-	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
+	if (ft->ft_reschedule) {
 		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 		filt_timeradd(kn, &ts);
 	}
 }
 
+void
+filt_timerexpire(void *knx)
+{
+	struct knote *kn = knx;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+
+	mtx_enter(&ft->ft_mtx);
+	filt_dotimerexpire(kn);
+	mtx_leave(&ft->ft_mtx);
+}
+
 /*
  * data contains amount of time to sleep
  */
@@ -651,23 +683,29 @@ int
 filt_timerattach(struct knote *kn)
 {
 	struct timespec ts;
-	struct timeout *to;
+	struct filt_timer *ft;
 	int error;
 
 	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 	if (error != 0)
 		return (error);
 
-	if (kq_ntimeouts > kq_timeoutmax)
+	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
+		atomic_dec_int(&kq_ntimeouts);
 		return (ENOMEM);
-	kq_ntimeouts++;
+	}
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
-	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
-	timeout_set(to, filt_timerexpire, kn);
-	kn->kn_hook = to;
+
+	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
+	mtx_init(&ft->ft_mtx, IPL_MPSAFE);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
+	kn->kn_hook = ft;
+
+	mtx_enter(&ft->ft_mtx);
 	filt_timeradd(kn, &ts);
+	mtx_leave(&ft->ft_mtx);
 
 	return (0);
 }
@@ -675,12 +713,15 @@ filt_timerattach(struct knote *kn)
 void
 filt_timerdetach(struct knote *kn)
 {
-	struct timeout *to;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 
-	to = (struct timeout *)kn->kn_hook;
-	timeout_del_barrier(to);
-	free(to, M_KEVENT, sizeof(*to));
-	kq_ntimeouts--;
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
+
+	timeout_del_barrier(&ft->ft_to);
+	free(ft, M_KEVENT, sizeof(*ft));
+	atomic_dec_int(&kq_ntimeouts);
 }
 
 int
@@ -688,7 +729,7 @@ filt_timermodify(struct kevent *kev, str
 {
 	struct timespec ts;
 	struct kqueue *kq = kn->kn_kq;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
 	int error;
 
 	error = filt_timervalidate(kev->fflags, kev->data, &ts);
@@ -699,9 +740,13 @@ filt_timermodify(struct kevent *kev, str
 	}
 
 	/* Reset the timer. Any pending events are discarded. */
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
 
-	timeout_del_barrier(to);
+	timeout_del_barrier(&ft->ft_to);
 
+	mtx_enter(&ft->ft_mtx);
 	mtx_enter(&kq->kq_lock);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
@@ -711,8 +756,9 @@ filt_timermodify(struct kevent *kev, str
 	kn->kn_data = 0;
 	knote_assign(kev, kn);
 	/* Reinit timeout to invoke tick adjustment again. */
-	timeout_set(to, filt_timerexpire, kn);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
 	filt_timeradd(kn, &ts);
+	mtx_leave(&ft->ft_mtx);
 
 	return (0);
 }
@@ -720,13 +766,14 @@ filt_timermodify(struct kevent *kev, str
 int
 filt_timerprocess(struct knote *kn, struct kevent *kev)
 {
-	int active, s;
+	struct filt_timer *ft = (struct filt_timer *)kn->kn_hook;
+	int active;
 
-	s = splsoftclock();
+	mtx_enter(&ft->ft_mtx);
 	active = (kn->kn_data != 0);
 	if (active)
 		knote_submit(kn, kev);
-	splx(s);
+	mtx_leave(&ft->ft_mtx);
 
 	return (active);
 }
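
For completeness, the kq_ntimeouts accounting that the follow-up above
fixes ("forgot to remove kq_ntimeouts++;") boils down to the optimistic,
lockless reservation below; kq_timeout_take() is a hypothetical name,
the logic is the one in filt_timerattach():

    #include <sys/atomic.h>
    #include <sys/errno.h>

    extern int kq_ntimeouts;        /* [a] current number of kq timers */
    extern int kq_timeoutmax;       /* [I] fixed upper bound */

    /* Sketch only: reserve one timer slot or fail with ENOMEM. */
    static int
    kq_timeout_take(void)
    {
            if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
                    /* Raced past the limit: give the slot back. */
                    atomic_dec_int(&kq_ntimeouts);
                    return (ENOMEM);
            }
            return (0);
    }

Near the limit, two racing attaches can transiently overshoot and one
may fail even though a slot frees up immediately; presumably that is an
accepted trade-off for not putting a mutex around the counter. The
counter itself cannot leak, since every successful increment is paired
with exactly one atomic_dec_int(9) in filt_timerdetach().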