
From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: Re: kevent(2): make EVFILT_TIMER mp-safe
To: Visa Hankala <visa@hankala.org>
Cc: tech@openbsd.org
Date: Thu, 8 May 2025 19:39:12 +0300

On Thu, May 08, 2025 at 02:25:04PM +0000, Visa Hankala wrote:
> On Thu, May 08, 2025 at 03:43:13AM +0300, Vitaliy Makkoveev wrote:
> > On Thu, May 08, 2025 at 03:32:22AM +0300, Vitaliy Makkoveev wrote:
> > > This is the last kernel-locked filter in kern/kern_event.c.
> > > 
> > > Introduce an opaque 'filt_timer' structure to keep the timeout(9),
> > > the `ft_reschedule' flag and the protecting mutex(9). The timeout(9)
> > > handler can reschedule itself, so `ft_reschedule' is used to prevent
> > > this while we are canceling the timeout. The `ft_mtx' mutex(9) also
> > > protects the re-initialization, because it could be accessed
> > > concurrently from another filt_timermodify() thread.
> 
> Concurrent filt_timermodify() calls cannot happen, though, because
> the kqueue subsystem serializes the same knote's f_attach, f_detach,
> f_modify, and f_process calls automatically.
> 

Nice to hear. So I only need serialization with the running timeout
handler. I removed the recursion from filt_timeradd()/filt_timerexpire()
to reduce `ft_mtx' usage.

The type casts were also removed.
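
To make the locking scheme easier to review, here is the core pattern the
diff below implements, distilled into a standalone sketch. The
handler_sketch()/cancel_sketch() names are hypothetical; the struct fields
and the timeout(9)/mutex(9) calls match the diff:

	#include <sys/mutex.h>
	#include <sys/timeout.h>

	struct filt_timer {
		struct mutex	ft_mtx;		/* protects ft_reschedule */
		struct timeout	ft_to;
		int		ft_reschedule;	/* [t] may the handler rearm? */
	};

	/* Timeout(9) handler side: rearm only while `ft_reschedule' is set. */
	void
	handler_sketch(void *arg)
	{
		struct filt_timer *ft = arg;

		mtx_enter(&ft->ft_mtx);
		/* ... bump kn_data and activate the knote here ... */
		if (ft->ft_reschedule)
			timeout_add(&ft->ft_to, 1); /* real code recomputes ticks */
		mtx_leave(&ft->ft_mtx);
	}

	/* Detach/modify side: forbid rearming, then wait out a running handler. */
	void
	cancel_sketch(struct filt_timer *ft)
	{
		mtx_enter(&ft->ft_mtx);
		ft->ft_reschedule = 0;
		mtx_leave(&ft->ft_mtx);
		timeout_del_barrier(&ft->ft_to);
	}

Clearing the flag under `ft_mtx' before timeout_del_barrier(9) closes the
window where an already-running handler could re-add the timeout after the
barrier has returned.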

Index: sys/kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.201
diff -u -p -r1.201 kern_event.c
--- sys/kern/kern_event.c	10 Feb 2025 16:45:46 -0000	1.201
+++ sys/kern/kern_event.c	8 May 2025 15:59:50 -0000
@@ -172,7 +172,7 @@ const struct filterops file_filtops = {
 };
 
 const struct filterops timer_filtops = {
-	.f_flags	= 0,
+	.f_flags	= FILTEROP_MPSAFE,
 	.f_attach	= filt_timerattach,
 	.f_detach	= filt_timerdetach,
 	.f_event	= NULL,
@@ -180,12 +180,19 @@ const struct filterops timer_filtops = {
 	.f_process	= filt_timerprocess,
 };
 
+/*
+ * Locking:
+ *	I	immutable after creation
+ *	a	atomic operations
+ *	t	ft_mtx
+ */
+
 struct	pool knote_pool;
 struct	pool kqueue_pool;
 struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct	rwlock kqueue_ps_list_lock = RWLOCK_INITIALIZER("kqpsl");
-int kq_ntimeouts = 0;
-int kq_timeoutmax = (4 * 1024);
+int kq_ntimeouts = 0;		/* [a] */
+int kq_timeoutmax = (4 * 1024);	/* [I] */
 
 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
 
@@ -598,50 +605,81 @@ filt_timervalidate(int sfflags, int64_t 
 	return (0);
 }
 
+struct filt_timer {
+	struct mutex	ft_mtx;
+	struct timeout	ft_to;
+	int		ft_reschedule;	/* [t] */
+};
+
 static void
 filt_timeradd(struct knote *kn, struct timespec *ts)
 {
 	struct timespec expiry, now;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = kn->kn_hook;
 	int tticks;
 
+	/*
+	 * Unlocked `ft_reschedule' and `kn_data' access is fine,
+	 * we have no concurrent threads at this point.
+	 */
+
+	ft->ft_reschedule = ((kn->kn_flags & EV_ONESHOT) == 0 &&
+	    (kn->kn_sfflags & NOTE_ABSTIME) == 0);
+
 	if (kn->kn_sfflags & NOTE_ABSTIME) {
 		nanotime(&now);
 		if (timespeccmp(ts, &now, >)) {
 			timespecsub(ts, &now, &expiry);
 			/* XXX timeout_abs_ts with CLOCK_REALTIME */
-			timeout_add(to, tstohz(&expiry));
+			timeout_add(&ft->ft_to, tstohz(&expiry));
 		} else {
+			struct kqueue *kq = kn->kn_kq;
+
 			/* Expire immediately. */
-			filt_timerexpire(kn);
+			kn->kn_data++;
+			mtx_enter(&kq->kq_lock);
+			knote_activate(kn);
+			mtx_leave(&kq->kq_lock);
 		}
 		return;
 	}
 
 	tticks = tstohz(ts);
 	/* Remove extra tick from tstohz() if timeout has fired before. */
-	if (timeout_triggered(to))
+	if (timeout_triggered(&ft->ft_to))
 		tticks--;
-	timeout_add(to, (tticks > 0) ? tticks : 1);
+	timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
 }
 
 void
 filt_timerexpire(void *knx)
 {
-	struct timespec ts;
 	struct knote *kn = knx;
 	struct kqueue *kq = kn->kn_kq;
+	struct filt_timer *ft = kn->kn_hook;
 
+	mtx_enter(&ft->ft_mtx);
 	kn->kn_data++;
+
 	mtx_enter(&kq->kq_lock);
 	knote_activate(kn);
 	mtx_leave(&kq->kq_lock);
 
-	if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-	    (kn->kn_sfflags & NOTE_ABSTIME) == 0) {
+	if (ft->ft_reschedule) {
+		struct timespec ts;
+		int tticks;
+
 		(void)filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
-		filt_timeradd(kn, &ts);
+		tticks = tstohz(&ts);
+		/*
+		 * Remove extra tick from tstohz() if timeout has
+		 * fired before.
+		 */
+		if (timeout_triggered(&ft->ft_to))
+			tticks--;
+		timeout_add(&ft->ft_to, (tticks > 0) ? tticks : 1);
 	}
+	mtx_leave(&ft->ft_mtx);
 }
 
 /*
@@ -651,22 +689,25 @@ int
 filt_timerattach(struct knote *kn)
 {
 	struct timespec ts;
-	struct timeout *to;
+	struct filt_timer *ft;
 	int error;
 
 	error = filt_timervalidate(kn->kn_sfflags, kn->kn_sdata, &ts);
 	if (error != 0)
 		return (error);
 
-	if (kq_ntimeouts > kq_timeoutmax)
+	if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
+		atomic_dec_int(&kq_ntimeouts);
 		return (ENOMEM);
-	kq_ntimeouts++;
+	}
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
-	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
-	timeout_set(to, filt_timerexpire, kn);
-	kn->kn_hook = to;
+
+	ft = malloc(sizeof(*ft), M_KEVENT, M_WAITOK);
+	mtx_init(&ft->ft_mtx, IPL_MPSAFE);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
+	kn->kn_hook = ft;
 	filt_timeradd(kn, &ts);
 
 	return (0);
@@ -675,12 +716,15 @@ filt_timerattach(struct knote *kn)
 void
 filt_timerdetach(struct knote *kn)
 {
-	struct timeout *to;
+	struct filt_timer *ft = kn->kn_hook;
 
-	to = (struct timeout *)kn->kn_hook;
-	timeout_del_barrier(to);
-	free(to, M_KEVENT, sizeof(*to));
-	kq_ntimeouts--;
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
+
+	timeout_del_barrier(&ft->ft_to);
+	free(ft, M_KEVENT, sizeof(*ft));
+	atomic_dec_int(&kq_ntimeouts);
 }
 
 int
@@ -688,7 +732,7 @@ filt_timermodify(struct kevent *kev, str
 {
 	struct timespec ts;
 	struct kqueue *kq = kn->kn_kq;
-	struct timeout *to = kn->kn_hook;
+	struct filt_timer *ft = kn->kn_hook;
 	int error;
 
 	error = filt_timervalidate(kev->fflags, kev->data, &ts);
@@ -699,8 +743,11 @@ filt_timermodify(struct kevent *kev, str
 	}
 
 	/* Reset the timer. Any pending events are discarded. */
+	mtx_enter(&ft->ft_mtx);
+	ft->ft_reschedule = 0;
+	mtx_leave(&ft->ft_mtx);
 
-	timeout_del_barrier(to);
+	timeout_del_barrier(&ft->ft_to);
 
 	mtx_enter(&kq->kq_lock);
 	if (kn->kn_status & KN_QUEUED)
@@ -711,7 +758,7 @@ filt_timermodify(struct kevent *kev, str
 	kn->kn_data = 0;
 	knote_assign(kev, kn);
 	/* Reinit timeout to invoke tick adjustment again. */
-	timeout_set(to, filt_timerexpire, kn);
+	timeout_set(&ft->ft_to, filt_timerexpire, kn);
 	filt_timeradd(kn, &ts);
 
 	return (0);
@@ -720,13 +767,14 @@ filt_timermodify(struct kevent *kev, str
 int
 filt_timerprocess(struct knote *kn, struct kevent *kev)
 {
-	int active, s;
+	struct filt_timer *ft = kn->kn_hook;
+	int active;
 
-	s = splsoftclock();
+	mtx_enter(&ft->ft_mtx);
 	active = (kn->kn_data != 0);
 	if (active)
 		knote_submit(kn, kev);
-	splx(s);
+	mtx_leave(&ft->ft_mtx);
 
 	return (active);
 }
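
For completeness, the kq_ntimeouts accounting above is the usual optimistic
increment-then-check pattern for an atomically maintained limit. A minimal
sketch with hypothetical timer_ref()/timer_unref() wrappers; the
atomic_inc_int_nv(9) and atomic_dec_int(9) primitives are the real ones
used in the diff:

	#include <sys/atomic.h>

	extern int kq_ntimeouts;	/* [a] current number of timers */
	extern int kq_timeoutmax;	/* [I] hard limit */

	static int
	timer_ref(void)
	{
		/* Take a slot optimistically; back out if over the limit. */
		if (atomic_inc_int_nv(&kq_ntimeouts) > kq_timeoutmax) {
			atomic_dec_int(&kq_ntimeouts);
			return (ENOMEM);
		}
		return (0);
	}

	static void
	timer_unref(void)
	{
		atomic_dec_int(&kq_ntimeouts);
	}

Since kq_timeoutmax is immutable after creation, the unlocked comparison is
safe; only the counter itself needs atomicity.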