From: Scott Cheloha
Subject: Re: clockintr: add clockintr_unbind()
To: tech@openbsd.org
Date: Sun, 4 Feb 2024 12:08:25 -0600

On Fri, Jan 26, 2024 at 12:27:18PM -0600, Scott Cheloha wrote:
> The last thing needed before we can move dt(4) out of the hardclock is
> a way to "destroy" a clockintr.
>
> This patch adds clockintr_unbind().  It is essentially the opposite
> of clockintr_bind().  An updated manpage documenting the interface is
> included with this patch.
>
> I'm sending out the dt(4) patch separately.  Take a look at that to
> see how the interface is used in practice.
>
> Things of note:
>
> - Both clockintr_cancel() and clockintr_unbind() need to cancel
>   the given clockintr, so this patch refactors the bulk of
>   clockintr_cancel() into clockintr_cancel_locked() where both
>   interfaces can use the code.
>
> - clockintr_cancel() does not block, so for behavioral symmetry
>   clockintr_unbind() does not block by default.  The caller may
>   pass the CL_BARRIER flag to set an execution barrier.  If
>   CL_BARRIER is passed and the clockintr's callback is executing,
>   the caller blocks until the callback returns.  We need the
>   execution barrier for dt(4).
>
> - Now that you can "destroy" a clockintr, there is room for
>   error when using the API.  In an effort to catch mistakes,
>   e.g. double binds, we now panic in clockintr_bind() if cl_queue
>   is not NULL.  Note that clockintr_unbind() resets cl_queue to
>   NULL, so correct API use will not panic.
>
> The only sort-of tricky thing here is the execution barrier.
> It goes like this:
>
> 1. If the caller passes CL_BARRIER to clockintr_unbind() and the
>    clockintr's callback is executing at that moment, the caller
>    sets CQ_NEED_WAKEUP and calls msleep_nsec().  There is no more
>    work for the caller to do, so we pass PNORELOCK.
>
> 2. When the callback function returns to clockintr_dispatch(),
>    the thread sees CQ_NEED_WAKEUP, clears it, and sends a wakeup.
>
> 3. The caller is awoken, returns from msleep_nsec(), and then
>    immediately returns from clockintr_unbind().
>
> Note that clockintr_cancel_locked() sets CQ_IGNORE_REQUEST before the
> caller sleeps during step (1), so clockintr_dispatch() does not touch
> the clockintr object during step (2).
>
> ok?

Ping.

Index: share/man/man9/clockintr_bind.9
===================================================================
RCS file: share/man/man9/clockintr_bind.9
diff -N share/man/man9/clockintr_bind.9
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ share/man/man9/clockintr_bind.9	4 Feb 2024 18:06:24 -0000
@@ -0,0 +1,296 @@
+.\" $OpenBSD$
+.\"
+.\" Copyright (c) 2023-2024 Scott Cheloha
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate$
+.Dt CLOCKINTR_BIND 9
+.Os
+.Sh NAME
+.Nm clockintr_bind ,
+.Nm clockintr_schedule ,
+.Nm clockintr_advance ,
+.Nm clockintr_cancel ,
+.Nm clockintr_unbind ,
+.Nm clockintr_stagger ,
+.Nm clockrequest_advance
+.Nd execute a function in a clock interrupt context
+.Sh SYNOPSIS
+.In sys/clockintr.h
+.Ft void
+.Fo clockintr_bind
+.Fa "struct clockintr *cl"
+.Fa "struct cpu_info *cpu"
+.Fa "void (*callback)(struct clockrequest *cr, void *cf, void *arg)"
+.Fa "void *arg"
+.Fc
+.Ft void
+.Fo clockintr_schedule
+.Fa "struct clockintr *cl"
+.Fa "uint64_t abs"
+.Fc
+.Ft uint64_t
+.Fo clockintr_advance
+.Fa "struct clockintr *cl"
+.Fa "uint64_t interval"
+.Fc
+.Ft void
+.Fo clockintr_cancel
+.Fa "struct clockintr *cl"
+.Fc
+.Ft void
+.Fo clockintr_unbind
+.Fa "struct clockintr *cl"
+.Fa "uint32_t flags"
+.Fc
+.Ft void
+.Fo clockintr_stagger
+.Fa "struct clockintr *cl"
+.Fa "uint64_t interval"
+.Fa "uint32_t numer"
+.Fa "uint32_t denom"
+.Fc
+.Ft uint64_t
+.Fo clockrequest_advance
+.Fa "struct clockrequest *cr"
+.Fa "uint64_t interval"
+.Fc
+.\" .Fn clockrequest_advance_random is intentionally undocumented.
+.\" It may be removed in the future. New code should not use it.
+.Sh DESCRIPTION
+The clock interrupt subsystem schedules functions for asynchronous execution
+from the clock interrupt context on a particular CPU.
+.Pp
+Clock interrupts are well-suited for timekeeping,
+scheduling,
+and statistical profiling.
+Applications with more relaxed latency requirements should use timeouts
+to schedule asynchronous execution;
+see
+.Xr timeout_add 9
+for details.
+.Pp
+The
+.Fn clockintr_bind
+function initializes the clock interrupt object
+.Fa cl .
+When
+.Fa cl
+expires,
+its
+.Fa callback
+function is executed from the
+.Dv IPL_CLOCK
+context on its host
+.Fa cpu
+without any locks or mutexes.
+The callback function must not block.
+Its parameters are as follows:
+.Bl -tag -width indent
+.It Fa cr
+A private
+.Vt clockrequest
+object.
+May be used to request rescheduling;
+see
+.Fn clockrequest_advance
+below.
+.It Fa cf
+The
+.Fa cpu Ns 's
+current machine-dependent clockframe.
+.It Fa arg
+The
+.Fa arg
+given to
+.Fn clockintr_bind .
+.El
+.Pp
+The memory pointed to by
+.Fa cl
+must be zeroed before it is first bound.
+It is an error to use
+.Fa cl
+as argument to any other function in the
+.Vt clockintr
+API before it is bound.
+It is an error to rebind
+.Fa cl
+without first unbinding it;
+see
+.Fn clockintr_unbind
+below.
+.Pp
+The
+.Fn clockintr_schedule
+function schedules
+.Fa cl
+to expire at the absolute time
+.Fa abs
+on the system uptime clock.
+The subsystem will never execute
+.Fa cl Ns 's
+callback function before this expiration time,
+though its execution may be delayed by other activity on the system.
+.Pp
+The
+.Fn clockintr_advance
+function schedules
+.Fa cl
+to expire at the next terminus of the given
+.Fa interval ,
+a non-zero count of nanoseconds,
+relative to
+.Fa cl Ns 's
+current expiration time.
+Periodic clock interrupts should be scheduled with
+.Fn clockintr_advance
+to keep the execution period from drifting.
+.Pp
+The
+.Fn clockintr_cancel
+function cancels any pending expiration of
+.Fa cl .
+.Pp
+The
+.Fn clockintr_unbind
+function cancels any pending expiration of
+.Fa cl
+and severs the binding between
+.Fa cl
+and its host
+.Fa cpu .
+Upon return,
+.Fa cl
+may be rebound with
+.Fn clockintr_bind .
+The call may be configured with zero or more of the following
+.Fa flags :
+.Bl -tag -width CL_BARRIER
+.It Dv CL_BARRIER
+If
+.Fa cl Ns 's
+callback function is executing,
+block until it returns.
+By default,
+the caller does not block.
+Useful when
+.Fa arg
+is a shared reference.
+.El
+.Pp
+The
+.Fn clockintr_stagger
+function resets
+.Fa cl Ns 's
+expiration time to a fraction of the given
+.Fa interval ,
+a count of nanoseconds.
+Specifically,
+.Fa cl Ns 's
+expiration time is reset to
+.Pq Fa interval Ms / Fa denom Ms * Fa numer .
+Periodic clock interrupts bound to multiple CPUs may be staggered
+to reduce the likelihood that their callback functions will execute
+simultaneously and compete for a shared resource.
+It is an error if
+.Fa numer
+is greater than or equal to
+.Fa denom .
+It is an error if
+.Fa cl
+is already scheduled to expire.
+.Pp
+The
+.Fn clockrequest_advance
+function is similar to
+.Fn clockintr_advance ,
+except that
+(a) it may only be called during the execution of a
+.Fa callback
+function,
+(b) it accepts a
+.Vt clockrequest
+pointer as argument,
+and (c) scheduling requests submitted with the interface are not fulfilled
+until the callback function returns.
+When the callback function returns,
+scheduling requests are only committed to the underlying clock interrupt
+object if that object was not manipulated during the execution of the
+callback function.
+Otherwise,
+outstanding requests are discarded.
+.Sh CONTEXT
+The
+.Fn clockintr_bind
+function may only be called from process context.
+.Pp
+The
+.Fn clockintr_advance ,
+.Fn clockintr_cancel ,
+.Fn clockintr_schedule ,
+and
+.Fn clockintr_stagger
+functions may be called from process context or from interrupt context.
+.Pp
+The
+.Fn clockintr_unbind
+function may normally be called from process context or from interrupt context.
+However,
+if the
+.Dv CL_BARRIER
+flag is set in
+.Fa flags ,
+the function may only be called from process context.
+.Pp
+The
+.Fn clockrequest_advance
+function may only be called during execution of a
+.Fa callback
+function.
+.Sh RETURN VALUES
+The
+.Fn clockintr_advance
+and
+.Fn clockrequest_advance
+functions return the number of
+.Fa interval Ns s
+that have elapsed since
+.Fa cl
+was scheduled to expire,
+or zero if
+.Fa cl
+has not yet expired.
+.Sh CODE REFERENCES
+.Pa sys/kern/kern_clockintr.c
+.Sh SEE ALSO
+.Xr microtime 9 ,
+.Xr spl 9 ,
+.Xr timeout 9
+.Rs
+.%A Richard McDougall
+.%A Jim Mauro
+.%B Solaris Internals: Solaris 10 and OpenSolaris Kernel Architecture
+.%I Prentice Hall
+.%I Sun Microsystems Press
+.%D 2nd Edition, 2007
+.%P pp. 912\(en925
+.Re
+.Sh HISTORY
+The
+.Vt clockintr
+and
+.Vt clockrequest
+APIs first appeared in
+.Ox 7.5 .
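To make the intended usage concrete, here is a rough sketch of how a
consumer might drive the interface documented above.  The "foo" driver,
its softc layout, and the 10ms period are invented for illustration;
only the clockintr/clockrequest calls themselves come from this patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/clockintr.h>

struct foo_softc {
	struct clockintr	sc_clockintr;	/* zeroed before first bind */
	uint64_t		sc_period;	/* nanoseconds */
};

/* Runs at IPL_CLOCK on the bound CPU; must not block. */
void
foo_intr(struct clockrequest *cr, void *cf, void *arg)
{
	struct foo_softc *sc = arg;

	/* ... sample whatever needs sampling ... */

	/*
	 * Request the next expiration.  The request is only committed
	 * once this callback returns, as described above.
	 */
	clockrequest_advance(cr, sc->sc_period);
}

/* Process context: bind to a CPU and schedule the first expiration. */
void
foo_attach_cpu(struct foo_softc *sc, struct cpu_info *ci)
{
	sc->sc_period = 10 * 1000 * 1000;	/* 10ms */
	clockintr_bind(&sc->sc_clockintr, ci, foo_intr, sc);
	clockintr_advance(&sc->sc_clockintr, sc->sc_period);
}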
Index: sys/sys/clockintr.h
===================================================================
RCS file: /cvs/src/sys/sys/clockintr.h,v
diff -u -p -r1.25 clockintr.h
--- sys/sys/clockintr.h	24 Jan 2024 19:23:38 -0000	1.25
+++ sys/sys/clockintr.h	4 Feb 2024 18:06:24 -0000
@@ -113,7 +113,8 @@ struct clockintr_queue {
 #define CQ_INIT			0x00000001	/* clockintr_cpu_init() done */
 #define CQ_INTRCLOCK		0x00000002	/* intrclock installed */
 #define CQ_IGNORE_REQUEST	0x00000004	/* ignore callback requests */
-#define CQ_STATE_MASK		0x00000007
+#define CQ_NEED_WAKEUP		0x00000008	/* caller at barrier */
+#define CQ_STATE_MASK		0x0000000f

 void clockintr_cpu_init(const struct intrclock *);
 int clockintr_dispatch(void *);
@@ -123,12 +124,16 @@ void clockintr_trigger(void);
 /*
  * Kernel API
  */
+#define CL_BARRIER	0x00000001	/* block if callback is running */
+#define CL_FLAG_MASK	0x00000001
+
 uint64_t clockintr_advance(struct clockintr *, uint64_t);
 void clockintr_bind(struct clockintr *, struct cpu_info *,
     void (*)(struct clockrequest *, void *, void *), void *);
 void clockintr_cancel(struct clockintr *);
 void clockintr_schedule(struct clockintr *, uint64_t);
 void clockintr_stagger(struct clockintr *, uint64_t, uint32_t, uint32_t);
+void clockintr_unbind(struct clockintr *, uint32_t);
 uint64_t clockrequest_advance(struct clockrequest *, uint64_t);
 uint64_t clockrequest_advance_random(struct clockrequest *, uint64_t, uint32_t);
 void clockqueue_init(struct clockintr_queue *);
Index: sys/kern/kern_clockintr.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
diff -u -p -r1.64 kern_clockintr.c
--- sys/kern/kern_clockintr.c	24 Jan 2024 19:23:38 -0000	1.64
+++ sys/kern/kern_clockintr.c	4 Feb 2024 18:06:24 -0000
@@ -31,6 +31,7 @@
 #include
 #include

+void clockintr_cancel_locked(struct clockintr *);
 void clockintr_hardclock(struct clockrequest *, void *, void *);
 void clockintr_schedule_locked(struct clockintr *, uint64_t);
 void clockqueue_intrclock_install(struct clockintr_queue *,
@@ -226,6 +227,12 @@ clockintr_dispatch(void *frame)
 			CLR(request->cr_flags, CR_RESCHEDULE);
 			clockqueue_pend_insert(cq, cl, request->cr_expiration);
 		}
+		if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
+			CLR(cq->cq_flags, CQ_NEED_WAKEUP);
+			mtx_leave(&cq->cq_mtx);
+			wakeup(&cq->cq_running);
+			mtx_enter(&cq->cq_mtx);
+		}
 		run++;
 	}

@@ -317,9 +324,20 @@ void
 clockintr_cancel(struct clockintr *cl)
 {
 	struct clockintr_queue *cq = cl->cl_queue;
-	int was_next;

 	mtx_enter(&cq->cq_mtx);
+	clockintr_cancel_locked(cl);
+	mtx_leave(&cq->cq_mtx);
+}
+
+void
+clockintr_cancel_locked(struct clockintr *cl)
+{
+	struct clockintr_queue *cq = cl->cl_queue;
+	int was_next;
+
+	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
+
 	if (ISSET(cl->cl_flags, CLST_PENDING)) {
 		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
 		clockqueue_pend_delete(cq, cl);
@@ -332,7 +350,6 @@ clockintr_cancel(struct clockintr *cl)
 	}
 	if (cl == cq->cq_running)
 		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
-	mtx_leave(&cq->cq_mtx);
 }

 void
@@ -341,13 +358,41 @@ clockintr_bind(struct clockintr *cl, str
 {
 	struct clockintr_queue *cq = &ci->ci_queue;

+	splassert(IPL_NONE);
+
+	if (cl->cl_queue != NULL)
+		panic("%s: cl_queue is not NULL", __func__);
+
+	mtx_enter(&cq->cq_mtx);
 	cl->cl_arg = arg;
 	cl->cl_func = func;
 	cl->cl_queue = cq;
-
-	mtx_enter(&cq->cq_mtx);
 	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
 	mtx_leave(&cq->cq_mtx);
+}
+
+void
+clockintr_unbind(struct clockintr *cl, uint32_t flags)
+{
+	struct clockintr_queue *cq = cl->cl_queue;
+
+	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
+
+	mtx_enter(&cq->cq_mtx);
+
+	clockintr_cancel_locked(cl);
+
+	cl->cl_arg = NULL;
+	cl->cl_func = NULL;
+	cl->cl_queue = NULL;
+	TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);
+
+	if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
+		SET(cq->cq_flags, CQ_NEED_WAKEUP);
+		msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
+		    "clkbar", INFSLP);
+	} else
+		mtx_leave(&cq->cq_mtx);
 }

 void
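For completeness, a rough sketch of the matching teardown path for the
consumer sketched earlier.  foo_detach_cpu() is an invented name; the
point is the CL_BARRIER semantics introduced by this diff.

/*
 * Process context only: with CL_BARRIER the call may sleep until the
 * callback has finished running on the remote CPU, so it is safe to
 * free or reuse the softc afterwards.
 */
void
foo_detach_cpu(struct foo_softc *sc)
{
	clockintr_unbind(&sc->sc_clockintr, CL_BARRIER);

	/*
	 * clockintr_unbind() resets cl_queue to NULL, so a later
	 * clockintr_bind() of the same object will not trip the
	 * double-bind panic.
	 */
}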