From: Scott Cheloha
Subject: Re: clockintr: add clockintr_unbind()
To: tech@openbsd.org
Date: Sun, 4 Feb 2024 12:08:25 -0600

On Fri, Jan 26, 2024 at 12:27:18PM -0600, Scott Cheloha wrote:
> The last thing needed before we can move dt(4) out of the hardclock is
> a way to "destroy" a clockintr.
>
> This patch adds clockintr_unbind().  It is essentially the opposite
> of clockintr_bind().  An updated manpage documenting the interface is
> included with this patch.
>
> I'm sending out the dt(4) patch separately.  Take a look at that to
> see how the interface is used in practice.
>
> Things of note:
>
> - Both clockintr_cancel() and clockintr_unbind() need to cancel
>   the given clockintr, so this patch refactors the bulk of
>   clockintr_cancel() into clockintr_cancel_locked() where both
>   interfaces can use the code.
>
> - clockintr_cancel() does not block, so for behavioral symmetry
>   clockintr_unbind() does not block by default.  The caller may
>   pass the CL_BARRIER flag to set an execution barrier.  If
>   CL_BARRIER is passed and the clockintr's callback is executing,
>   the caller blocks until the callback returns.  We need the
>   execution barrier for dt(4).
>
> - Now that you can "destroy" a clockintr, there is room for
>   error when using the API.  In an effort to catch mistakes,
>   e.g. double binds, we now panic in clockintr_bind() if cl_queue
>   is not NULL.  Note that clockintr_unbind() resets cl_queue to
>   NULL, so correct API use will not panic.
>
> The only sort-of tricky thing here is the execution barrier.
> It goes like this:
>
> 1. If the caller passes CL_BARRIER to clockintr_unbind() and the
>    clockintr's callback is executing at that moment, the caller
>    sets CQ_NEED_WAKEUP and calls msleep_nsec().  There is no more
>    work for the caller to do, so we pass PNORELOCK.
>
> 2. When the callback function returns to clockintr_dispatch(),
>    the thread sees CQ_NEED_WAKEUP, clears it, and sends a wakeup.
>
> 3. The caller is awoken, returns from msleep_nsec(), and then
>    immediately returns from clockintr_unbind().
>
> Note that clockintr_cancel_locked() sets CQ_IGNORE_REQUEST before the
> caller sleeps during step (1), so clockintr_dispatch() does not touch
> the clockintr object during step (2).
>
> ok?

Ping.

Index: share/man/man9/clockintr_bind.9
===================================================================
RCS file: share/man/man9/clockintr_bind.9
diff -N share/man/man9/clockintr_bind.9
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ share/man/man9/clockintr_bind.9	4 Feb 2024 18:06:24 -0000
@@ -0,0 +1,296 @@
+.\" $OpenBSD$
+.\"
+.\" Copyright (c) 2023-2024 Scott Cheloha
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate$
+.Dt CLOCKINTR_BIND 9
+.Os
+.Sh NAME
+.Nm clockintr_bind ,
+.Nm clockintr_schedule ,
+.Nm clockintr_advance ,
+.Nm clockintr_cancel ,
+.Nm clockintr_unbind ,
+.Nm clockintr_stagger ,
+.Nm clockrequest_advance
+.Nd execute a function in a clock interrupt context
+.Sh SYNOPSIS
+.In sys/clockintr.h
+.Ft void
+.Fo clockintr_bind
+.Fa "struct clockintr *cl"
+.Fa "struct cpu_info *cpu"
+.Fa "void (*callback)(struct clockrequest *cr, void *cf, void *arg)"
+.Fa "void *arg"
+.Fc
+.Ft void
+.Fo clockintr_schedule
+.Fa "struct clockintr *cl"
+.Fa "uint64_t abs"
+.Fc
+.Ft uint64_t
+.Fo clockintr_advance
+.Fa "struct clockintr *cl"
+.Fa "uint64_t interval"
+.Fc
+.Ft void
+.Fo clockintr_cancel
+.Fa "struct clockintr *cl"
+.Fc
+.Ft void
+.Fo clockintr_unbind
+.Fa "struct clockintr *cl"
+.Fa "uint32_t flags"
+.Fc
+.Ft void
+.Fo clockintr_stagger
+.Fa "struct clockintr *cl"
+.Fa "uint64_t interval"
+.Fa "uint32_t numer"
+.Fa "uint32_t denom"
+.Fc
+.Ft uint64_t
+.Fo clockrequest_advance
+.Fa "struct clockrequest *cr"
+.Fa "uint64_t interval"
+.Fc
+.\" .Fn clockrequest_advance_random is intentionally undocumented.
+.\" It may be removed in the future. New code should not use it.
+.Sh DESCRIPTION
+The clock interrupt subsystem schedules functions for asynchronous execution
+from the clock interrupt context on a particular CPU.
+.Pp
+Clock interrupts are well-suited for timekeeping,
+scheduling,
+and statistical profiling.
+Applications with more relaxed latency requirements should use timeouts
+to schedule asynchronous execution;
+see
+.Xr timeout_add 9
+for details.
+.Pp
+The
+.Fn clockintr_bind
+function initializes the clock interrupt object
+.Fa cl .
+When
+.Fa cl
+expires,
+its
+.Fa callback
+function is executed from the
+.Dv IPL_CLOCK
+context on its host
+.Fa cpu
+without any locks or mutexes.
+The callback function must not block.
+Its parameters are as follows:
+.Bl -tag -width indent
+.It Fa cr
+A private
+.Vt clockrequest
+object.
+May be used to request rescheduling;
+see
+.Fn clockrequest_advance
+below.
+.It Fa cf
+The
+.Fa cpu Ns 's
+current machine-dependent clockframe.
+.It Fa arg
+The
+.Fa arg
+given to
+.Fn clockintr_bind .
+.El
+.Pp
+The memory pointed to by
+.Fa cl
+must be zeroed before it is first bound.
+It is an error to use
+.Fa cl
+as argument to any other function in the
+.Vt clockintr
+API before it is bound.
+It is an error to rebind
+.Fa cl
+without first unbinding it;
+see
+.Fn clockintr_unbind
+below.
+.Pp
+The
+.Fn clockintr_schedule
+function schedules
+.Fa cl
+to expire at the absolute time
+.Fa abs
+on the system uptime clock.
+The subsystem will never execute
+.Fa cl Ns 's
+callback function before this expiration time,
+though its execution may be delayed by other activity on the system.
+.Pp
+The
+.Fn clockintr_advance
+function schedules
+.Fa cl
+to expire at the next terminus of the given
+.Fa interval ,
+a non-zero count of nanoseconds,
+relative to
+.Fa cl Ns 's
+current expiration time.
+Periodic clock interrupts should be scheduled with
+.Fn clockintr_advance
+to keep the execution period from drifting.
+.Pp
+The
+.Fn clockintr_cancel
+function cancels any pending expiration of
+.Fa cl .
+.Pp
+The
+.Fn clockintr_unbind
+function cancels any pending expiration of
+.Fa cl
+and severs the binding between
+.Fa cl
+and its host
+.Fa cpu .
+Upon return,
+.Fa cl
+may be rebound with
+.Fn clockintr_bind .
+The call may be configured with zero or more of the following
+.Fa flags :
+.Bl -tag -width CL_BARRIER
+.It Dv CL_BARRIER
+If
+.Fa cl Ns 's
+callback function is executing,
+block until it returns.
+By default,
+the caller does not block.
+Useful when
+.Fa arg
+is a shared reference.
+.El
+.Pp
+The
+.Fn clockintr_stagger
+function resets
+.Fa cl Ns 's
+expiration time to a fraction of the given
+.Fa interval ,
+a count of nanoseconds.
+Specifically,
+.Fa cl Ns 's
+expiration time is reset to
+.Pq Fa interval Ms / Fa denom Ms * Fa numer .
+Periodic clock interrupts bound to multiple CPUs may be staggered
+to reduce the likelihood that their callback functions will execute
+simultaneously and compete for a shared resource.
+It is an error if
+.Fa numer
+is greater than or equal to
+.Fa denom .
+It is an error if
+.Fa cl
+is already scheduled to expire.
+.Pp
+The
+.Fn clockrequest_advance
+function is similar to
+.Fn clockintr_advance ,
+except that
+(a) it may only be called during the execution of a
+.Fa callback
+function,
+(b) it accepts a
+.Vt clockrequest
+pointer as argument,
+and (c) scheduling requests submitted with the interface are not fulfilled
+until the callback function returns.
+When the callback function returns,
+scheduling requests are only committed to the underlying clock interrupt
+object if that object was not manipulated during the execution of the
+callback function.
+Otherwise,
+outstanding requests are discarded.
+.Sh CONTEXT
+The
+.Fn clockintr_bind
+function may only be called from process context.
+.Pp
+The
+.Fn clockintr_advance ,
+.Fn clockintr_cancel ,
+.Fn clockintr_schedule ,
+and
+.Fn clockintr_stagger
+functions may be called from process context or from interrupt context.
+.Pp
+The
+.Fn clockintr_unbind
+function may normally be called from process context or from interrupt context.
+However,
+if the
+.Dv CL_BARRIER
+flag is set in
+.Fa flags ,
+the function may only be called from process context.
+.Pp
+The
+.Fn clockrequest_advance
+function may only be called during execution of a
+.Fa callback
+function.
+.Sh RETURN VALUES
+The
+.Fn clockintr_advance
+and
+.Fn clockrequest_advance
+functions return the number of
+.Fa interval Ns s
+that have elapsed since
+.Fa cl
+was scheduled to expire,
+or zero if
+.Fa cl
+has not yet expired.
+.Sh CODE REFERENCES
+.Pa sys/kern/kern_clockintr.c
+.Sh SEE ALSO
+.Xr microtime 9 ,
+.Xr spl 9 ,
+.Xr timeout 9
+.Rs
+.%A Richard McDougall
+.%A Jim Mauro
+.%B Solaris Internals: Solaris 10 and OpenSolaris Kernel Architecture
+.%I Prentice Hall
+.%I Sun Microsystems Press
+.%D 2nd Edition, 2007
+.%P pp. 912\(en925
+.Re
+.Sh HISTORY
+The
+.Vt clockintr
+and
+.Vt clockrequest
+APIs first appeared in
+.Ox 7.5 .
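To make the intended usage concrete, here is a rough sketch of how a
consumer might drive the interface documented above.  The "foo" driver,
its softc layout, and the 10ms period are invented for illustration;
only the clockintr/clockrequest calls themselves come from this patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/clockintr.h>

struct foo_softc {
	struct clockintr	sc_clockintr;	/* zeroed before first bind */
	uint64_t		sc_period;	/* nanoseconds */
};

/* Runs at IPL_CLOCK on the bound CPU; must not block. */
void
foo_intr(struct clockrequest *cr, void *cf, void *arg)
{
	struct foo_softc *sc = arg;

	/* ... sample whatever needs sampling ... */

	/*
	 * Request the next expiration.  The request is only committed
	 * once this callback returns, as described above.
	 */
	clockrequest_advance(cr, sc->sc_period);
}

/* Process context: bind to a CPU and schedule the first expiration. */
void
foo_attach_cpu(struct foo_softc *sc, struct cpu_info *ci)
{
	sc->sc_period = 10 * 1000 * 1000;	/* 10ms */
	clockintr_bind(&sc->sc_clockintr, ci, foo_intr, sc);
	clockintr_advance(&sc->sc_clockintr, sc->sc_period);
}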
Index: sys/sys/clockintr.h
===================================================================
RCS file: /cvs/src/sys/sys/clockintr.h,v
diff -u -p -r1.25 clockintr.h
--- sys/sys/clockintr.h	24 Jan 2024 19:23:38 -0000	1.25
+++ sys/sys/clockintr.h	4 Feb 2024 18:06:24 -0000
@@ -113,7 +113,8 @@ struct clockintr_queue {
 #define CQ_INIT			0x00000001	/* clockintr_cpu_init() done */
 #define CQ_INTRCLOCK		0x00000002	/* intrclock installed */
 #define CQ_IGNORE_REQUEST	0x00000004	/* ignore callback requests */
-#define CQ_STATE_MASK		0x00000007
+#define CQ_NEED_WAKEUP		0x00000008	/* caller at barrier */
+#define CQ_STATE_MASK		0x0000000f

 void clockintr_cpu_init(const struct intrclock *);
 int clockintr_dispatch(void *);
@@ -123,12 +124,16 @@ void clockintr_trigger(void);
 /*
  * Kernel API
  */
+#define CL_BARRIER	0x00000001	/* block if callback is running */
+#define CL_FLAG_MASK	0x00000001
+
 uint64_t clockintr_advance(struct clockintr *, uint64_t);
 void clockintr_bind(struct clockintr *, struct cpu_info *,
     void (*)(struct clockrequest *, void *, void *), void *);
 void clockintr_cancel(struct clockintr *);
 void clockintr_schedule(struct clockintr *, uint64_t);
 void clockintr_stagger(struct clockintr *, uint64_t, uint32_t, uint32_t);
+void clockintr_unbind(struct clockintr *, uint32_t);
 uint64_t clockrequest_advance(struct clockrequest *, uint64_t);
 uint64_t clockrequest_advance_random(struct clockrequest *, uint64_t, uint32_t);
 void clockqueue_init(struct clockintr_queue *);
Index: sys/kern/kern_clockintr.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
diff -u -p -r1.64 kern_clockintr.c
--- sys/kern/kern_clockintr.c	24 Jan 2024 19:23:38 -0000	1.64
+++ sys/kern/kern_clockintr.c	4 Feb 2024 18:06:24 -0000
@@ -31,6 +31,7 @@
 #include
 #include

+void clockintr_cancel_locked(struct clockintr *);
 void clockintr_hardclock(struct clockrequest *, void *, void *);
 void clockintr_schedule_locked(struct clockintr *, uint64_t);
 void clockqueue_intrclock_install(struct clockintr_queue *,
@@ -226,6 +227,12 @@ clockintr_dispatch(void *frame)
 			CLR(request->cr_flags, CR_RESCHEDULE);
 			clockqueue_pend_insert(cq, cl, request->cr_expiration);
 		}
+		if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
+			CLR(cq->cq_flags, CQ_NEED_WAKEUP);
+			mtx_leave(&cq->cq_mtx);
+			wakeup(&cq->cq_running);
+			mtx_enter(&cq->cq_mtx);
+		}
 		run++;
 	}

@@ -317,9 +324,20 @@ void
 clockintr_cancel(struct clockintr *cl)
 {
 	struct clockintr_queue *cq = cl->cl_queue;
-	int was_next;

 	mtx_enter(&cq->cq_mtx);
+	clockintr_cancel_locked(cl);
+	mtx_leave(&cq->cq_mtx);
+}
+
+void
+clockintr_cancel_locked(struct clockintr *cl)
+{
+	struct clockintr_queue *cq = cl->cl_queue;
+	int was_next;
+
+	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
+
 	if (ISSET(cl->cl_flags, CLST_PENDING)) {
 		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
 		clockqueue_pend_delete(cq, cl);
@@ -332,7 +350,6 @@ clockintr_cancel(struct clockintr *cl)
 	}
 	if (cl == cq->cq_running)
 		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
-	mtx_leave(&cq->cq_mtx);
 }

 void
@@ -341,13 +358,41 @@ clockintr_bind(struct clockintr *cl, str
 {
 	struct clockintr_queue *cq = &ci->ci_queue;

+	splassert(IPL_NONE);
+
+	if (cl->cl_queue != NULL)
+		panic("%s: cl_queue is not NULL", __func__);
+
+	mtx_enter(&cq->cq_mtx);
 	cl->cl_arg = arg;
 	cl->cl_func = func;
 	cl->cl_queue = cq;
-
-	mtx_enter(&cq->cq_mtx);
 	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
 	mtx_leave(&cq->cq_mtx);
+}
+
+void
+clockintr_unbind(struct clockintr *cl, uint32_t flags)
+{
+	struct clockintr_queue *cq = cl->cl_queue;
+
+	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
+
+	mtx_enter(&cq->cq_mtx);
+
+	clockintr_cancel_locked(cl);
+
+	cl->cl_arg = NULL;
+	cl->cl_func = NULL;
+	cl->cl_queue = NULL;
+	TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);
+
+	if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
+		SET(cq->cq_flags, CQ_NEED_WAKEUP);
+		msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
+		    "clkbar", INFSLP);
+	} else
+		mtx_leave(&cq->cq_mtx);
 }

 void
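For completeness, a rough sketch of the matching teardown path for the
consumer sketched earlier.  foo_detach_cpu() is an invented name; the
point is the CL_BARRIER semantics introduced by this diff.

/*
 * Process context only: with CL_BARRIER the call may sleep until the
 * callback has finished running on the remote CPU, so it is safe to
 * free or reuse the softc afterwards.
 */
void
foo_detach_cpu(struct foo_softc *sc)
{
	clockintr_unbind(&sc->sc_clockintr, CL_BARRIER);

	/*
	 * clockintr_unbind() resets cl_queue to NULL, so a later
	 * clockintr_bind() of the same object will not trip the
	 * double-bind panic.
	 */
}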