clockintr: add clockintr_unbind()
On Mon, Feb 05, 2024 at 05:02:31PM +0100, Martin Pieuchot wrote:
> On 26/01/24(Fri) 12:27, Scott Cheloha wrote:
> > The last thing needed before we can move dt(4) out of the hardclock is
> > a way to "destroy" a clockintr.
> >
> > This patch adds clockintr_unbind(). It is essentially the opposite
> > of clockintr_bind(). An updated manpage documenting the interface is
> > included with this patch.
> >
> > I'm sending out the dt(4) patch separately. Take a look at that to
> > see how the interface is used in practice.
> >
> > Things of note:
> >
> > - Both clockintr_cancel() and clockintr_unbind() need to cancel
> > the given clockintr, so this patch refactors the bulk of
> > clockintr_cancel() into clockintr_cancel_locked() where both
> > interfaces can use the code.
> >
> > - clockintr_cancel() does not block, so for behavioral symmetry
> > clockintr_unbind() does not block by default. The caller may
> > pass the CL_BARRIER flag to set an execution barrier. If
> > CL_BARRIER is passed and the clockintr's callback is executing,
> > the caller blocks until the callback returns. We need the
> > execution barrier for dt(4).
> >
> > - Now that you can "destroy" a clockintr, there is room for
> > error when using the API. In an effort to catch mistakes,
> > e.g. double binds, we now panic in clockintr_bind() if cl_queue
> > is not NULL. Note that clockintr_unbind() resets cl_queue to
> > NULL, so correct API use will not panic.
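
To make the intended lifecycle concrete, here is a quick sketch.  The
"foo" driver, its softc, and its functions are made up; only the
clockintr calls are the real interface:

#include <sys/clockintr.h>

struct foo_softc {
        struct clockintr sc_clockintr;  /* allocated zeroed, e.g. M_ZERO */
};

void foo_intr(struct clockrequest *, void *, void *);  /* the callback */

void
foo_attach(struct foo_softc *sc, struct cpu_info *ci)
{
        /* Process context only.  Binding twice without an unbind panics. */
        clockintr_bind(&sc->sc_clockintr, ci, foo_intr, sc);
}

void
foo_detach(struct foo_softc *sc)
{
        /*
         * Cancels any pending expiration and resets cl_queue to NULL,
         * so a later foo_attach() may rebind without panicking.
         */
        clockintr_unbind(&sc->sc_clockintr, 0);
}
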
> >
> > The only sort-of tricky thing here is the execution barrier.
> > It goes like this:
> >
> > 1. If the caller passes CL_BARRIER to clockintr_unbind() and the
> > clockintr's callback is executing at that moment, the caller
> > sets CQ_NEED_WAKEUP and calls msleep_nsec(). There is no more
> > work for the caller to do, so we pass PNORELOCK.
> >
> > 2. When the callback function returns to clockintr_dispatch(),
> > the thread sees CQ_NEED_WAKEUP, clears it, and sends a wakeup.
> >
> > 3. The caller is awoken, returns from msleep_nsec(), and then
> > immediately returns from clockintr_unbind().
> >
> > Note that clockintr_cancel_locked() sets CQ_IGNORE_REQUEST before the
> > caller sleeps during step (1), so clockintr_dispatch() does not touch
> > the clockintr object during step (2).
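
From the caller's side, the barrier just means that once
clockintr_unbind() returns it is safe to tear down state shared with
the callback.  Roughly (an illustrative sketch, not the actual dt(4)
change):

void
example_teardown(struct clockintr *cl, void *arg)
{
        /*
         * With CL_BARRIER, clockintr_unbind() does not return while an
         * invocation of the callback is still in progress on the host
         * CPU, so afterwards nothing else can be dereferencing arg.
         * Only legal from process context.
         */
        clockintr_unbind(cl, CL_BARRIER);

        /* ... arg may now be freed or reused ... */
}
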
>
> Running the btrace(8) regression test with these two diffs gets me the
> following panic:
>
> uvm_fault(0xfffffd8160028168, 0x20, 0, 1) -> e
> kernel: page fault trap, code=0
> Stopped at mtx_enter_try+0x42: movl 0x8(%rdi),%edi
> TID PID UID PRFLAGS PFLAGS CPU COMMAND
> *197451 64462 0 0x18000003 0 1K btrace
> mtx_enter_try(18) at mtx_enter_try+0x42
> mtx_enter(18) at mtx_enter+0x35
> clockintr_advance(ffff8000007bc860,0) at clockintr_advance+0x36
> dt_ioctl_record_start(ffff8000007a1a00) at dt_ioctl_record_start+0x8b
> VOP_IOCTL(fffffd8136693a70,80044403,ffff80002d154600,1,fffffd817f7c55b0,ffff80002d0702a8) at VOP_IOCTL+0x5d
> vn_ioctl(fffffd815f8353d0,80044403,ffff80002d154600,ffff80002d0702a8) at vn_ioctl+0x79
> sys_ioctl(ffff80002d0702a8,ffff80002d1547d0,ffff80002d154740) at sys_ioctl+0x2c4
> syscall(ffff80002d1547d0) at syscall+0x55b
> Xsyscall() at Xsyscall+0x128
>
> I don't think clockintr_advance() should be called unconditionally.
The problem was in the dt(4) patch. Corrected patch posted in that
thread.
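
For reference, a minimal usage sketch of the interface documented in
the manpage below.  The example_* names and the 100ms period are made
up; the calls themselves are the documented ones:

#include <sys/clockintr.h>

struct clockintr example_clockintr;     /* in .bss, so already zeroed */

void
example_callback(struct clockrequest *cr, void *cf, void *arg)
{
        /* Runs at IPL_CLOCK on the host CPU; must not block. */
        /* Request the next expiration; fulfilled when we return. */
        clockrequest_advance(cr, 100000000ULL);         /* 100ms in ns */
}

void
example_start(struct cpu_info *ci)
{
        clockintr_bind(&example_clockintr, ci, example_callback, NULL);
        /* Arm the first expiration; later ones come from the callback. */
        clockintr_advance(&example_clockintr, 100000000ULL);
}

void
example_stop(void)
{
        /* Cancel, sever the binding, and wait out any running callback. */
        clockintr_unbind(&example_clockintr, CL_BARRIER);
}
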
Index: share/man/man9/clockintr_bind.9
===================================================================
RCS file: share/man/man9/clockintr_bind.9
diff -N share/man/man9/clockintr_bind.9
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ share/man/man9/clockintr_bind.9 4 Feb 2024 18:06:24 -0000
@@ -0,0 +1,296 @@
+.\" $OpenBSD$
+.\"
+.\" Copyright (c) 2023-2024 Scott Cheloha <cheloha@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate$
+.Dt CLOCKINTR_BIND 9
+.Os
+.Sh NAME
+.Nm clockintr_bind ,
+.Nm clockintr_schedule ,
+.Nm clockintr_advance ,
+.Nm clockintr_cancel ,
+.Nm clockintr_unbind ,
+.Nm clockintr_stagger ,
+.Nm clockrequest_advance
+.Nd execute a function in a clock interrupt context
+.Sh SYNOPSIS
+.In sys/clockintr.h
+.Ft void
+.Fo clockintr_bind
+.Fa "struct clockintr *cl"
+.Fa "struct cpu_info *cpu"
+.Fa "void (*callback)(struct clockrequest *cr, void *cf, void *arg)"
+.Fa "void *arg"
+.Fc
+.Ft void
+.Fo clockintr_schedule
+.Fa "struct clockintr *cl"
+.Fa "uint64_t abs"
+.Fc
+.Ft uint64_t
+.Fo clockintr_advance
+.Fa "struct clockintr *cl"
+.Fa "uint64_t interval"
+.Fc
+.Ft void
+.Fo clockintr_cancel
+.Fa "struct clockintr *cl"
+.Fc
+.Ft void
+.Fo clockintr_unbind
+.Fa "struct clockintr *cl"
+.Fa "uint32_t flags"
+.Fc
+.Ft void
+.Fo clockintr_stagger
+.Fa "struct clockintr *cl"
+.Fa "uint64_t interval"
+.Fa "uint32_t numer"
+.Fa "uint32_t denom"
+.Fc
+.Ft uint64_t
+.Fo clockrequest_advance
+.Fa "struct clockrequest *cr"
+.Fa "uint64_t interval"
+.Fc
+.\" .Fn clockrequest_advance_random is intentionally undocumented.
+.\" It may be removed in the future. New code should not use it.
+.Sh DESCRIPTION
+The clock interrupt subsystem schedules functions for asynchronous execution
+from the clock interrupt context on a particular CPU.
+.Pp
+Clock interrupts are well-suited for timekeeping,
+scheduling,
+and statistical profiling.
+Applications with more relaxed latency requirements should use timeouts
+to schedule asynchronous execution;
+see
+.Xr timeout_add 9
+for details.
+.Pp
+The
+.Fn clockintr_bind
+function initializes the clock interrupt object
+.Fa cl .
+When
+.Fa cl
+expires,
+its
+.Fa callback
+function is executed from the
+.Dv IPL_CLOCK
+context on its host
+.Fa cpu
+without any locks or mutexes.
+The callback function must not block.
+Its parameters are as follows:
+.Bl -tag -width indent
+.It Fa cr
+A private
+.Vt clockrequest
+object.
+May be used to request rescheduling;
+see
+.Fn clockrequest_advance
+below.
+.It Fa cf
+The
+.Fa cpu Ns 's
+current machine-dependent clockframe.
+.It Fa arg
+The
+.Fa arg
+given to
+.Fn clockintr_bind .
+.El
+.Pp
+The memory pointed to by
+.Fa cl
+must be zeroed before it is first bound.
+It is an error to use
+.Fa cl
+as argument to any other function in the
+.Vt clockintr
+API before it is bound.
+It is an error to rebind
+.Fa cl
+without first unbinding it;
+see
+.Fn clockintr_unbind
+below.
+.Pp
+The
+.Fn clockintr_schedule
+function schedules
+.Fa cl
+to expire at the absolute time
+.Fa abs
+on the system uptime clock.
+The subsystem will never execute
+.Fa cl Ns 's
+callback function before this expiration time,
+though its execution may be delayed by other activity on the system.
+.Pp
+The
+.Fn clockintr_advance
+function schedules
+.Fa cl
+to expire at the next terminus of the given
+.Fa interval ,
+a non-zero count of nanoseconds,
+relative to
+.Fa cl Ns 's
+current expiration time.
+Periodic clock interrupts should be scheduled with
+.Fn clockintr_advance
+to keep the execution period from drifting.
+.Pp
+The
+.Fn clockintr_cancel
+function cancels any pending expiration of
+.Fa cl .
+.Pp
+The
+.Fn clockintr_unbind
+function cancels any pending expiration of
+.Fa cl
+and severs the binding between
+.Fa cl
+and its host
+.Fa cpu .
+Upon return,
+.Fa cl
+may be rebound with
+.Fn clockintr_bind .
+The call may be configured with zero or more of the following
+.Fa flags :
+.Bl -tag -width CL_BARRIER
+.It Dv CL_BARRIER
+If
+.Fa cl Ns 's
+callback function is executing,
+block until it returns.
+By default,
+the caller does not block.
+Useful when
+.Fa arg
+is a shared reference.
+.El
+.Pp
+The
+.Fn clockintr_stagger
+function resets
+.Fa cl Ns 's
+expiration time to a fraction of the given
+.Fa interval ,
+a count of nanoseconds.
+Specifically,
+.Fa cl Ns 's
+expiration time is reset to
+.Pq Fa interval Ms / Fa denom Ms * Fa numer .
+Periodic clock interrupts bound to multiple CPUs may be staggered
+to reduce the likelihood that their callback functions will execute
+simultaneously and compete for a shared resource.
+It is an error if
+.Fa numer
+is greater than or equal to
+.Fa denom .
+It is an error if
+.Fa cl
+is already scheduled to expire.
+.Pp
+The
+.Fn clockrequest_advance
+function is similar to
+.Fn clockintr_advance ,
+except that
+(a) it may only be called during the execution of a
+.Fa callback
+function,
+(b) it accepts a
+.Vt clockrequest
+pointer as argument,
+and (c) scheduling requests submitted with the interface are not fulfilled
+until the callback function returns.
+When the callback function returns,
+scheduling requests are only committed to the underlying clock interrupt
+object if that object was not manipulated during the execution of the
+callback function.
+Otherwise,
+outstanding requests are discarded.
+.Sh CONTEXT
+The
+.Fn clockintr_bind
+function may only be called from process context.
+.Pp
+The
+.Fn clockintr_advance ,
+.Fn clockintr_cancel ,
+.Fn clockintr_schedule ,
+and
+.Fn clockintr_stagger
+functions may be called from process context or from interrupt context.
+.Pp
+The
+.Fn clockintr_unbind
+function may normally be called from process context or from interrupt context.
+However,
+if the
+.Dv CL_BARRIER
+flag is set in
+.Fa flags ,
+the function may only be called from process context.
+.Pp
+The
+.Fn clockrequest_advance
+function may only be called during execution of a
+.Fa callback
+function.
+.Sh RETURN VALUES
+The
+.Fn clockintr_advance
+and
+.Fn clockrequest_advance
+functions return the number of
+.Fa interval Ns s
+that have elapsed since
+.Fa cl
+was scheduled to expire,
+or zero if
+.Fa cl
+has not yet expired.
+.Sh CODE REFERENCES
+.Pa sys/kern/kern_clockintr.c
+.Sh SEE ALSO
+.Xr microtime 9 ,
+.Xr spl 9 ,
+.Xr timeout 9
+.Rs
+.%A Richard McDougall
+.%A Jim Mauro
+.%B Solaris Internals: Solaris 10 and OpenSolaris Kernel Architecture
+.%I Prentice Hall
+.%I Sun Microsystems Press
+.%D 2nd Edition, 2007
+.%P pp. 912\(en925
+.Re
+.Sh HISTORY
+The
+.Vt clockintr
+and
+.Vt clockrequest
+APIs first appeared in
+.Ox 7.5 .
Index: sys/sys/clockintr.h
===================================================================
RCS file: /cvs/src/sys/sys/clockintr.h,v
diff -u -p -r1.25 clockintr.h
--- sys/sys/clockintr.h 24 Jan 2024 19:23:38 -0000 1.25
+++ sys/sys/clockintr.h 4 Feb 2024 18:06:24 -0000
@@ -113,7 +113,8 @@ struct clockintr_queue {
#define CQ_INIT 0x00000001 /* clockintr_cpu_init() done */
#define CQ_INTRCLOCK 0x00000002 /* intrclock installed */
#define CQ_IGNORE_REQUEST 0x00000004 /* ignore callback requests */
-#define CQ_STATE_MASK 0x00000007
+#define CQ_NEED_WAKEUP 0x00000008 /* caller at barrier */
+#define CQ_STATE_MASK 0x0000000f
void clockintr_cpu_init(const struct intrclock *);
int clockintr_dispatch(void *);
@@ -123,12 +124,16 @@ void clockintr_trigger(void);
* Kernel API
*/
+#define CL_BARRIER 0x00000001 /* block if callback is running */
+#define CL_FLAG_MASK 0x00000001
+
uint64_t clockintr_advance(struct clockintr *, uint64_t);
void clockintr_bind(struct clockintr *, struct cpu_info *,
void (*)(struct clockrequest *, void *, void *), void *);
void clockintr_cancel(struct clockintr *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_stagger(struct clockintr *, uint64_t, uint32_t, uint32_t);
+void clockintr_unbind(struct clockintr *, uint32_t);
uint64_t clockrequest_advance(struct clockrequest *, uint64_t);
uint64_t clockrequest_advance_random(struct clockrequest *, uint64_t, uint32_t);
void clockqueue_init(struct clockintr_queue *);
Index: sys/kern/kern_clockintr.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
diff -u -p -r1.64 kern_clockintr.c
--- sys/kern/kern_clockintr.c 24 Jan 2024 19:23:38 -0000 1.64
+++ sys/kern/kern_clockintr.c 4 Feb 2024 18:06:24 -0000
@@ -31,6 +31,7 @@
#include <sys/sysctl.h>
#include <sys/time.h>
+void clockintr_cancel_locked(struct clockintr *);
void clockintr_hardclock(struct clockrequest *, void *, void *);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
@@ -226,6 +227,12 @@ clockintr_dispatch(void *frame)
CLR(request->cr_flags, CR_RESCHEDULE);
clockqueue_pend_insert(cq, cl, request->cr_expiration);
}
+ if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
+ CLR(cq->cq_flags, CQ_NEED_WAKEUP);
+ mtx_leave(&cq->cq_mtx);
+ wakeup(&cq->cq_running);
+ mtx_enter(&cq->cq_mtx);
+ }
run++;
}
@@ -317,9 +324,20 @@ void
clockintr_cancel(struct clockintr *cl)
{
struct clockintr_queue *cq = cl->cl_queue;
- int was_next;
mtx_enter(&cq->cq_mtx);
+ clockintr_cancel_locked(cl);
+ mtx_leave(&cq->cq_mtx);
+}
+
+void
+clockintr_cancel_locked(struct clockintr *cl)
+{
+ struct clockintr_queue *cq = cl->cl_queue;
+ int was_next;
+
+ MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
+
if (ISSET(cl->cl_flags, CLST_PENDING)) {
was_next = cl == TAILQ_FIRST(&cq->cq_pend);
clockqueue_pend_delete(cq, cl);
@@ -332,7 +350,6 @@ clockintr_cancel(struct clockintr *cl)
}
if (cl == cq->cq_running)
SET(cq->cq_flags, CQ_IGNORE_REQUEST);
- mtx_leave(&cq->cq_mtx);
}
void
@@ -341,13 +358,41 @@ clockintr_bind(struct clockintr *cl, str
{
struct clockintr_queue *cq = &ci->ci_queue;
+ splassert(IPL_NONE);
+
+ if (cl->cl_queue != NULL)
+ panic("%s: cl_queue is not NULL", __func__);
+
+ mtx_enter(&cq->cq_mtx);
cl->cl_arg = arg;
cl->cl_func = func;
cl->cl_queue = cq;
-
- mtx_enter(&cq->cq_mtx);
TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
mtx_leave(&cq->cq_mtx);
+}
+
+void
+clockintr_unbind(struct clockintr *cl, uint32_t flags)
+{
+ struct clockintr_queue *cq = cl->cl_queue;
+
+ KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
+
+ mtx_enter(&cq->cq_mtx);
+
+ clockintr_cancel_locked(cl);
+
+ cl->cl_arg = NULL;
+ cl->cl_func = NULL;
+ cl->cl_queue = NULL;
+ TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);
+
+ if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
+ SET(cq->cq_flags, CQ_NEED_WAKEUP);
+ msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
+ "clkbar", INFSLP);
+ } else
+ mtx_leave(&cq->cq_mtx);
}
void