From: Martin Pieuchot
Subject: Re: clockintr: add clockintr_unbind()
To: Scott Cheloha
Cc: tech@openbsd.org, claudio@openbsd.org, mlarkin@openbsd.org
Date: Mon, 5 Feb 2024 17:02:31 +0100

On 26/01/24(Fri) 12:27, Scott Cheloha wrote:
> The last thing needed before we can move dt(4) out of the hardclock is
> a way to "destroy" a clockintr.
>
> This patch adds clockintr_unbind().  It is essentially the opposite of
> clockintr_bind().  An updated manpage documenting the interface is
> included with this patch.
>
> I'm sending out the dt(4) patch separately.  Take a look at that to
> see how the interface is used in practice.
>
> Things of note:
>
> - Both clockintr_cancel() and clockintr_unbind() need to cancel the
>   given clockintr, so this patch refactors the bulk of
>   clockintr_cancel() into clockintr_cancel_locked(), where both
>   interfaces can share the code.
>
> - clockintr_cancel() does not block, so for behavioral symmetry
>   clockintr_unbind() does not block by default.  The caller may pass
>   the CL_BARRIER flag to set an execution barrier.  If CL_BARRIER is
>   passed and the clockintr's callback is executing, the caller blocks
>   until the callback returns.  We need the execution barrier for dt(4).
>
> - Now that you can "destroy" a clockintr, there is room for error
>   when using the API.  In an effort to catch mistakes, e.g. double
>   binds, we now panic in clockintr_bind() if cl_queue is not NULL.
>   Note that clockintr_unbind() resets cl_queue to NULL, so correct
>   API use will not panic.
>
> The only sort-of tricky thing here is the execution barrier.
> It goes like this:
>
> 1. If the caller passes CL_BARRIER to clockintr_unbind() and the
>    clockintr's callback is executing at that moment, the caller sets
>    CQ_NEED_WAKEUP and calls msleep_nsec().  There is no more work for
>    the caller to do, so we pass PNORELOCK.
>
> 2. When the callback function returns to clockintr_dispatch(), the
>    thread sees CQ_NEED_WAKEUP, clears it, and sends a wakeup.
>
> 3. The caller is awoken, returns from msleep_nsec(), and then
>    immediately returns from clockintr_unbind().
>
> Note that clockintr_cancel_locked() sets CQ_IGNORE_REQUEST before the
> caller sleeps during step (1), so clockintr_dispatch() does not touch
> the clockintr object during step (2).

Running the btrace(8) regression tests with these two diffs gets me the
following panic:

uvm_fault(0xfffffd8160028168, 0x20, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at      mtx_enter_try+0x42:     movl    0x8(%rdi),%edi
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
*197451  64462      0  0x18000003          0    1K  btrace
mtx_enter_try(18) at mtx_enter_try+0x42
mtx_enter(18) at mtx_enter+0x35
clockintr_advance(ffff8000007bc860,0) at clockintr_advance+0x36
dt_ioctl_record_start(ffff8000007a1a00) at dt_ioctl_record_start+0x8b
VOP_IOCTL(fffffd8136693a70,80044403,ffff80002d154600,1,fffffd817f7c55b0,ffff80002d0702a8) at VOP_IOCTL+0x5d
vn_ioctl(fffffd815f8353d0,80044403,ffff80002d154600,ffff80002d0702a8) at vn_ioctl+0x79
sys_ioctl(ffff80002d0702a8,ffff80002d1547d0,ffff80002d154740) at sys_ioctl+0x2c4
syscall(ffff80002d1547d0) at syscall+0x55b
Xsyscall() at Xsyscall+0x128

I don't think clockintr_advance() should be called unconditionally.

>
> ok?
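To make the intended usage concrete, here is a rough sketch of a hypothetical
per-CPU sampler driving the interface documented in the manpage below.  Only
the clockintr/clockrequest functions and the CL_BARRIER flag are real
identifiers; struct sampler, sampler_fire(), sampler_start(), sampler_kick(),
sampler_stop() and the sp_* fields are made-up names for illustration, and
this is not the dt(4) diff.  The sp_bound flag is the kind of guard I mean:
clockintr_advance() and clockintr_unbind() are only reached for a clockintr
that this code has actually bound.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/clockintr.h>

struct sampler {
	struct clockintr	sp_clockintr;	/* zeroed before the first bind */
	uint64_t		sp_period;	/* sampling period, in nanoseconds */
	int			sp_bound;	/* nonzero once clockintr_bind() ran */
	/* ... sample buffer, counters, ... */
};

/* Runs at IPL_CLOCK on the host CPU; must not block. */
void
sampler_fire(struct clockrequest *cr, void *cf, void *arg)
{
	struct sampler *sp = arg;

	/* ... record a sample from the clockframe cf ... */

	/* Request the next expiration; committed when the callback returns. */
	clockrequest_advance(cr, sp->sp_period);
}

/* Process context only: clockintr_bind() asserts IPL_NONE. */
void
sampler_start(struct sampler *sp, struct cpu_info *ci)
{
	memset(&sp->sp_clockintr, 0, sizeof(sp->sp_clockintr));
	clockintr_bind(&sp->sp_clockintr, ci, sampler_fire, sp);
	sp->sp_bound = 1;

	/* Arm the first expiration one period from now. */
	clockintr_advance(&sp->sp_clockintr, sp->sp_period);
}

/* Re-arm from e.g. an ioctl path; harmless if the sampler is idle. */
void
sampler_kick(struct sampler *sp)
{
	/* Never advance a clockintr that was not bound. */
	if (sp->sp_bound)
		clockintr_advance(&sp->sp_clockintr, sp->sp_period);
}

/* Process context only because of CL_BARRIER. */
void
sampler_stop(struct sampler *sp)
{
	if (!sp->sp_bound)
		return;

	/*
	 * Cancel any pending expiration and, if the callback is running
	 * right now, sleep until it returns.
	 */
	clockintr_unbind(&sp->sp_clockintr, CL_BARRIER);
	sp->sp_bound = 0;

	/* Nothing can still dereference sp from interrupt context now. */
}

The CL_BARRIER unbind in sampler_stop() is what makes it safe to free the
shared arg afterwards: once clockintr_unbind() returns, no callback can still
be running with a reference to it.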
>
> Index: share/man/man9/clockintr_bind.9
> ===================================================================
> RCS file: share/man/man9/clockintr_bind.9
> diff -N share/man/man9/clockintr_bind.9
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ share/man/man9/clockintr_bind.9	26 Jan 2024 17:40:48 -0000
> @@ -0,0 +1,296 @@
> +.\" $OpenBSD$
> +.\"
> +.\" Copyright (c) 2023-2024 Scott Cheloha
> +.\"
> +.\" Permission to use, copy, modify, and distribute this software for any
> +.\" purpose with or without fee is hereby granted, provided that the above
> +.\" copyright notice and this permission notice appear in all copies.
> +.\"
> +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> +.\"
> +.Dd $Mdocdate$
> +.Dt CLOCKINTR_BIND 9
> +.Os
> +.Sh NAME
> +.Nm clockintr_bind ,
> +.Nm clockintr_schedule ,
> +.Nm clockintr_advance ,
> +.Nm clockintr_cancel ,
> +.Nm clockintr_unbind ,
> +.Nm clockintr_stagger ,
> +.Nm clockrequest_advance
> +.Nd execute a function in a clock interrupt context
> +.Sh SYNOPSIS
> +.In sys/clockintr.h
> +.Ft void
> +.Fo clockintr_bind
> +.Fa "struct clockintr *cl"
> +.Fa "struct cpu_info *cpu"
> +.Fa "void (*callback)(struct clockrequest *cr, void *cf, void *arg)"
> +.Fa "void *arg"
> +.Fc
> +.Ft void
> +.Fo clockintr_schedule
> +.Fa "struct clockintr *cl"
> +.Fa "uint64_t abs"
> +.Fc
> +.Ft uint64_t
> +.Fo clockintr_advance
> +.Fa "struct clockintr *cl"
> +.Fa "uint64_t interval"
> +.Fc
> +.Ft void
> +.Fo clockintr_cancel
> +.Fa "struct clockintr *cl"
> +.Fc
> +.Ft void
> +.Fo clockintr_unbind
> +.Fa "struct clockintr *cl"
> +.Fa "uint32_t flags"
> +.Fc
> +.Ft void
> +.Fo clockintr_stagger
> +.Fa "struct clockintr *cl"
> +.Fa "uint64_t interval"
> +.Fa "uint32_t numer"
> +.Fa "uint32_t denom"
> +.Fc
> +.Ft uint64_t
> +.Fo clockrequest_advance
> +.Fa "struct clockrequest *cr"
> +.Fa "uint64_t interval"
> +.Fc
> +.\" .Fn clockrequest_advance_random is intentionally undocumented.
> +.\" It may be removed in the future.  New code should not use it.
> +.Sh DESCRIPTION
> +The clock interrupt subsystem schedules functions for asynchronous execution
> +from the clock interrupt context on a particular CPU.
> +.Pp
> +Clock interrupts are well-suited for timekeeping,
> +scheduling,
> +and statistical profiling.
> +Applications with more relaxed latency requirements should use timeouts
> +to schedule asynchronous execution;
> +see
> +.Xr timeout_add 9
> +for details.
> +.Pp
> +The
> +.Fn clockintr_bind
> +function initializes the clock interrupt object
> +.Fa cl .
> +When
> +.Fa cl
> +expires,
> +its
> +.Fa callback
> +function is executed from the
> +.Dv IPL_CLOCK
> +context on its host
> +.Fa cpu
> +without any locks or mutexes.
> +The callback function must not block.
> +Its parameters are as follows:
> +.Bl -tag -width indent
> +.It Fa cr
> +A private
> +.Vt clockrequest
> +object.
> +May be used to request rescheduling;
> +see
> +.Fn clockrequest_advance
> +below.
> +.It Fa cf
> +The
> +.Fa cpu Ns 's
> +current machine-dependent clockframe.
> +.It Fa arg
> +The
> +.Fa arg
> +given to
> +.Fn clockintr_bind .
> +.El
> +.Pp
> +The memory pointed to by
> +.Fa cl
> +must be zeroed before it is first bound.
> +It is an error to use
> +.Fa cl
> +as argument to any other function in the
> +.Vt clockintr
> +API before it is bound.
> +It is an error to rebind
> +.Fa cl
> +without first unbinding it;
> +see
> +.Fn clockintr_unbind
> +below.
> +.Pp
> +The
> +.Fn clockintr_schedule
> +function schedules
> +.Fa cl
> +to expire at the absolute time
> +.Fa abs
> +on the system uptime clock.
> +The subsystem will never execute
> +.Fa cl Ns 's
> +callback function before this expiration time,
> +though its execution may be delayed by other activity on the system.
> +.Pp
> +The
> +.Fn clockintr_advance
> +function schedules
> +.Fa cl
> +to expire at the next terminus of the given
> +.Fa interval ,
> +a non-zero count of nanoseconds,
> +relative to
> +.Fa cl Ns 's
> +current expiration time.
> +Periodic clock interrupts should be scheduled with
> +.Fn clockintr_advance
> +to keep the execution period from drifting.
> +.Pp
> +The
> +.Fn clockintr_cancel
> +function cancels any pending expiration of
> +.Fa cl .
> +.Pp
> +The
> +.Fn clockintr_unbind
> +function cancels any pending expiration of
> +.Fa cl
> +and severs the binding between
> +.Fa cl
> +and its host
> +.Fa cpu .
> +Upon return,
> +.Fa cl
> +may be rebound with
> +.Fn clockintr_bind .
> +The call may be configured with zero or more of the following
> +.Fa flags :
> +.Bl -tag -width CL_BARRIER
> +.It Dv CL_BARRIER
> +If
> +.Fa cl Ns 's
> +callback function is executing,
> +block until it returns.
> +By default,
> +the caller does not block.
> +Useful when
> +.Fa arg
> +is a shared reference.
> +.El
> +.Pp
> +The
> +.Fn clockintr_stagger
> +function resets
> +.Fa cl Ns 's
> +expiration time to a fraction of the given
> +.Fa interval ,
> +a count of nanoseconds.
> +Specifically,
> +.Fa cl Ns 's
> +expiration time is reset to
> +.Pq Fa interval Ms / Fa denom Ms * Fa numer .
> +Periodic clock interrupts bound to multiple CPUs may be staggered
> +to reduce the likelihood that their callback functions will execute
> +simultaneously and compete for a shared resource.
> +It is an error if
> +.Fa numer
> +is greater than or equal to
> +.Fa denom .
> +It is an error if
> +.Fa cl
> +is already scheduled to expire.
> +.Pp
> +The
> +.Fn clockrequest_advance
> +function is similar to
> +.Fn clockintr_advance ,
> +except that
> +(a) it may only be called during the execution of a
> +.Fa callback
> +function,
> +(b) it accepts a
> +.Vt clockrequest
> +pointer as argument,
> +and (c) scheduling requests submitted with the interface are not fulfilled
> +until the callback function returns.
> +When the callback function returns,
> +scheduling requests are only committed to the underlying clock interrupt
> +object if that object was not manipulated during the execution of the
> +callback function.
> +Otherwise,
> +outstanding requests are discarded.
> +.Sh CONTEXT
> +The
> +.Fn clockintr_bind
> +function may only be called from process context.
> +.Pp
> +The
> +.Fn clockintr_advance ,
> +.Fn clockintr_cancel ,
> +.Fn clockintr_schedule ,
> +and
> +.Fn clockintr_stagger
> +functions may be called from process context or from interrupt context.
> +.Pp
> +The
> +.Fn clockintr_unbind
> +function may normally be called from process context or from interrupt context.
> +However,
> +if the
> +.Dv CL_BARRIER
> +flag is set in
> +.Fa flags ,
> +the function may only be called from process context.
> +.Pp
> +The
> +.Fn clockrequest_advance
> +function may only be called during execution of a
> +.Fa callback
> +function.
> +.Sh RETURN VALUES
> +The
> +.Fn clockintr_advance
> +and
> +.Fn clockrequest_advance
> +functions return the number of
> +.Fa interval Ns s
> +that have elapsed since
> +.Fa cl
> +was scheduled to expire,
> +or zero if
> +.Fa cl
> +has not yet expired.
> +.Sh CODE REFERENCES
> +.Pa sys/kern/kern_clockintr.c
> +.Sh SEE ALSO
> +.Xr microtime 9 ,
> +.Xr spl 9 ,
> +.Xr timeout 9
> +.Rs
> +.%A Richard McDougall
> +.%A Jim Mauro
> +.%B Solaris Internals: Solaris 10 and OpenSolaris Kernel Architecture
> +.%I Prentice Hall
> +.%I Sun Microsystems Press
> +.%D 2nd Edition, 2007
> +.%P pp. 912\(en925
> +.Re
> +.Sh HISTORY
> +The
> +.Vt clockintr
> +and
> +.Vt clockrequest
> +APIs first appeared in
> +.Ox 7.5 .
> Index: sys/sys/clockintr.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/clockintr.h,v
> diff -u -p -r1.25 clockintr.h
> --- sys/sys/clockintr.h	24 Jan 2024 19:23:38 -0000	1.25
> +++ sys/sys/clockintr.h	26 Jan 2024 17:40:48 -0000
> @@ -113,7 +113,8 @@ struct clockintr_queue {
> #define CQ_INIT			0x00000001	/* clockintr_cpu_init() done */
> #define CQ_INTRCLOCK		0x00000002	/* intrclock installed */
> #define CQ_IGNORE_REQUEST	0x00000004	/* ignore callback requests */
> -#define CQ_STATE_MASK		0x00000007
> +#define CQ_NEED_WAKEUP		0x00000008	/* caller at barrier */
> +#define CQ_STATE_MASK		0x0000000f
>
> void clockintr_cpu_init(const struct intrclock *);
> int clockintr_dispatch(void *);
> @@ -123,12 +124,16 @@ void clockintr_trigger(void);
>  * Kernel API
>  */
>
> +#define CL_BARRIER	0x00000001	/* block if callback is running */
> +#define CL_FLAG_MASK	0x00000001
> +
> uint64_t clockintr_advance(struct clockintr *, uint64_t);
> void clockintr_bind(struct clockintr *, struct cpu_info *,
>     void (*)(struct clockrequest *, void *, void *), void *);
> void clockintr_cancel(struct clockintr *);
> void clockintr_schedule(struct clockintr *, uint64_t);
> void clockintr_stagger(struct clockintr *, uint64_t, uint32_t, uint32_t);
> +void clockintr_unbind(struct clockintr *, uint32_t);
> uint64_t clockrequest_advance(struct clockrequest *, uint64_t);
> uint64_t clockrequest_advance_random(struct clockrequest *, uint64_t, uint32_t);
> void clockqueue_init(struct clockintr_queue *);
> Index: sys/kern/kern_clockintr.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
> diff -u -p -r1.64 kern_clockintr.c
> --- sys/kern/kern_clockintr.c	24 Jan 2024 19:23:38 -0000	1.64
> +++ sys/kern/kern_clockintr.c	26 Jan 2024 17:40:48 -0000
> @@ -31,6 +31,7 @@
> #include
> #include
>
> +void clockintr_cancel_locked(struct clockintr *);
> void clockintr_hardclock(struct clockrequest *, void *, void *);
> void clockintr_schedule_locked(struct clockintr *, uint64_t);
> void clockqueue_intrclock_install(struct clockintr_queue *,
> @@ -226,6 +227,12 @@ clockintr_dispatch(void *frame)
> 			CLR(request->cr_flags, CR_RESCHEDULE);
> 			clockqueue_pend_insert(cq, cl, request->cr_expiration);
> 		}
> +		if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
> +			CLR(cq->cq_flags, CQ_NEED_WAKEUP);
> +			mtx_leave(&cq->cq_mtx);
> +			wakeup(&cq->cq_running);
> +			mtx_enter(&cq->cq_mtx);
> +		}
> 		run++;
> 	}
>
> @@ -317,9 +324,20 @@ void
> clockintr_cancel(struct clockintr *cl)
> {
> 	struct clockintr_queue *cq = cl->cl_queue;
> -	int was_next;
>
> 	mtx_enter(&cq->cq_mtx);
> +	clockintr_cancel_locked(cl);
> +	mtx_leave(&cq->cq_mtx);
> +}
> +
> +void
> +clockintr_cancel_locked(struct clockintr *cl)
> +{
> +	struct clockintr_queue *cq = cl->cl_queue;
> +	int was_next;
> +
> +	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
> +
> 	if (ISSET(cl->cl_flags, CLST_PENDING)) {
> 		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
> 		clockqueue_pend_delete(cq, cl);
> @@ -332,7 +350,6 @@ clockintr_cancel(struct clockintr *cl)
> 	}
> 	if (cl == cq->cq_running)
> 		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
> -	mtx_leave(&cq->cq_mtx);
> }
>
> void
> @@ -341,13 +358,41 @@ clockintr_bind(struct clockintr *cl, str
> {
> 	struct clockintr_queue *cq = &ci->ci_queue;
>
> +	splassert(IPL_NONE);
> +
> +	if (cl->cl_queue != NULL)
> +		panic("%s: cl_queue is not NULL", __func__);
> +
> +	mtx_enter(&cq->cq_mtx);
> 	cl->cl_arg = arg;
> 	cl->cl_func = func;
> 	cl->cl_queue = cq;
> -
> -	mtx_enter(&cq->cq_mtx);
> 	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
> 	mtx_leave(&cq->cq_mtx);
> +}
> +
> +void
> +clockintr_unbind(struct clockintr *cl, uint32_t flags)
> +{
> +	struct clockintr_queue *cq = cl->cl_queue;
> +
> +	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
> +
> +	mtx_enter(&cq->cq_mtx);
> +
> +	clockintr_cancel_locked(cl);
> +
> +	cl->cl_arg = NULL;
> +	cl->cl_func = NULL;
> +	cl->cl_queue = NULL;
> +	TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);
> +
> +	if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
> +		SET(cq->cq_flags, CQ_NEED_WAKEUP);
> +		msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
> +		    "clkbar", INFSLP);
> +	} else
> +		mtx_leave(&cq->cq_mtx);
> }
>
> void