Index | Thread | Search

From:
Kirill A. Korinsky <kirill@korins.ky>
Subject:
Re: sys/uvideo: avoid using queue.h
To:
Marcus Glocker <marcus@nazgul.ch>
Cc:
tech@openbsd.org
Date:
Sun, 16 Mar 2025 11:10:22 +0100

Download raw body.

Thread
On Sun, 16 Mar 2025 08:24:29 +0100,
Marcus Glocker <marcus@nazgul.ch> wrote:
>
> On Fri, Mar 14, 2025 at 02:16:23PM GMT, Kirill A. Korinsky wrote:
>
> > On Fri, 14 Mar 2025 14:10:01 +0100,
> > Kirill A. Korinsky <kirill@korins.ky> wrote:
> > >
> > > On Thu, 06 Mar 2025 09:36:46 +0100,
> > > Kirill A. Korinsky <kirill@korins.ky> wrote:
> > > >
> > > > tech@,
> > > >
> > > > macroses from queue.h doesn't thread safe and wraping it by mutex on
> > > > uvideo use case seems isn't so good idea.
> > > >
> > > > It has a hardcoded number of used buffers as 8 and already has a code
> > > > which iterates against them to find the buffer which is ready for
> > > > queueing in uvideo_mmap_getbuf.
> > > >
> > > > Here I'd like to replace SIMPLEQ_* by similar logic.
> > > >
> > > > It fixes two bugs:
> > > >
> > > > 1. If isochronous webcam sends two frames in one transfer,
> > > > SIMPLEQ_INSERT_TAIL adds the same element twice and creates infinity
> > > > loop inside sc_mmap_q which will be iterating in uvideo_vs_free_frame.
> > > >
> > > > 2. Prevents from calling SIMPLEQ_INSERT_TAIL and SIMPLEQ_REMOVE_HEAD for
> > > > the same element in the queue which may lead to unexpected state.
> > > >
> > > > This diff was suceffuly tested on:
> > > >  - Elgato Facecam Pro (isochronous)
> > > >  - Jabra PanaCast (bulk)
> > > >
> > >
> > > Here an alternative approach where I had implemented Multi Producer Multi
> > > Consumer, wait-free and fixed size FIFO queue which is thread safe.
> > >
> > > I had replaced both for-loop to it, and it works.
> > >
> > > It can be done faster by replacing % (reminder) by mask but it requires to
> > > change UVIDEO_MAX_BUFFERS to something in form 2^x - 1.
> > >
> > > Use 7 as UVIDEO_MAX_BUFFERS is safe in term of memory usage, but at least
> > > ffmpeg had 8 as hardcoded constant which changes its behaviour, if I recall
> > > rigth.
> > >
> > > Put 15 should be safe as well, but 4K frames allocates ~16 Mb memory and it
> > > turns to ~240Mb for mmap memory. It should be fine for amd64 and arm64, but
> > > I have no idea about other architectures.
> > >
> > > So, here reminder.
> > >
> > > Regarding the queue algorithm itself.
> > >
> > > I don't know who invented it on the first place, but this is a version which
> > > I had invented independently in 2016 or 2017 for some large Java app, and
> > > which is uses it under load for years without any knwon to me issues.
> > >
> > > So, it's fine to be licensed under ISC and as far as I know it isn't
> > > patented, but I haven't done any patent search.
> > >
> > > But it is quite trivial, and I assume that it is well known algorithm
> > > for someone.
> > >
> > > Any test and feedback is welcome.
> > >
> >
> > Ooops, I had sent a wrong diff. Here the right one.
>
> I'm not familiar with this specific queuing algorithm, but it seems to
> work :-)
>

Yeah, and it is quite obvious, just a two counters.

> I gave it a spin, and I can see that FIFO is working, and that it always
> iterates through all the buffers, also with lower frame rates.  This is
> in line with the behavior of the Linux driver, and is also an improvement
> compared to the current SIMPLEQ implementation, which is only consuming
> buffer zero, mostly.
>
> I will need to do some more testing.
>

Thanks! I, locally, test it as well with only
https://marc.info/?l=openbsd-tech&m=174180759929893&w=2

Everything quite stable on my usecases.

> Some initial in-line comments.
>
> > +#define UVIDEO_MMAP_QUEUE_SIZE	(UVIDEO_MAX_BUFFERS + 1)
> > +
> > +struct uvideo_mmap_queue_item {
> > +	unsigned long				 qi_seq;
> > +	int					 qi_idx;
> > +};
> > +
> > +struct uvideo_mmap_queue {
> > +	struct uvideo_mmap_queue_item		 q_items[UVIDEO_MMAP_QUEUE_SIZE];
>
> Line break, maybe remove the additional space indentations for these new
> structures.
>

Or I may follow style of uvideo_softc and call it as q_item.

At least fold(1) thinks it's ok.

>
> #include <sys/queue.h> should be removed as well.

Do you think it is worth to move FIFO queue into sys/queue.h?

Anyway, here an updated diff:

Index: sys/dev/usb/uvideo.c
===================================================================
RCS file: /home/cvs/src/sys/dev/usb/uvideo.c,v
diff -u -p -r1.252 uvideo.c
--- sys/dev/usb/uvideo.c	12 Mar 2025 14:08:51 -0000	1.252
+++ sys/dev/usb/uvideo.c	16 Mar 2025 10:05:04 -0000
@@ -24,6 +24,7 @@
 #include <sys/stat.h>
 #include <sys/kthread.h>
 #include <sys/stdint.h>
+#include <sys/atomic.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -50,6 +51,20 @@ int uvideo_debug = 1;
 #define byteof(x) ((x) >> 3)
 #define bitof(x)  (1L << ((x) & 0x7))
 
+/* queue needs at least one empty element, to avoid infinity loop */
+#define UVIDEO_MMAP_QUEUE_SIZE	(UVIDEO_MAX_BUFFERS + 1)
+
+struct uvideo_mmap_queue_item {
+	unsigned long				 qi_seq;
+	int					 qi_idx;
+};
+
+struct uvideo_mmap_queue {
+	struct uvideo_mmap_queue_item		 q_item[UVIDEO_MMAP_QUEUE_SIZE];
+	unsigned long				 q_head_seq;
+	unsigned long				 q_tail_seq;
+};
+
 struct uvideo_softc {
 	struct device				 sc_dev;
 	struct usbd_device			*sc_udev;
@@ -68,10 +83,12 @@ struct uvideo_softc {
 	struct uvideo_mmap			 sc_mmap[UVIDEO_MAX_BUFFERS];
 	uint8_t					*sc_mmap_buffer;
 	size_t					 sc_mmap_buffer_size;
-	q_mmap					 sc_mmap_q;
 	int					 sc_mmap_count;
 	int					 sc_mmap_flag;
 
+	struct uvideo_mmap_queue		 sc_mmap_q_in;
+	struct uvideo_mmap_queue		 sc_mmap_q_out;
+
 	struct vnode				*sc_vp;
 	struct usb_task				 sc_task_write;
 
@@ -173,6 +190,8 @@ void		uvideo_vs_decode_stream_header(str
 void		uvideo_vs_decode_stream_header_isight(struct uvideo_softc *,
 		    uint8_t *, int);
 void		uvideo_mmap_queue(struct uvideo_softc *, uint8_t *, int, int);
+int		uvideo_mmap_queue_dequeue(struct uvideo_mmap_queue *);
+int		uvideo_mmap_queue_enqueue(struct uvideo_mmap_queue *, int);
 void		uvideo_read(struct uvideo_softc *, uint8_t *, int);
 usbd_status	uvideo_usb_control(struct uvideo_softc *, uint8_t, uint8_t,
 		    uint16_t, uint8_t *, size_t);
@@ -669,8 +688,6 @@ uvideo_attach_hook(struct device *self)
 	if (error != USBD_NORMAL_COMPLETION)
 		return;
 
-	/* init mmap queue */
-	SIMPLEQ_INIT(&sc->sc_mmap_q);
 	sc->sc_mmap_count = 0;
 
 	DPRINTF(1, "uvideo_attach: doing video_attach_mi\n");
@@ -1960,9 +1977,6 @@ uvideo_vs_free_frame(struct uvideo_softc
 		sc->sc_mmap_buffer_size = 0;
 	}
 
-	while (!SIMPLEQ_EMPTY(&sc->sc_mmap_q))
-		SIMPLEQ_REMOVE_HEAD(&sc->sc_mmap_q, q_frames);
-
 	sc->sc_mmap_count = 0;
 }
 
@@ -2498,6 +2512,57 @@ uvideo_vs_decode_stream_header_isight(st
 	}
 }
 
+static inline int
+uvideo_mmap_queue_op(struct uvideo_mmap_queue_item *items,
+    unsigned long *seq, int dequeue, int idx) {
+	long diff;
+	unsigned long s;
+	struct uvideo_mmap_queue_item *item;
+
+	for (;;) {
+		s = atomic_load_long(seq);
+		item = &items[(int)(s % UVIDEO_MMAP_QUEUE_SIZE)];
+		diff = item->qi_seq - (s + dequeue);
+
+		if (diff == 0) {
+			/* item can be locked */
+			if (atomic_cas_ulong(seq, s, s + 1) != s)
+				continue;
+
+			/* it is locked */
+
+			if (dequeue) {
+				idx = item->qi_idx;
+				item->qi_seq = s + UVIDEO_MMAP_QUEUE_SIZE;
+				return (idx);
+			} else {
+				item->qi_idx = idx;
+				item->qi_seq++;
+				return (0);
+			}
+		}
+
+		if (diff < 0) {
+			/* queue is full */
+			return (-1);
+		}
+
+		/* diff > 0 - means another thread moved sequence */
+	}
+}
+
+int
+uvideo_mmap_queue_dequeue(struct uvideo_mmap_queue *q)
+{
+	return uvideo_mmap_queue_op(q->q_item, &q->q_tail_seq, 1, -1);
+}
+
+int
+uvideo_mmap_queue_enqueue(struct uvideo_mmap_queue *q, int idx)
+{
+	return uvideo_mmap_queue_op(q->q_item, &q->q_head_seq, 0, idx);
+}
+
 void
 uvideo_mmap_queue(struct uvideo_softc *sc, uint8_t *buf, int len, int err)
 {
@@ -2507,11 +2572,9 @@ uvideo_mmap_queue(struct uvideo_softc *s
 		panic("%s: mmap buffers not allocated", __func__);
 
 	/* find a buffer which is ready for queueing */
-	for (i = 0; i < sc->sc_mmap_count; i++) {
-		if (sc->sc_mmap[i].v4l2_buf.flags & V4L2_BUF_FLAG_QUEUED)
-			break;
-	}
-	if (i == sc->sc_mmap_count) {
+	i = uvideo_mmap_queue_dequeue(&sc->sc_mmap_q_in);
+
+	if (i < 0) {
 		DPRINTF(1, "%s: %s: mmap queue is full!\n",
 		    DEVNAME(sc), __func__);
 		return;
@@ -2537,7 +2600,10 @@ uvideo_mmap_queue(struct uvideo_softc *s
 	/* queue it */
 	sc->sc_mmap[i].v4l2_buf.flags |= V4L2_BUF_FLAG_DONE;
 	sc->sc_mmap[i].v4l2_buf.flags &= ~V4L2_BUF_FLAG_QUEUED;
-	SIMPLEQ_INSERT_TAIL(&sc->sc_mmap_q, &sc->sc_mmap[i], q_frames);
+
+	if (uvideo_mmap_queue_enqueue(&sc->sc_mmap_q_out, i))
+		panic("%s: mmap queue overflow", __func__);
+
 	DPRINTF(2, "%s: %s: frame queued on index %d\n",
 	    DEVNAME(sc), __func__, i);
 
@@ -3553,6 +3619,20 @@ uvideo_reqbufs(void *v, struct v4l2_requ
 		    sc->sc_mmap[i].v4l2_buf.length);
 	}
 
+	/* reset queues */
+	for (i = 0; i < UVIDEO_MMAP_QUEUE_SIZE; i++) {
+		sc->sc_mmap_q_in.q_item[i].qi_seq = i;
+		sc->sc_mmap_q_in.q_item[i].qi_idx = -1;
+
+		sc->sc_mmap_q_out.q_item[i].qi_seq = i;
+		sc->sc_mmap_q_out.q_item[i].qi_idx = -1;
+	}
+
+	sc->sc_mmap_q_in.q_head_seq = 0;
+	sc->sc_mmap_q_in.q_tail_seq = 0;
+	sc->sc_mmap_q_out.q_head_seq = 0;
+	sc->sc_mmap_q_out.q_tail_seq = 0;
+
 	/* tell how many buffers we have really allocated */
 	rb->count = sc->sc_mmap_count;
 
@@ -3586,6 +3666,7 @@ uvideo_querybuf(void *v, struct v4l2_buf
 int
 uvideo_qbuf(void *v, struct v4l2_buffer *qb)
 {
+	int flags;
 	struct uvideo_softc *sc = v;
 
 	if (qb->type != V4L2_BUF_TYPE_VIDEO_CAPTURE ||
@@ -3593,9 +3674,15 @@ uvideo_qbuf(void *v, struct v4l2_buffer 
 	    qb->index >= sc->sc_mmap_count)
 		return (EINVAL);
 
+	flags = sc->sc_mmap[qb->index].v4l2_buf.flags;
 	sc->sc_mmap[qb->index].v4l2_buf.flags &= ~V4L2_BUF_FLAG_DONE;
 	sc->sc_mmap[qb->index].v4l2_buf.flags |= V4L2_BUF_FLAG_QUEUED;
 
+	if (uvideo_mmap_queue_enqueue(&sc->sc_mmap_q_in, qb->index)) {
+		sc->sc_mmap[qb->index].v4l2_buf.flags = flags;
+		return (EINVAL);
+	}
+
 	DPRINTF(2, "%s: %s: buffer on index %d ready for queueing\n",
 	    DEVNAME(sc), __func__, qb->index);
 
@@ -3605,24 +3692,26 @@ uvideo_qbuf(void *v, struct v4l2_buffer 
 int
 uvideo_dqbuf(void *v, struct v4l2_buffer *dqb)
 {
-	struct uvideo_softc *sc = v;
 	struct uvideo_mmap *mmap;
-	int error;
+	struct uvideo_softc *sc = v;
+	int error, i;
 
 	if (dqb->type != V4L2_BUF_TYPE_VIDEO_CAPTURE ||
 	    dqb->memory != V4L2_MEMORY_MMAP)
 		return (EINVAL);
 
-	if (SIMPLEQ_EMPTY(&sc->sc_mmap_q)) {
+again:
+	i = uvideo_mmap_queue_dequeue(&sc->sc_mmap_q_out);
+
+	if (i < 0) {
 		/* mmap queue is empty, block until first frame is queued */
 		error = tsleep_nsec(sc, 0, "vid_mmap", SEC_TO_NSEC(10));
 		if (error)
 			return (EINVAL);
+		goto again;
 	}
 
-	mmap = SIMPLEQ_FIRST(&sc->sc_mmap_q);
-	if (mmap == NULL)
-		panic("uvideo_dqbuf: NULL pointer!");
+	mmap = &sc->sc_mmap[i];
 
 	bcopy(&mmap->v4l2_buf, dqb, sizeof(struct v4l2_buffer));
 
@@ -3631,7 +3720,6 @@ uvideo_dqbuf(void *v, struct v4l2_buffer
 
 	DPRINTF(2, "%s: %s: frame dequeued from index %d\n",
 	    DEVNAME(sc), __func__, mmap->v4l2_buf.index);
-	SIMPLEQ_REMOVE_HEAD(&sc->sc_mmap_q, q_frames);
 
 	return (0);
 }
Index: sys/dev/usb/uvideo.h
===================================================================
RCS file: /home/cvs/src/sys/dev/usb/uvideo.h,v
diff -u -p -r1.64 uvideo.h
--- sys/dev/usb/uvideo.h	26 Feb 2025 20:50:46 -0000	1.64
+++ sys/dev/usb/uvideo.h	16 Mar 2025 10:00:52 -0000
@@ -17,7 +17,6 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
-#include <sys/queue.h>
 #include <sys/videoio.h>
 
 /*
@@ -553,11 +552,9 @@ struct uvideo_frame_buffer {
  */
 #define UVIDEO_MAX_BUFFERS	8
 struct uvideo_mmap {
-	SIMPLEQ_ENTRY(uvideo_mmap)	q_frames;
 	uint8_t				*buf;
 	struct v4l2_buffer		 v4l2_buf;
 };
-typedef SIMPLEQ_HEAD(, uvideo_mmap) q_mmap;
 
 struct uvideo_format_group {
 	uint32_t				 pixelformat;