Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: tcp(4): use per-sockbuf mutex to protect `so_snd' socket buffer
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
OpenBSD tech <tech@openbsd.org>
Date:
Tue, 24 Dec 2024 22:21:55 +0300

Download raw body.

Thread
On Tue, Dec 24, 2024 at 08:53:14PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Dec 24, 2024 at 07:59:38PM +0300, Vitaliy Makkoveev wrote:
> > > On 24 Dec 2024, at 19:49, Alexander Bluhm <alexander.bluhm@gmx.net> wrote:
> > > 
> > > On Sat, Dec 21, 2024 at 08:16:05AM +0300, Vitaliy Makkoveev wrote:
> > >> This diff only unlocks sosend() path, all the rest is still locked
> > >> exclusively.
> > >> 
> > >> Even for tcp(4) case, sosend() only checks `so_snd' free space and
> > >> sleeps if necessary, actual buffer handling happening within exclusively
> > >> locked PCB layer. In the PCB layer we don't need to protect read-only
> > >> access to `so_snd' because corresponding read-write access is serialized
> > >> by socket lock.
> > >> 
> > >> sosend() needs to be serialized with somove(), so sotask() takes
> > >> sblock() on `so_snd' of spliced socket. This discards previous
> > >> protection of tcp(4) spliced sockets where solock() was used to prevent
> > >> concurrent unsplicing. This scheme was used to avoid sleep in sofree().
> > >> 
> > >> However, sleep is sofree() is possible. We have two cases:
> > >> 
> > >> 1. Socket was not yet accept(2)ed. Such sockets can't be accessed from
> > >> the userland and can't be spliced. sofree() could be called only from
> > >> PCB layer and don't need to sleep.
> > >> 
> > >> 2. Socket was accepted. While called form PCB layer, sofree() ignores it
> > >> because SS_NOFDREF bit is not set. Socket remains spliced, but without
> > >> PCB. Sockets without PCB can't be accessed from PCB layer, only from
> > >> userland. So, soclose()/sofree() thread needs to wait concurrent
> > >> soclose()/sofree() thread for spliced socket as is is already done for
> > >> udp(4) case. In such case it is safe to release solock() and sleep
> > >> within sofree().
> > >> 
> > >> Note, I intentionally left all "if (dosolock)" dances as is.
> > > 
> > > While running your diff though my performnce tests, I got this panic
> > > twice.  I have never seen it before, so I guess it is related.
> > > 
> > > panic: softclock: invalid to_clock: 19668320
> > > Stopped at      db_enter+0x14:  popq    %rbp
> > >    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
> > > 120146  91986     89   0x1100012          0    1  relayd
> > >  65124  13759     89   0x1100012          0    3  relayd
> > >  33160  19438      0     0x14000      0x200    2  softnet0
> > > db_enter() at db_enter+0x14
> > > panic(ffffffff82883be3) at panic+0xdd
> > > softclock(0) at softclock+0x1d5
> > > softintr_dispatch(0) at softintr_dispatch+0xe6
> > > Xsoftclock() at Xsoftclock+0x27
> > > acpicpu_idle() at acpicpu_idle+0x2b9
> > > sched_idle(ffffffff83593ff0) at sched_idle+0x298
> > > end trace frame: 0x0, count: 8
> > > https://www.openbsd.org/ddb.html describes the minimum info required in bug
> > > reports.  Insufficient info makes it difficult to find and fix bugs.
> > > 
> > > ddb{0}> x/s version
> > > version:        OpenBSD 7.6-current (GENERIC.MP) #cvs : D2024.12.23.00.00.00: Tue Dec 24 14:53:39 CET 2024
> > >    root@ot15.obsd-lab.genua.de:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> > > 
> > > ddb{0}> show panic
> > > *cpu0: softclock: invalid to_clock: 19668320
> > 
> > Interesting. Which test triggers this? I suspect sosplice.
> 
> This diff contains only sosplice() relared hunks. Regardless `so_snd'
> unlocking, our current sosend() releases solock() around uiomove(), so
> concurrent somove() thread could break us. This diff prevents this by
> taking sblock() on `so_snd' of spliced socket.
> 
> I performed couple of kern/sosplice runs without any issue. Alexander,
> could you test it too? I suspect the problem could be in timeout
> reinitialisation in sofree(), but I can't trigger it during regress run.
> 

This is the updated diff. It uses barriers instead of `ssp_idleto'
timeout and `ssp_task' task redefinition.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.347 uipc_socket.c
--- sys/kern/uipc_socket.c	19 Dec 2024 22:11:35 -0000	1.347
+++ sys/kern/uipc_socket.c	24 Dec 2024 19:12:56 -0000
@@ -62,8 +62,6 @@ int	sosplice(struct socket *, int, off_t
 void	sounsplice(struct socket *, struct socket *, int);
 void	soidle(void *);
 void	sotask(void *);
-void	soreaper(void *);
-void	soput(void *);
 int	somove(struct socket *, int);
 void	sorflush(struct socket *);
 
@@ -327,17 +325,14 @@ sofree(struct socket *so, int keep_lock)
 			sounlock(head);
 	}
 
-	switch (so->so_proto->pr_domain->dom_family) {
-	case AF_INET:
-	case AF_INET6:
-		if (so->so_proto->pr_type == SOCK_STREAM)
-			break;
-		/* FALLTHROUGH */
-	default:
+	if (!keep_lock) {
+		/*
+		 * sofree() was called from soclose(). Sleep is safe
+		 * even for tcp(4) sockets.
+		 */
 		sounlock(so);
 		refcnt_finalize(&so->so_refcnt, "sofinal");
 		solock(so);
-		break;
 	}
 
 	sigio_free(&so->so_sigio);
@@ -358,20 +353,20 @@ sofree(struct socket *so, int keep_lock)
 		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 	m_purge(so->so_rcv.sb_mb);
 
-	if (!keep_lock)
+	if (!keep_lock) {
 		sounlock(so);
 
 #ifdef SOCKET_SPLICE
-	if (so->so_sp) {
-		/* Reuse splice idle, sounsplice() has been called before. */
-		timeout_set_flags(&so->so_sp->ssp_idleto, soreaper, so,
-		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
-		timeout_add(&so->so_sp->ssp_idleto, 0);
-	} else
+		if (so->so_sp) {
+			timeout_del_barrier(&so->so_sp->ssp_idleto);
+			task_del(sosplice_taskq, &so->so_sp->ssp_task);
+			taskq_barrier(sosplice_taskq);
+			pool_put(&sosplice_pool, so->so_sp);
+		}
 #endif /* SOCKET_SPLICE */
-	{
-		pool_put(&socket_pool, so);
 	}
+
+	pool_put(&socket_pool, so);
 }
 
 static inline uint64_t
@@ -450,39 +445,10 @@ drop:
 		}
 	}
 discard:
-	if (so->so_state & SS_NOFDREF)
-		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
-	so->so_state |= SS_NOFDREF;
-
 #ifdef SOCKET_SPLICE
 	if (so->so_sp) {
 		struct socket *soback;
 
-		if (so->so_proto->pr_flags & PR_WANTRCVD) {
-			/*
-			 * Copy - Paste, but can't relock and sleep in
-			 * sofree() in tcp(4) case. That's why tcp(4)
-			 * still rely on solock() for splicing and
-			 * unsplicing.
-			 */
-
-			if (issplicedback(so)) {
-				int freeing = SOSP_FREEING_WRITE;
-
-				if (so->so_sp->ssp_soback == so)
-					freeing |= SOSP_FREEING_READ;
-				sounsplice(so->so_sp->ssp_soback, so, freeing);
-			}
-			if (isspliced(so)) {
-				int freeing = SOSP_FREEING_READ;
-
-				if (so == so->so_sp->ssp_socket)
-					freeing |= SOSP_FREEING_WRITE;
-				sounsplice(so, so->so_sp->ssp_socket, freeing);
-			}
-			goto free;
-		}
-
 		sounlock(so);
 		mtx_enter(&so->so_snd.sb_mtx);
 		/*
@@ -530,8 +496,12 @@ notsplicedback:
 
 		solock(so);
 	}
-free:
 #endif /* SOCKET_SPLICE */
+
+	if (so->so_state & SS_NOFDREF)
+		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
+	so->so_state |= SS_NOFDREF;
+
 	/* sofree() calls sounlock(). */
 	sofree(so, 0);
 	return (error);
@@ -1532,8 +1502,7 @@ sosplice(struct socket *so, int fd, off_
 void
 sounsplice(struct socket *so, struct socket *sosp, int freeing)
 {
-	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
-		sbassertlocked(&so->so_rcv);
+	sbassertlocked(&so->so_rcv);
 	soassertlocked(so);
 
 	task_del(sosplice_taskq, &so->so_splicetask);
@@ -1587,17 +1556,23 @@ sotask(void *arg)
 	 */
 
 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
-	if (sockstream)
-		solock(so);
-
 	if (so->so_rcv.sb_flags & SB_SPLICE) {
-		if (sockstream)
+		struct socket *sosp = so->so_sp->ssp_socket;
+
+		if (sockstream) {
+			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
+			solock(so);
 			doyield = 1;
+		}
+
 		somove(so, M_DONTWAIT);
+
+		if (sockstream) {
+			sounlock(so);
+			sbunlock(&sosp->so_snd);
+		}
 	}
 
-	if (sockstream)
-		sounlock(so);
 	sbunlock(&so->so_rcv);
 
 	if (doyield) {
@@ -1607,32 +1582,6 @@ sotask(void *arg)
 }
 
 /*
- * The socket splicing task or idle timeout may sleep while grabbing the net
- * lock.  As sofree() can be called anytime, sotask() or soidle() could access
- * the socket memory of a freed socket after wakeup.  So delay the pool_put()
- * after all pending socket splicing tasks or timeouts have finished.  Do this
- * by scheduling it on the same threads.
- */
-void
-soreaper(void *arg)
-{
-	struct socket *so = arg;
-
-	/* Reuse splice task, sounsplice() has been called before. */
-	task_set(&so->so_sp->ssp_task, soput, so);
-	task_add(sosplice_taskq, &so->so_sp->ssp_task);
-}
-
-void
-soput(void *arg)
-{
-	struct socket *so = arg;
-
-	pool_put(&sosplice_pool, so->so_sp);
-	pool_put(&socket_pool, so);
-}
-
-/*
  * Move data from receive buffer of spliced source socket to send
  * buffer of drain socket.  Try to move as much as possible in one
  * big chunk.  It is a TCP only implementation.
@@ -1652,8 +1601,10 @@ somove(struct socket *so, int wait)
 
 	if (sockdgram)
 		sbassertlocked(&so->so_rcv);
-	else
+	else {
+		sbassertlocked(&sosp->so_snd);
 		soassertlocked(so);
+	}
 
 	mtx_enter(&so->so_rcv.sb_mtx);
 	mtx_enter(&sosp->so_snd.sb_mtx);