From: Vitaliy Makkoveev Subject: Re: Don't take solock() in sosend() and soreceive() paths for unix(4) sockets To: Alexander Bluhm Cc: tech@openbsd.org Date: Fri, 3 May 2024 11:30:39 +0300 On Fri, May 03, 2024 at 02:10:49AM +0300, Vitaliy Makkoveev wrote: > On Fri, May 03, 2024 at 12:26:32AM +0200, Alexander Bluhm wrote: > > On Wed, May 01, 2024 at 01:01:22AM +0300, Vitaliy Makkoveev wrote: > > > On Tue, Apr 30, 2024 at 11:01:04PM +0300, Vitaliy Makkoveev wrote: > > > > Use `sb_mtx' mutex(9) and `sb_lock' rwlock(9) to protect `so_snd' and > > > > `so_rcv' of unix(4) sockets. > > > > > > > > The transmission of unix(4) sockets already half-unlocked because > > > > connected peer is not locked by solock() during sbappend*() call. Since > > > > the `so_snd' is protected by `sb_mtx' mutex(9) the re-locking is not > > > > required in uipc_rcvd() too. > > > > > > > > SB_OWNLOCK became redundant with SB_MTXLOCK, so remove it. SB_MTXLOCK > > > > was kept because checks against SB_MTXLOCK within sb*() routines look > > > > more consistent to me. > > > > > > > > Please note, the unlocked peer `so2' of unix(4) can't be disconnected > > > > while solock() is held on `so'. That's why unlocked sorwakeup() and > > > > sowwakeup() are fine, corresponding paths will never be followed. > > > > > > > > > > Add forgotten fifofs chunks. Against previous, this diff does direct > > > `so_rcv' dispose and cleanup in sofree(). This sockets is almost dead > > > and unlinked from everywhere include spliced peer. So concurrent > > > sotask() thread will just exit. This required to solve inode and so_rcv > > > locks ordering. Also this removes re-locking from sofree() for all > > > sockets. > > > > Passed regress with witness. > > > > One question, otherwise OK bluhm@ > > > > [skip] > > > > You change the order of EPIPE and ENOTCONN error checks here. > > The difference is relevant, EPIPE may signal a SIGPIPE. > > > > When writing into a socket that stops working, the error should be > > EPIPE. If there are two processes, A is writing to B, and B > > disconnects the socket, then A should get a EPIPE. > > > > Do you change the behavior? > > > > In sosend() we have a SS_CANTSENDMORE check with EPIPE error. > > So maybe the uipc_send() errors are irrelevant. > > > > Yes, I change behavior. I had concerns , so I checked FreeBSD version of > uipc_send() and found that they do "so->so_state & SS_ISCONNECTED" check > before "so->so_snd.sb_state & SBS_CANTSENDMORE". But sosend_generic() > has the different order. Since this code exists for a long time, I found > this order not significant. > > However, our fifofs is the only place where we do direct SS_CANTSENDMORE > flag modifications. We could keep solock() around them for serialization > with uipc_send() and keep existing checks order. > I checked other BSDs. As you can see, FreeBSD, OSX and DragonFlyBSD returns ENOTCONN before EPIPE. NetBSD doesn't check SS_CANTSENDMORE and only returns ENOTCONN. So which way do we prefer? 1. NetBSD unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct unpcb *unp = sotounpcb(so); int error = 0; u_int newhiwat; struct socket *so2; KASSERT(solocked(so)); KASSERT(unp != NULL); KASSERT(m != NULL); /* * Note: unp_internalize() rejects any control message * other than SCM_RIGHTS, and only allows one. This * has the side-effect of preventing a caller from * forging SCM_CREDS. */ if (control) { sounlock(so); error = unp_internalize(&control); solock(so); if (error != 0) { m_freem(control); m_freem(m); return error; } } switch (so->so_type) { case SOCK_DGRAM: { /* SKIP */ break; } case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: #define rcv (&so2->so_rcv) #define snd (&so->so_snd) if (unp->unp_conn == NULL) { error = ENOTCONN; break; } so2 = unp->unp_conn->unp_socket; KASSERT(solocked2(so, so2)); if (unp->unp_conn->unp_flags & UNP_WANTCRED) { /* * Credentials are passed only once on * SOCK_STREAM and SOCK_SEQPACKET. */ unp->unp_conn->unp_flags &= ~UNP_WANTCRED; control = unp_addsockcred(l, control); } if (unp->unp_conn->unp_flags & UNP_OWANTCRED) { /* * Credentials are passed only once on * SOCK_STREAM and SOCK_SEQPACKET. */ unp->unp_conn->unp_flags &= ~UNP_OWANTCRED; MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control), stub_compat_70_unp_addsockcred(curlwp, control), control); } /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. */ if (control) { if (sbappendcontrol(rcv, m, control) != 0) control = NULL; } else { 2. MacOS uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, proc_t p) { int error = 0; struct unpcb *unp = sotounpcb(so); struct socket *so2; int32_t len = m_pktlen(m); if (unp == 0) { error = EINVAL; goto release; } if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control) { /* release lock to avoid deadlock (4436174) */ socket_unlock(so, 0); error = unp_internalize(control, p); socket_lock(so, 0); if (error) { goto release; } } switch (so->so_type) { case SOCK_DGRAM: { /* SKIP */ break; } case SOCK_STREAM: { int didreceive = 0; #define rcv (&so2->so_rcv) #define snd (&so->so_snd) /* Connect if not connected yet. */ /* * Note: A better implementation would complain * if not equal to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam) { error = unp_connect(so, nam, p); if (error) { so->so_state &= ~SS_ISCONNECTING; break; /* XXX */ } } else { error = ENOTCONN; break; } } if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; break; } 3. DragonFlyBSD uipc_send(netmsg_t msg) { struct unpcb *unp, *unp2; struct socket *so; struct socket *so2; struct mbuf *control; struct mbuf *m; int error = 0; so = msg->base.nm_so; control = msg->send.nm_control; m = msg->send.nm_m; /* * so_pcb is only modified with both the global and the unp * pool token held. */ so = msg->base.nm_so; unp = unp_getsocktoken(so); if (!UNP_ISATTACHED(unp)) { error = EINVAL; goto release; } if (msg->send.nm_flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } wakeup_start_delayed(); if (control && (error = unp_internalize(control, msg->send.nm_td))) goto release; switch (so->so_type) { case SOCK_DGRAM: { /* SKIP */ break; } case SOCK_STREAM: case SOCK_SEQPACKET: /* Connect if not connected yet. */ /* * Note: A better implementation would complain * if not equal to the peer's address. */ if (unp->unp_conn == NULL) { if (msg->send.nm_addr) { error = unp_connect(so, msg->send.nm_addr, msg->send.nm_td); if (error) break; /* XXX */ } /* * NOTE: * unp_conn still could be NULL, even if the * above unp_connect() succeeds; since the * current unp's token could be released due * to blocking operations after unp_conn is * assigned. */ if (unp->unp_conn == NULL) { error = ENOTCONN; break; } } if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; break; }