Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: Don't take solock() in sosend() and soreceive() paths for unix(4) sockets
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
tech@openbsd.org
Date:
Fri, 3 May 2024 11:30:39 +0300

Download raw body.

Thread
On Fri, May 03, 2024 at 02:10:49AM +0300, Vitaliy Makkoveev wrote:
> On Fri, May 03, 2024 at 12:26:32AM +0200, Alexander Bluhm wrote:
> > On Wed, May 01, 2024 at 01:01:22AM +0300, Vitaliy Makkoveev wrote:
> > > On Tue, Apr 30, 2024 at 11:01:04PM +0300, Vitaliy Makkoveev wrote:
> > > > Use `sb_mtx' mutex(9) and `sb_lock' rwlock(9) to protect `so_snd' and
> > > > `so_rcv' of unix(4) sockets.
> > > > 
> > > > The transmission of unix(4) sockets is already half-unlocked because
> > > > the connected peer is not locked by solock() during the sbappend*()
> > > > call. Since `so_snd' is protected by the `sb_mtx' mutex(9), re-locking
> > > > is not required in uipc_rcvd() either.
> > > > 
> > > > SB_OWNLOCK became redundant with SB_MTXLOCK, so remove it. SB_MTXLOCK
> > > > was kept because checks against SB_MTXLOCK within sb*() routines look
> > > > more consistent to me.
> > > > 
> > > > Please note, the unlocked peer `so2' of a unix(4) socket can't be
> > > > disconnected while solock() is held on `so'. That's why the unlocked
> > > > sorwakeup() and sowwakeup() are fine; the corresponding paths will
> > > > never be followed.
> > > > 
> > > 
> > > Add forgotten fifofs chunks. Compared to the previous diff, this one
> > > disposes of and cleans up `so_rcv' directly in sofree(). Such a socket
> > > is almost dead and unlinked from everywhere, including the spliced
> > > peer, so a concurrent sotask() thread will just exit. This is required
> > > to solve the inode and `so_rcv' lock ordering. This also removes
> > > re-locking from sofree() for all sockets.
> > 
> > Passed regress with witness.
> > 
> > One question, otherwise OK bluhm@
> > 
> > [skip] 
> > 
> > You change the order of EPIPE and ENOTCONN error checks here.
> > The difference is relevant, EPIPE may signal a SIGPIPE.
> > 
> > When writing into a socket that stops working, the error should be
> > EPIPE.  If there are two processes, A is writing to B, and B
> > disconnects the socket, then A should get a EPIPE.
> > 
> > Do you change the behavior?
> > 
> > In sosend() we have a SS_CANTSENDMORE check with EPIPE error.
> > So maybe the uipc_send() errors are irrelevant.
> > 
> 
> Yes, I change the behavior. I had concerns, so I checked the FreeBSD version of
> uipc_send() and found that they do "so->so_state & SS_ISCONNECTED" check
> before "so->so_snd.sb_state & SBS_CANTSENDMORE". But sosend_generic()
> has the different order. Since this code exists for a long time, I found
> this order not significant.
> 
> However, our fifofs is the only place where we do direct SS_CANTSENDMORE
> flag modifications. We could keep solock() around them for serialization
> with uipc_send() and keep existing checks order.
> 

I checked other BSDs. As you can see, FreeBSD, OSX and DragonFlyBSD
return ENOTCONN before EPIPE. NetBSD doesn't check SS_CANTSENDMORE
and only returns ENOTCONN.

So which way do we prefer?

1. NetBSD

unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct lwp *l)
{

	struct unpcb *unp = sotounpcb(so);
	int error = 0;
	u_int newhiwat;
	struct socket *so2;

	KASSERT(solocked(so));
	KASSERT(unp != NULL);
	KASSERT(m != NULL);

	/*
	 * Note: unp_internalize() rejects any control message
	 * other than SCM_RIGHTS, and only allows one.  This
	 * has the side-effect of preventing a caller from
	 * forging SCM_CREDS.
	 */
	if (control) {
		sounlock(so);
		error = unp_internalize(&control);
		solock(so);
		if (error != 0) {
			m_freem(control);
			m_freem(m);
			return error;
		}
	}

	switch (so->so_type) {

	case SOCK_DGRAM: {
		/* SKIP */
		break;
	}

	case SOCK_SEQPACKET: /* FALLTHROUGH */
	case SOCK_STREAM:
#define	rcv (&so2->so_rcv)
#define	snd (&so->so_snd)
		if (unp->unp_conn == NULL) {
			error = ENOTCONN;
			break;
		}
		so2 = unp->unp_conn->unp_socket;
		KASSERT(solocked2(so, so2));
		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
			/*
			 * Credentials are passed only once on
			 * SOCK_STREAM and SOCK_SEQPACKET.
			 */
			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
			control = unp_addsockcred(l, control);
		}
		if (unp->unp_conn->unp_flags & UNP_OWANTCRED) {
			/*
			 * Credentials are passed only once on
			 * SOCK_STREAM and SOCK_SEQPACKET.
			 */
			unp->unp_conn->unp_flags &= ~UNP_OWANTCRED;
			MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
			    stub_compat_70_unp_addsockcred(curlwp, control),
			    control);
		}
		/*
		 * Send to paired receive port, and then reduce
		 * send buffer hiwater marks to maintain backpressure.
		 * Wake up readers.
		 */
		if (control) {
			if (sbappendcontrol(rcv, m, control) != 0)
				control = NULL;
		} else {

2. MacOS

uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, proc_t p)
{
	int error = 0;
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	int32_t len = m_pktlen(m);

	if (unp == 0) {
		error = EINVAL;
		goto release;
	}
	if (flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}

	if (control) {
		/* release lock to avoid deadlock (4436174) */
		socket_unlock(so, 0);
		error = unp_internalize(control, p);
		socket_lock(so, 0);
		if (error) {
			goto release;
		}
	}

	switch (so->so_type) {
	case SOCK_DGRAM:
	{
		/* SKIP */
		break;
	}

	case SOCK_STREAM: {
		int didreceive = 0;
#define rcv (&so2->so_rcv)
#define snd (&so->so_snd)
		/* Connect if not connected yet. */
		/*
		 * Note: A better implementation would complain
		 * if not equal to the peer's address.
		 */
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (nam) {
				error = unp_connect(so, nam, p);
				if (error) {
					so->so_state &= ~SS_ISCONNECTING;
					break;  /* XXX */
				}
			} else {
				error = ENOTCONN;
				break;
			}
		}

		if (so->so_state & SS_CANTSENDMORE) {
			error = EPIPE;
			break;
		}

3. DragonFlyBSD

uipc_send(netmsg_t msg)
{
	struct unpcb *unp, *unp2;
	struct socket *so;
	struct socket *so2;
	struct mbuf *control;
	struct mbuf *m;
	int error = 0;

	so = msg->base.nm_so;
	control = msg->send.nm_control;
	m = msg->send.nm_m;

	/*
	 * so_pcb is only modified with both the global and the unp
	 * pool token held.
	 */
	so = msg->base.nm_so;
	unp = unp_getsocktoken(so);

	if (!UNP_ISATTACHED(unp)) {
		error = EINVAL;
		goto release;
	}

	if (msg->send.nm_flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}

	wakeup_start_delayed();

	if (control && (error = unp_internalize(control, msg->send.nm_td)))
		goto release;

	switch (so->so_type) {
	case SOCK_DGRAM:
	{
		/* SKIP */
		break;
	}

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		/* Connect if not connected yet. */
		/*
		 * Note: A better implementation would complain
		 * if not equal to the peer's address.
		 */
		if (unp->unp_conn == NULL) {
			if (msg->send.nm_addr) {
				error = unp_connect(so,
						    msg->send.nm_addr,
						    msg->send.nm_td);
				if (error)
					break;	/* XXX */
			}
			/*
			 * NOTE:
			 * unp_conn still could be NULL, even if the
			 * above unp_connect() succeeds; since the
			 * current unp's token could be released due
			 * to blocking operations after unp_conn is
			 * assigned.
			 */
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
		}
		if (so->so_state & SS_CANTSENDMORE) {
			error = EPIPE;
			break;
		}