From: Claudio Jeker Subject: Re: [patch 2 of 3] iscsid(8): avoid double free & race during cleanup To: Jack Burton Cc: tech@openbsd.org Date: Mon, 20 Jan 2025 13:54:20 +0100 On Fri, Jan 17, 2025 at 06:04:32PM +1030, Jack Burton wrote: > On Thu, 16 Jan 2025 21:27:34 +0100 > Claudio Jeker wrote: > > > On Thu, Jan 16, 2025 at 01:37:13PM +1030, Jack Burton wrote: > > > On Wed, 15 Jan 2025 14:46:02 +0100 > > > Claudio Jeker wrote: > > > > On Tue, Dec 31, 2024 at 06:33:08PM +1030, Jack Burton wrote: > > > > > Fixing that one unmasked a latent race condition during > > > > > shutdown. Depending on whether session reestablishment attempts > > > > > result in any new connections entering the login phase before > > > > > ISCSI_EXIT_WAIT expires, EITHER conn_free() would get called > > > > > multiple times on the same connection OR there'd be an outright > > > > > access after free. > > > > > > > > > > To avoid adding extra states to the fsm, simply check > > > > > exit_rounds (which gets set in shutdown_cb) before freeing > > > > > connections in sess_do_free(): if it's set then we're about to > > > > > exit anyway, so it doesn't matter that some connections may get > > > > > reallocated without freeing the old ones. > > > > > > > > Uhm. I don't understand how conn_free() can be called multiple > > > > times. > > > > > > sess_do_free() calls conn_free() in the loop for each connection in > > > the TAILQ. Then sess_do_conn_fail() calls conn_free() because > > > connection state is CONN_FREE. > > > > > > I don't really understand why sess_do_conn_fail() is getting called > > > at all, as after sess_do_free() the current session state should be > > > SESS_FREE (not SESS_RUNNING) ... but somehow it does get called. > > > > > > > Ah, I see. The problem is that sessev has another pointer to the > > connection and the sessev is not removed in conn_free(). > > I need to reread the code to see how we can fix that mess. > > But what happens is that we have a event queued with a pointer to a > > connection that was already freed. > > Thanks, that's just the clue I needed! > > You're right, it is a bit of a mess. Not your fault though: most of > that mess is necessary if we want to keep the fsm reasonably true to the > standard. > > Simplest way I could think of to avoid the double free (*without* the sledge-hammer approach of my original patch #2) was to keep track > of any one-shot event queued for each connection, dequeuing it (if any) > in conn_free(). > > [I am of course assuming that no connection ever gets more than one > one-shot event pending at the same time ... but in my testing I have not > been able to produce any scenario that violates that assumption.] > > The patch below does just that. With this patch applied I get clean > shutdowns on SIGTERM, regardless of which target(s) iscsid is connected > to and regardless of whether each of them is in login, running, or > failed when I send the signal. > > Is this a reasonable approach for the time being? > I took the liberty and cleaned up this a bit further. I added a struct sessev to the session and connection structs and use those for the session_fsm. So now a pending sessev event can be removed in connection_free. What you think about that? Can you test this against the non-netbsd targets? -- :wq Claudio Index: connection.c =================================================================== RCS file: /cvs/src/usr.sbin/iscsid/connection.c,v diff -u -p -r1.22 connection.c --- connection.c 16 Jan 2025 16:19:39 -0000 1.22 +++ connection.c 20 Jan 2025 10:40:02 -0000 @@ -71,6 +71,10 @@ conn_new(struct session *s, struct conne c->his = iscsi_conn_defaults; c->active = iscsi_conn_defaults; + c->sev.sess = s; + c->sev.conn = c; + evtimer_set(&c->sev.ev, session_fsm_callback, &c->sev); + TAILQ_INIT(&c->pdu_w); TAILQ_INIT(&c->tasks); TAILQ_INSERT_TAIL(&s->connections, c, entry); @@ -113,6 +117,7 @@ conn_free(struct connection *c) pdu_readbuf_free(&c->prbuf); pdu_free_queue(&c->pdu_w); + event_del(&c->sev.ev); event_del(&c->ev); event_del(&c->wev); if (c->fd != -1) @@ -435,7 +440,7 @@ c_do_connect(struct connection *c, enum if (c->fd == -1) { log_warnx("connect(%s), lost socket", log_sockaddr(&c->config.TargetAddr)); - session_fsm(c->session, SESS_EV_CONN_FAIL, c, 0); + session_fsm(&c->sev, SESS_EV_CONN_FAIL, 0); return CONN_FREE; } if (c->config.LocalAddr.ss_len != 0) { @@ -443,7 +448,7 @@ c_do_connect(struct connection *c, enum c->config.LocalAddr.ss_len) == -1) { log_warn("bind(%s)", log_sockaddr(&c->config.LocalAddr)); - session_fsm(c->session, SESS_EV_CONN_FAIL, c, 0); + session_fsm(&c->sev, SESS_EV_CONN_FAIL, 0); return CONN_FREE; } } @@ -456,7 +461,7 @@ c_do_connect(struct connection *c, enum } else { log_warn("connect(%s)", log_sockaddr(&c->config.TargetAddr)); - session_fsm(c->session, SESS_EV_CONN_FAIL, c, 0); + session_fsm(&c->sev, SESS_EV_CONN_FAIL, 0); return CONN_FREE; } } @@ -477,7 +482,7 @@ int c_do_loggedin(struct connection *c, enum c_event ev) { iscsi_merge_conn_params(&c->active, &c->mine, &c->his); - session_fsm(c->session, SESS_EV_CONN_LOGGED_IN, c, 0); + session_fsm(&c->sev, SESS_EV_CONN_LOGGED_IN, 0); return CONN_LOGGED_IN; } @@ -525,7 +530,7 @@ c_do_fail(struct connection *c, enum c_e taskq_cleanup(&c->tasks); /* session will take care of cleaning up the mess */ - session_fsm(c->session, SESS_EV_CONN_FAIL, c, 0); + session_fsm(&c->sev, SESS_EV_CONN_FAIL, 0); if (ev == CONN_EV_FREE || c->state & CONN_NEVER_LOGGED_IN) return CONN_FREE; Index: initiator.c =================================================================== RCS file: /cvs/src/usr.sbin/iscsid/initiator.c,v diff -u -p -r1.16 initiator.c --- initiator.c 16 Jan 2025 16:19:39 -0000 1.16 +++ initiator.c 20 Jan 2025 10:42:20 -0000 @@ -512,10 +512,10 @@ initiator_logout_cb(struct connection *c case ISCSI_LOGOUT_RESP_SUCCESS: if (tl->reason == ISCSI_LOGOUT_CLOSE_SESS) { conn_fsm(c, CONN_EV_LOGGED_OUT); - session_fsm(c->session, SESS_EV_CLOSED, NULL, 0); + session_fsm(&c->session->sev, SESS_EV_CLOSED, 0); } else { conn_fsm(tl->c, CONN_EV_LOGGED_OUT); - session_fsm(c->session, SESS_EV_CONN_CLOSED, tl->c, 0); + session_fsm(&tl->c->sev, SESS_EV_CONN_CLOSED, 0); } break; case ISCSI_LOGOUT_RESP_UNKN_CID: Index: iscsid.c =================================================================== RCS file: /cvs/src/usr.sbin/iscsid/iscsid.c,v diff -u -p -r1.22 iscsid.c --- iscsid.c 16 Apr 2021 14:37:06 -0000 1.22 +++ iscsid.c 20 Jan 2025 10:42:40 -0000 @@ -259,7 +259,7 @@ iscsid_ctrl_dispatch(void *ch, struct pd session_config(s, sc); if (s->state == SESS_INIT) - session_fsm(s, SESS_EV_START, NULL, 0); + session_fsm(&s->sev, SESS_EV_START, 0); control_compose(ch, CTRL_SUCCESS, NULL, 0); break; Index: iscsid.h =================================================================== RCS file: /cvs/src/usr.sbin/iscsid/iscsid.h,v diff -u -p -r1.20 iscsid.h --- iscsid.h 16 Jan 2025 16:19:39 -0000 1.20 +++ iscsid.h 20 Jan 2025 10:39:04 -0000 @@ -226,6 +226,7 @@ struct initiator { }; struct sessev { + struct event ev; struct session *sess; struct connection *conn; enum s_event event; @@ -233,6 +234,7 @@ struct sessev { struct session { TAILQ_ENTRY(session) entry; + struct sessev sev; struct connection_head connections; struct taskq tasks; struct session_config config; @@ -264,6 +266,7 @@ struct session_poll { struct connection { struct event ev; struct event wev; + struct sessev sev; TAILQ_ENTRY(connection) entry; struct connection_params mine; struct connection_params his; @@ -349,8 +352,8 @@ void session_config(struct session *, st void session_task_issue(struct session *, struct task *); void session_logout_issue(struct session *, struct task *); void session_schedule(struct session *); -void session_fsm(struct session *, enum s_event, struct connection *, - unsigned int); +void session_fsm(struct sessev *, enum s_event, unsigned int); +void session_fsm_callback(int, short, void *); void conn_new(struct session *, struct connection_config *); void conn_free(struct connection *); Index: session.c =================================================================== RCS file: /cvs/src/usr.sbin/iscsid/session.c,v diff -u -p -r1.10 session.c --- session.c 16 Jan 2025 16:17:32 -0000 1.10 +++ session.c 20 Jan 2025 10:43:33 -0000 @@ -35,7 +35,6 @@ #include "iscsid.h" #include "log.h" -void session_fsm_callback(int, short, void *); int sess_do_start(struct session *, struct sessev *); int sess_do_conn_loggedin(struct session *, struct sessev *); int sess_do_conn_fail(struct session *, struct sessev *); @@ -75,6 +74,9 @@ session_new(struct initiator *i, u_int8_ s->initiator = i; s->state = SESS_INIT; + s->sev.sess = s; + evtimer_set(&s->sev.ev, session_fsm_callback, &s->sev); + if (st == SESSION_TYPE_DISCOVERY) s->target = 0; else @@ -204,25 +206,20 @@ session_schedule(struct session *s) * The session FSM runs from a callback so that the connection FSM can finish. */ void -session_fsm(struct session *s, enum s_event ev, struct connection *c, - unsigned int timeout) +session_fsm(struct sessev *sev, enum s_event event, unsigned int timeout) { + struct session *s = sev->sess; struct timeval tv; - struct sessev *sev; log_debug("session_fsm[%s]: %s ev %s timeout %d", s->config.SessionName, sess_state(s->state), - sess_event(ev), timeout); + sess_event(event), timeout); - if ((sev = malloc(sizeof(*sev))) == NULL) - fatal("session_fsm"); - sev->conn = c; - sev->sess = s; - sev->event = ev; + sev->event = event; timerclear(&tv); tv.tv_sec = timeout; - if (event_once(-1, EV_TIMEOUT, session_fsm_callback, sev, &tv) == -1) + if (evtimer_add(&sev->ev, &tv) == -1) fatal("session_fsm"); } @@ -276,8 +273,6 @@ session_fsm_callback(int fd, short event sess_state(s->state), sess_event(sev->event)); fatalx("bjork bjork bjork"); } - free(sev); -log_debug("sess_fsm: done"); } int @@ -360,7 +355,7 @@ sess_do_conn_fail(struct session *s, str state = SESS_LOGGED_IN; } - session_fsm(s, SESS_EV_START, NULL, s->holdTimer); + session_fsm(&s->sev, SESS_EV_START, s->holdTimer); /* exponential back-off on constant failure */ if (s->holdTimer < ISCSID_HOLD_TIME_MAX) s->holdTimer = s->holdTimer ? s->holdTimer * 2 : 1;