Index | Thread | Search

From:
Jonathan Matthew <jonathan@d14n.org>
Subject:
nvme: add async event processing
To:
tech@openbsd.org
Date:
Fri, 26 Dec 2025 23:49:33 +1000

Download raw body.

Thread
  • Jonathan Matthew:

    nvme: add async event processing

nvme controllers can report asynchronous events to the host, for things
like health events, driver or controller internal errors, namespace
changes, and some other stuff.

The diff below implements processing of internal error and SMART/health
events, just by printing a message on the console. These might shed some
light on cases where an nvme device stops working, at least.

The driver has to first enable events it wants, then issue an
Asynchronous Event Request command, which only completes when there's an
event for the host to process. For some event types, the driver has to
read a log page off the controller to clear the event, allowing future
events of that type to be reported.

If the controller doesn't implement async events, we just stop after the
first AER command fails. Some hypervisors do this, but I'm not aware of
any real hardware that does.

Like the sensor code, this is almost all inside SMALL_KERNEL, so it only
adds a few bytes to ramdisk kernels.

ok?


Index: nvmereg.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvmereg.h,v
diff -u -p -r1.16 nvmereg.h
--- nvmereg.h	13 Sep 2024 09:57:34 -0000	1.16
+++ nvmereg.h	26 Dec 2025 12:10:10 -0000
@@ -416,6 +416,23 @@ struct nvm_identify_namespace {
 	u_int8_t	vs[3712];
 } __packed __aligned(8);
 
+#define NVM_LOG_PAGE_ERROR_INFORMATION	0x01
+struct nvm_error_information {
+	u_int64_t	error_count;
+	u_int16_t	queue_id;
+	u_int16_t	command_id;
+	u_int16_t	status;
+	u_int16_t	error_location;
+	u_int64_t	lba;
+	u_int32_t	nsid;
+	u_int8_t	vendor_info;
+	u_int8_t	transport_type;
+	u_int16_t	_reserved1;
+	u_int64_t	command_info;
+	u_int16_t	transport_info;
+	u_int8_t	_reserved2[22];
+} __packed __aligned(8);
+
 #define NVM_LOG_PAGE_SMART_HEALTH	0x02
 struct nvm_smart_health {
 	u_int8_t	critical_warning;
@@ -453,3 +470,13 @@ struct nvm_smart_health {
 	
 	u_int8_t	_reserved2[280];
 } __packed __aligned(8);
+
+#define NVM_ASYNC_EVENT_INFO_MASK		0x0000ff00
+#define NVM_ASYNC_EVENT_INFO_SHIFT		8
+#define NVM_ASYNC_EVENT_TYPE_MASK		0x00000007
+
+#define NVM_ASYNC_EVENT_TYPE_ERROR		0
+#define NVM_ASYNC_EVENT_TYPE_SMART		1
+
+#define NVM_FEATURE_ASYNC_EVENT_ENABLE		0x0B
+#define  NVM_ASYNC_EVENT_ENABLE_SMART		0x00FF
Index: nvmevar.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvmevar.h,v
diff -u -p -r1.32 nvmevar.h
--- nvmevar.h	16 Dec 2025 00:24:55 -0000	1.32
+++ nvmevar.h	26 Dec 2025 12:10:10 -0000
@@ -17,6 +17,7 @@
  */
 
 #include <sys/sensors.h>
+#include <sys/task.h>
 
 #define NVME_IO_Q	1
 #define NVME_HIB_Q	2
@@ -50,6 +51,7 @@ struct nvme_ccb {
 
 	u_int16_t		ccb_id;
 	u_int16_t		ccb_cqe_flags;
+	u_int32_t		ccb_cqe_cdw0;
 };
 SIMPLEQ_HEAD(nvme_ccb_list, nvme_ccb);
 
@@ -134,6 +136,12 @@ struct nvme_softc {
 	struct ksensor		sc_temp_sensor;
 	struct ksensor		sc_spare_sensor;
 	struct ksensor		sc_usage_sensor;
+
+	struct scsi_iohandler	sc_aer_handler;
+	struct nvme_ccb		*sc_aer_ccb;
+	u_int32_t		sc_aen_log_page;
+	size_t			sc_aen_log_page_size;
+	struct task		sc_aen_log_task;
 };
 
 #define DEVNAME(_sc) ((_sc)->sc_dev.dv_xname)
Index: nvme.c
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvme.c,v
diff -u -p -r1.125 nvme.c
--- nvme.c	16 Dec 2025 00:24:55 -0000	1.125
+++ nvme.c	26 Dec 2025 12:10:10 -0000
@@ -61,7 +61,13 @@ int	nvme_identify(struct nvme_softc *, u
 void	nvme_fill_identify(struct nvme_softc *, struct nvme_ccb *, void *);
 
 #ifndef SMALL_KERNEL
+struct nvme_dmamem *nvme_read_log_page(struct nvme_softc *, struct nvme_ccb *,
+	    u_int32_t, size_t);
 void	nvme_refresh_sensors(void *);
+
+void	nvme_start_aer(void *, void *);
+void	nvme_aen_read_log(void *);
+void	nvme_enable_async_events(struct nvme_softc *);
 #endif
 
 int	nvme_ccbs_alloc(struct nvme_softc *, u_int);
@@ -431,6 +437,8 @@ nvme_attach(struct nvme_softc *sc)
 		goto free_q;
 
 	sensordev_install(&sc->sc_sensordev);
+
+	nvme_enable_async_events(sc);
 #endif
 
 	sc->sc_scsibus = (struct scsibus_softc *)config_found(&sc->sc_dev,
@@ -472,6 +480,11 @@ nvme_resume(struct nvme_softc *sc)
 		return (1);
 	}
 
+	/* outstanding aer commands are aborted on reset; clear the
+	 * stale pointer so a later reset can't put the ccb twice */
+	if (sc->sc_aer_ccb != NULL) {
+		scsi_io_put(&sc->sc_iopool, sc->sc_aer_ccb);
+		sc->sc_aer_ccb = NULL;
+	}
 	sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
 	if (sc->sc_q == NULL) {
 		printf("%s: unable to allocate io q\n", DEVNAME(sc));
@@ -485,6 +498,10 @@ nvme_resume(struct nvme_softc *sc)
 
 	nvme_write4(sc, NVME_INTMC, 1);
 
+#ifndef SMALL_KERNEL
+	nvme_enable_async_events(sc);
+#endif
+
 	return (0);
 
 free_q:
@@ -1213,6 +1230,7 @@ nvme_q_complete(struct nvme_softc *sc, s
 		sc->sc_ops->op_cq_done(sc, q, ccb);
 
 		ccb->ccb_cqe_flags = lemtoh16(&cqe->flags);
+		ccb->ccb_cqe_cdw0 = lemtoh32(&cqe->cdw0);
 		SIMPLEQ_INSERT_TAIL(&done_list, ccb, ccb_entry);
 
 		if (++head >= q->q_entries) {
@@ -2151,33 +2169,26 @@ nvme_bioctl_disk(struct nvme_softc *sc, 
 #endif	/* NBIO > 0 */
 
 #ifndef SMALL_KERNEL
-void
-nvme_refresh_sensors(void *arg)
+
+struct nvme_dmamem *
+nvme_read_log_page(struct nvme_softc *sc, struct nvme_ccb *ccb,
+    u_int32_t page, size_t length)
 {
-	struct nvme_softc 		*sc = arg;
 	struct nvme_sqe			 sqe;
 	struct nvme_dmamem		*mem = NULL;
-	struct nvme_ccb			*ccb = NULL;
-	struct nvm_smart_health 	*health;
 	uint32_t			 dwlen;
-	uint8_t 			 cw;
 	int				 flags;
-	int64_t				 temp;
 
-	ccb = nvme_ccb_get(sc);
-	if (ccb == NULL)
-		goto failed;
-
-	mem = nvme_dmamem_alloc(sc, sizeof(*health));
+	mem = nvme_dmamem_alloc(sc, length);
 	if (mem == NULL)
 		goto failed;
 	nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
 
-	dwlen = (sizeof(*health) >> 2) - 1;
+	dwlen = (length >> 2) - 1;
 	memset(&sqe, 0, sizeof(sqe));
 	sqe.opcode = NVM_ADMIN_GET_LOG_PG;
 	htolem32(&sqe.nsid, 0xffffffff);
-	htolem32(&sqe.cdw10, (dwlen << 16 | NVM_LOG_PAGE_SMART_HEALTH));
+	htolem32(&sqe.cdw10, (dwlen << 16 | page));
 	htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem));
 
 	ccb->ccb_done = nvme_empty_done;
@@ -2189,6 +2200,32 @@ nvme_refresh_sensors(void *arg)
 	if (flags != 0)
 		goto failed;
 
+	return mem;
+ failed:
+	if (mem != NULL)
+		nvme_dmamem_free(sc, mem);
+	return NULL;
+}
+
+void
+nvme_refresh_sensors(void *arg)
+{
+	struct nvme_softc 		*sc = arg;
+	struct nvme_ccb			*ccb;
+	struct nvme_dmamem		*mem = NULL;
+	struct nvm_smart_health 	*health;
+	uint8_t				 cw;
+	int64_t				 temp;
+
+	ccb = nvme_ccb_get(sc);
+	if (ccb == NULL)
+		goto failed;
+
+	mem = nvme_read_log_page(sc, ccb, NVM_LOG_PAGE_SMART_HEALTH,
+	    sizeof(*health));
+	if (mem == NULL)
+		goto failed;
+
 	health = NVME_DMA_KVA(mem); 
 	cw = health->critical_warning;
 
@@ -2215,4 +2252,148 @@ nvme_refresh_sensors(void *arg)
 	if (ccb != NULL)
 		nvme_ccb_put(sc, ccb);
 }
+
+void
+nvme_aen_read_log(void *xsc)
+{
+	struct nvme_softc		*sc = xsc;
+	struct nvme_dmamem		*mem = NULL;
+	struct nvme_ccb			*ccb;
+
+	ccb = scsi_io_get(&sc->sc_iopool, 0);
+	KASSERT(ccb != NULL);
+
+	mem = nvme_read_log_page(sc, ccb, sc->sc_aen_log_page,
+	    sc->sc_aen_log_page_size);
+	if (mem != NULL)
+		nvme_dmamem_free(sc, mem);
+
+	scsi_io_put(&sc->sc_iopool, ccb);
+	scsi_ioh_add(&sc->sc_aer_handler);
+}
+
+
+/* NVMe base spec rev 1.4c section 5.2.1 */
+static const char *error_events[] = {
+	"invalid doorbell register",
+	"invalid doorbell write",
+	"diagnostic failure",
+	"persistent internal error",
+	"transient internal error"
+};
+static const char *smart_events[] = {
+	"SMART/health reliability event",
+	"SMART/health temperature event",
+	"SMART/health spare capacity below threshold"
+};
+
+void
+nvme_aer_done(struct nvme_softc *sc, struct nvme_ccb *ccb)
+{
+	int info, event, restart;
+
+	sc->sc_aer_ccb = NULL;
+
+	/* if the controller doesn't support async events, stop */
+	if (NVME_CQE_SCT(ccb->ccb_cqe_flags) == NVME_CQE_SCT_GENERIC &&
+	    NVME_CQE_SC(ccb->ccb_cqe_flags) == NVME_CQE_SC_INVALID_OPCODE &&
+	    (ccb->ccb_cqe_flags & NVME_CQE_DNR)) {
+		scsi_io_put(&sc->sc_iopool, ccb);
+		return;
+	}
+
+	info = (ccb->ccb_cqe_cdw0 & NVM_ASYNC_EVENT_INFO_MASK) >>
+	    NVM_ASYNC_EVENT_INFO_SHIFT;
+	event = ccb->ccb_cqe_cdw0 & NVM_ASYNC_EVENT_TYPE_MASK;
+
+	restart = 0;
+	switch (event) {
+	case NVM_ASYNC_EVENT_TYPE_ERROR:
+		if (info < nitems(error_events)) {
+			printf("%s: %s\n", DEVNAME(sc), error_events[info]);
+		} else {
+			printf("%s: error event %d\n", DEVNAME(sc), info);
+		}
+		sc->sc_aen_log_page = NVM_LOG_PAGE_ERROR_INFORMATION;
+		sc->sc_aen_log_page_size = sizeof(struct nvm_error_information);
+		task_add(systq, &sc->sc_aen_log_task);
+		break;
+
+	case NVM_ASYNC_EVENT_TYPE_SMART:
+		if (info < nitems(smart_events)) {
+			printf("%s: %s\n", DEVNAME(sc), smart_events[info]);
+		} else {
+			printf("%s: SMART/health event %d\n", DEVNAME(sc),
+			    info);
+		}
+		sc->sc_aen_log_page = NVM_LOG_PAGE_SMART_HEALTH;
+		sc->sc_aen_log_page_size = sizeof(struct nvm_smart_health);
+		task_add(systq, &sc->sc_aen_log_task);
+		break;
+	default:
+		restart = 1;
+		printf("%s: async event type %d (cdw0 %x)\n",
+		    DEVNAME(sc), event, ccb->ccb_cqe_cdw0);
+		break;
+	}
+
+	scsi_io_put(&sc->sc_iopool, ccb);
+	if (restart)
+		scsi_ioh_add(&sc->sc_aer_handler);
+}
+
+void
+nvme_aer_fill(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
+{
+	struct nvme_sqe *sqe = slot;
+	sqe->opcode = NVM_ADMIN_ASYNC_EV_REQ;
+}
+
+void
+nvme_start_aer(void *cookie, void *io)
+{
+	struct nvme_softc		*sc = cookie;
+	struct nvme_ccb			*ccb = io;
+
+	/* we need to return this ccb on controller reset */
+	sc->sc_aer_ccb = ccb;
+
+	ccb->ccb_done = nvme_aer_done;
+	ccb->ccb_cookie = sc;
+
+	nvme_q_submit(sc, sc->sc_admin_q, ccb, nvme_aer_fill);
+}
+
+void
+nvme_enable_async_events(struct nvme_softc *sc)
+{
+	struct nvme_ccb			*ccb;
+	struct nvme_sqe			sqe;
+	uint32_t			events;
+
+	ccb = scsi_io_get(&sc->sc_iopool, 0);
+	KASSERT(ccb != NULL);
+
+	ccb->ccb_done = nvme_empty_done;
+	ccb->ccb_cookie = &sqe;
+
+	events = NVM_ASYNC_EVENT_ENABLE_SMART;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opcode = NVM_ADMIN_SET_FEATURES;
+	sqe.cdw10 = htole32(NVM_FEATURE_ASYNC_EVENT_ENABLE);
+	sqe.cdw11 = htole32(events);
+
+	/* ignore failures here */
+	nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill,
+	    NVME_TIMO_LOG_PAGE);
+
+	scsi_io_put(&sc->sc_iopool, ccb);
+
+	task_set(&sc->sc_aen_log_task, nvme_aen_read_log, sc);
+	scsi_ioh_set(&sc->sc_aer_handler, &sc->sc_iopool,
+	    nvme_start_aer, sc);
+	scsi_ioh_add(&sc->sc_aer_handler);
+}
+
 #endif /* SMALL_KERNEL */