Download raw body.
nvme: add async event processing
nvme controllers can report asynchronous events to the host, for things
like health events, driver or controller internal errors, namespace
changes, and other conditions.
The diff below implements processing of internal error and SMART/health
events, just by printing a message on the console. These might shed some
light on cases where an nvme device stops working, at least.
The driver has to first enable events it wants, then issue an
Asynchronous Event Request command, which only completes when there's an
event for the host to process. For some event types, the driver has to
read a log page off the controller to clear the event, allowing future
events of that type to be reported.
If the controller doesn't implement async events, we just stop after the
first AER command fails. Some hypervisors do this, but I'm not aware of
any real hardware that does.
Like the sensor code, this is almost all inside SMALL_KERNEL, so it only
adds a few bytes to ramdisk kernels.
ok?
Index: nvmereg.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvmereg.h,v
diff -u -p -r1.16 nvmereg.h
--- nvmereg.h 13 Sep 2024 09:57:34 -0000 1.16
+++ nvmereg.h 26 Dec 2025 12:10:10 -0000
@@ -416,6 +416,23 @@ struct nvm_identify_namespace {
u_int8_t vs[3712];
} __packed __aligned(8);
+#define NVM_LOG_PAGE_ERROR_INFORMATION 0x01
+struct nvm_error_information {
+ u_int64_t error_count;
+ u_int16_t queue_id;
+ u_int16_t command_id;
+ u_int16_t status;
+ u_int16_t error_location;
+ u_int64_t lba;
+ u_int32_t nsid;
+ u_int8_t vendor_info;
+ u_int8_t transport_type;
+ u_int16_t _reserved1;
+ u_int64_t command_info;
+ u_int16_t transport_info;
+ u_int8_t _reserved2[22];
+} __packed __aligned(8);
+
#define NVM_LOG_PAGE_SMART_HEALTH 0x02
struct nvm_smart_health {
u_int8_t critical_warning;
@@ -453,3 +470,13 @@ struct nvm_smart_health {
u_int8_t _reserved2[280];
} __packed __aligned(8);
+
+#define NVM_ASYNC_EVENT_INFO_MASK 0x0000ff00
+#define NVM_ASYNC_EVENT_INFO_SHIFT 8
+#define NVM_ASYNC_EVENT_TYPE_MASK 0x00000007
+
+#define NVM_ASYNC_EVENT_TYPE_ERROR 0
+#define NVM_ASYNC_EVENT_TYPE_SMART 1
+
+#define NVM_FEATURE_ASYNC_EVENT_ENABLE 0x0B
+#define NVM_ASYNC_EVENT_ENABLE_SMART 0x00FF
Index: nvmevar.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvmevar.h,v
diff -u -p -r1.32 nvmevar.h
--- nvmevar.h 16 Dec 2025 00:24:55 -0000 1.32
+++ nvmevar.h 26 Dec 2025 12:10:10 -0000
@@ -17,6 +17,7 @@
*/
#include <sys/sensors.h>
+#include <sys/task.h>
#define NVME_IO_Q 1
#define NVME_HIB_Q 2
@@ -50,6 +51,7 @@ struct nvme_ccb {
u_int16_t ccb_id;
u_int16_t ccb_cqe_flags;
+ u_int32_t ccb_cqe_cdw0;
};
SIMPLEQ_HEAD(nvme_ccb_list, nvme_ccb);
@@ -134,6 +136,12 @@ struct nvme_softc {
struct ksensor sc_temp_sensor;
struct ksensor sc_spare_sensor;
struct ksensor sc_usage_sensor;
+
+ struct scsi_iohandler sc_aer_handler;
+ struct nvme_ccb *sc_aer_ccb;
+ u_int32_t sc_aen_log_page;
+ size_t sc_aen_log_page_size;
+ struct task sc_aen_log_task;
};
#define DEVNAME(_sc) ((_sc)->sc_dev.dv_xname)
Index: nvme.c
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvme.c,v
diff -u -p -r1.125 nvme.c
--- nvme.c 16 Dec 2025 00:24:55 -0000 1.125
+++ nvme.c 26 Dec 2025 12:10:10 -0000
@@ -61,7 +61,13 @@ int nvme_identify(struct nvme_softc *, u
void nvme_fill_identify(struct nvme_softc *, struct nvme_ccb *, void *);
#ifndef SMALL_KERNEL
+struct nvme_dmamem *nvme_read_log_page(struct nvme_softc *, struct nvme_ccb *,
+ u_int32_t, size_t);
void nvme_refresh_sensors(void *);
+
+void nvme_start_aer(void *, void *);
+void nvme_aen_read_log(void *);
+void nvme_enable_async_events(struct nvme_softc *);
#endif
int nvme_ccbs_alloc(struct nvme_softc *, u_int);
@@ -431,6 +437,8 @@ nvme_attach(struct nvme_softc *sc)
goto free_q;
sensordev_install(&sc->sc_sensordev);
+
+ nvme_enable_async_events(sc);
#endif
sc->sc_scsibus = (struct scsibus_softc *)config_found(&sc->sc_dev,
@@ -472,6 +480,11 @@ nvme_resume(struct nvme_softc *sc)
return (1);
}
+ /* outstanding aer commands are aborted on reset */
+ if (sc->sc_aer_ccb != NULL) {
+ scsi_io_put(&sc->sc_iopool, sc->sc_aer_ccb);
+ }
+
sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
if (sc->sc_q == NULL) {
printf("%s: unable to allocate io q\n", DEVNAME(sc));
@@ -485,6 +498,10 @@ nvme_resume(struct nvme_softc *sc)
nvme_write4(sc, NVME_INTMC, 1);
+#ifndef SMALL_KERNEL
+ nvme_enable_async_events(sc);
+#endif
+
return (0);
free_q:
@@ -1213,6 +1230,7 @@ nvme_q_complete(struct nvme_softc *sc, s
sc->sc_ops->op_cq_done(sc, q, ccb);
ccb->ccb_cqe_flags = lemtoh16(&cqe->flags);
+ ccb->ccb_cqe_cdw0 = lemtoh32(&cqe->cdw0);
SIMPLEQ_INSERT_TAIL(&done_list, ccb, ccb_entry);
if (++head >= q->q_entries) {
@@ -2151,33 +2169,26 @@ nvme_bioctl_disk(struct nvme_softc *sc,
#endif /* NBIO > 0 */
#ifndef SMALL_KERNEL
-void
-nvme_refresh_sensors(void *arg)
+
+struct nvme_dmamem *
+nvme_read_log_page(struct nvme_softc *sc, struct nvme_ccb *ccb,
+ u_int32_t page, size_t length)
{
- struct nvme_softc *sc = arg;
struct nvme_sqe sqe;
struct nvme_dmamem *mem = NULL;
- struct nvme_ccb *ccb = NULL;
- struct nvm_smart_health *health;
uint32_t dwlen;
- uint8_t cw;
int flags;
- int64_t temp;
- ccb = nvme_ccb_get(sc);
- if (ccb == NULL)
- goto failed;
-
- mem = nvme_dmamem_alloc(sc, sizeof(*health));
+ mem = nvme_dmamem_alloc(sc, length);
if (mem == NULL)
goto failed;
nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
- dwlen = (sizeof(*health) >> 2) - 1;
+ dwlen = (length >> 2) - 1;
memset(&sqe, 0, sizeof(sqe));
sqe.opcode = NVM_ADMIN_GET_LOG_PG;
htolem32(&sqe.nsid, 0xffffffff);
- htolem32(&sqe.cdw10, (dwlen << 16 | NVM_LOG_PAGE_SMART_HEALTH));
+ htolem32(&sqe.cdw10, (dwlen << 16 | page));
htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem));
ccb->ccb_done = nvme_empty_done;
@@ -2189,6 +2200,32 @@ nvme_refresh_sensors(void *arg)
if (flags != 0)
goto failed;
+ return mem;
+ failed:
+ if (mem != NULL)
+ nvme_dmamem_free(sc, mem);
+ return NULL;
+}
+
+void
+nvme_refresh_sensors(void *arg)
+{
+ struct nvme_softc *sc = arg;
+ struct nvme_ccb *ccb;
+ struct nvme_dmamem *mem = NULL;
+ struct nvm_smart_health *health;
+ uint8_t cw;
+ int64_t temp;
+
+ ccb = nvme_ccb_get(sc);
+ if (ccb == NULL)
+ goto failed;
+
+ mem = nvme_read_log_page(sc, ccb, NVM_LOG_PAGE_SMART_HEALTH,
+ sizeof(*health));
+ if (mem == NULL)
+ goto failed;
+
health = NVME_DMA_KVA(mem);
cw = health->critical_warning;
@@ -2215,4 +2252,148 @@ nvme_refresh_sensors(void *arg)
if (ccb != NULL)
nvme_ccb_put(sc, ccb);
}
+
+void
+nvme_aen_read_log(void *xsc)
+{
+ struct nvme_softc *sc = xsc;
+ struct nvme_dmamem *mem = NULL;
+ struct nvme_ccb *ccb;
+
+ ccb = scsi_io_get(&sc->sc_iopool, 0);
+ KASSERT(ccb != NULL);
+
+ mem = nvme_read_log_page(sc, ccb, sc->sc_aen_log_page,
+ sc->sc_aen_log_page_size);
+ if (mem != NULL)
+ nvme_dmamem_free(sc, mem);
+
+ scsi_io_put(&sc->sc_iopool, ccb);
+ scsi_ioh_add(&sc->sc_aer_handler);
+}
+
+
+/* NVMe base spec rev 1.4c section 5.2.1 */
+static const char *error_events[] = {
+ "invalid doorbell register",
+ "invalid doorbell write",
+ "diagnostic failure",
+ "persistent internal error",
+ "transient internal error"
+};
+static const char *smart_events[] = {
+ "SMART/health reliability event",
+ "SMART/health temperature event",
+ "SMART/health spare capacity below threshold"
+};
+
+void
+nvme_aer_done(struct nvme_softc *sc, struct nvme_ccb *ccb)
+{
+	int info, event, restart;
+
+	sc->sc_aer_ccb = NULL;
+
+	/* if the controller doesn't support async events, stop */
+	if (NVME_CQE_SCT(ccb->ccb_cqe_flags) == NVME_CQE_SCT_GENERIC &&
+	    NVME_CQE_SC(ccb->ccb_cqe_flags) == NVME_CQE_SC_INVALID_OPCODE &&
+	    (ccb->ccb_cqe_flags & NVME_CQE_DNR)) {
+		scsi_io_put(&sc->sc_iopool, ccb);
+		return;
+	}
+
+	info = (ccb->ccb_cqe_cdw0 & NVM_ASYNC_EVENT_INFO_MASK) >>
+	    NVM_ASYNC_EVENT_INFO_SHIFT;
+	event = ccb->ccb_cqe_cdw0 & NVM_ASYNC_EVENT_TYPE_MASK;
+
+	restart = 0;
+	switch (event) {
+	case NVM_ASYNC_EVENT_TYPE_ERROR:
+		if (info < nitems(error_events)) {
+			printf("%s: %s\n", DEVNAME(sc), error_events[info]);
+		} else {
+			printf("%s: error event %d\n", DEVNAME(sc), info);
+		}
+		sc->sc_aen_log_page = NVM_LOG_PAGE_ERROR_INFORMATION;
+		sc->sc_aen_log_page_size = sizeof(struct nvm_error_information);
+		task_add(systq, &sc->sc_aen_log_task);
+		break;
+
+	case NVM_ASYNC_EVENT_TYPE_SMART:
+		if (info < nitems(smart_events)) {
+			printf("%s: %s\n", DEVNAME(sc), smart_events[info]);
+		} else {
+			printf("%s: SMART/health event %d\n", DEVNAME(sc),
+			    info);
+		}
+		sc->sc_aen_log_page = NVM_LOG_PAGE_SMART_HEALTH;
+		sc->sc_aen_log_page_size = sizeof(struct nvm_smart_health);
+		task_add(systq, &sc->sc_aen_log_task);
+		break;
+	default:
+		restart = 1;
+		printf("%s: async event type %d (cdw0 %x)\n",
+		    DEVNAME(sc), event, ccb->ccb_cqe_cdw0);
+		break;
+	}
+
+	scsi_io_put(&sc->sc_iopool, ccb);
+	if (restart)
+		scsi_ioh_add(&sc->sc_aer_handler);
+}
+
+void
+nvme_aer_fill(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
+{
+ struct nvme_sqe *sqe = slot;
+ sqe->opcode = NVM_ADMIN_ASYNC_EV_REQ;
+}
+
+void
+nvme_start_aer(void *cookie, void *io)
+{
+ struct nvme_softc *sc = cookie;
+ struct nvme_ccb *ccb = io;
+
+ /* we need to return this ccb on controller reset */
+ sc->sc_aer_ccb = ccb;
+
+ ccb->ccb_done = nvme_aer_done;
+ ccb->ccb_cookie = sc;
+
+ nvme_q_submit(sc, sc->sc_admin_q, ccb, nvme_aer_fill);
+}
+
+void
+nvme_enable_async_events(struct nvme_softc *sc)
+{
+ struct nvme_ccb *ccb;
+ struct nvme_sqe sqe;
+ uint32_t events;
+
+ ccb = scsi_io_get(&sc->sc_iopool, 0);
+ KASSERT(ccb != NULL);
+
+ ccb->ccb_done = nvme_empty_done;
+ ccb->ccb_cookie = &sqe;
+
+ events = NVM_ASYNC_EVENT_ENABLE_SMART;
+
+ memset(&sqe, 0, sizeof(sqe));
+ sqe.opcode = NVM_ADMIN_SET_FEATURES;
+ sqe.cdw10 = htole32(NVM_FEATURE_ASYNC_EVENT_ENABLE);
+ sqe.cdw11 = htole32(events);
+
+ /* ignore failures here */
+ nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill,
+ NVME_TIMO_LOG_PAGE);
+
+ scsi_io_put(&sc->sc_iopool, ccb);
+
+ task_set(&sc->sc_aen_log_task, nvme_aen_read_log, sc);
+ scsi_ioh_set(&sc->sc_aer_handler, &sc->sc_iopool,
+ nvme_start_aer, sc);
+ scsi_ioh_add(&sc->sc_aer_handler);
+}
+
#endif /* SMALL_KERNEL */
nvme: add async event processing