From: Andrew Lemin Subject: PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes' To: tech@openbsd.org Date: Sun, 1 Mar 2026 23:48:05 +1100 Hi, The HFSC queue scheduler stores bandwidth in `struct hfsc_sc` using `u_int m1` and `m2` (bits/sec). This limits maximum queue bandwidth to UINT_MAX = ~4.29 Gbps, which is no longer adequate for 10G/25G interfaces. The most straightforward fix would be widening `hfsc_sc.m1`/`m2` to `u_int64_t`, but that changes `struct hfsc_class_stats` which is copyout'd via DIOCGETQSTATS. While OpenBSD rebuilds kernel and userland together, the ioctl struct change ripples into pfctl, pftop, and any third-party tools reading queue stats. It also felt like using a sledgehammer. Instead, this diff adds `set queue-units bytes` to pf.conf. When set, K/M/G suffixes mean KB/s, MB/s, GB/s. The kernel HFSC code skips the /8 division when converting to internal scaled values (which already operate in bytes). A single unused bit (0x0004) in the existing `pf_queuespec.flags` field carries the mode through the ioctl. No structs change size or layout. The effective maximum becomes UINT_MAX bytes/sec = ~34.36 Gbps. Default is `bits`, so existing configs are unaffected. Example: set queue-units bytes queue rootq on em0 bandwidth 1250M max 1250M # 10 Gbps queue fast parent rootq bandwidth 500M default # 4 Gbps Also adds validation that rejects bandwidth values exceeding UINT_MAX in both modes. Today, `bandwidth 10G` silently truncates to ~1.4 Gbps. Tested on arm64 and amd64 with the pfctl regression suite (pf, selfpf, pfail, pfsimple all pass), and manual queue loading on a live system. Looking for feedback as this is my first kernel patch :) I suspect my regression tests need corrections? diff inline below. diff --git regress/sbin/pfctl/Makefile regress/sbin/pfctl/Makefile index 5dd2c948..85949377 100644 --- regress/sbin/pfctl/Makefile +++ regress/sbin/pfctl/Makefile @@ -18,11 +18,11 @@ PFTESTS=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 PFTESTS+=28 29 30 31 32 34 35 36 38 39 40 41 44 46 47 48 49 50 PFTESTS+=52 53 54 55 56 57 60 61 65 66 67 68 69 70 71 72 73 PFTESTS+=74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 -PFTESTS+=97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 114 +PFTESTS+=97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 114 115 PFFAIL=1 2 3 4 5 6 7 8 11 12 13 14 15 16 17 19 20 23 25 27 PFFAIL+=30 37 38 39 40 41 42 43 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 -PFFAIL+=63 64 65 66 67 -PFSIMPLE=1 2 +PFFAIL+=63 64 65 66 67 68 +PFSIMPLE=1 2 3 PFSETUP=1 4 PFLOAD=1 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 23 24 25 26 27 28 29 PFLOAD+=30 31 32 34 36 38 39 40 44 46 47 48 49 54 56 60 61 65 66 67 68 69 70 71 diff --git regress/sbin/pfctl/pf115.in regress/sbin/pfctl/pf115.in new file mode 100644 index 00000000..894078e1 --- /dev/null +++ regress/sbin/pfctl/pf115.in @@ -0,0 +1,3 @@ +set queue-units bytes +queue rootq on lo1000000 bandwidth 1G +queue defq parent rootq bandwidth 500M default diff --git regress/sbin/pfctl/pf115.ok regress/sbin/pfctl/pf115.ok new file mode 100644 index 00000000..fa29c7a6 --- /dev/null +++ regress/sbin/pfctl/pf115.ok @@ -0,0 +1,2 @@ +queue rootq on lo1000000 bandwidth 1G +queue defq parent rootq bandwidth 500M default diff --git regress/sbin/pfctl/pfail68.in regress/sbin/pfctl/pfail68.in new file mode 100644 index 00000000..df5df227 --- /dev/null +++ regress/sbin/pfctl/pfail68.in @@ -0,0 +1,3 @@ +set queue-units bytes +queue rootq on lo1000000 bandwidth 5G +queue defq parent rootq bandwidth 500M default diff --git regress/sbin/pfctl/pfail68.ok regress/sbin/pfctl/pfail68.ok new file mode 100644 index 00000000..6f081086 --- /dev/null +++ regress/sbin/pfctl/pfail68.ok @@ -0,0 +1 @@ +stdin:2: bandwidth value exceeds maximum for bytes mode (~4.29 GB/s = ~34.36 Gb/s) diff --git regress/sbin/pfctl/pfsimple3.in regress/sbin/pfctl/pfsimple3.in new file mode 100644 index 00000000..c654a254 --- /dev/null +++ regress/sbin/pfctl/pfsimple3.in @@ -0,0 +1,3 @@ +set queue-units bytes +queue rootq on lo1000000 bandwidth 4G +queue defq parent rootq bandwidth 500M default diff --git sbin/pfctl/parse.y sbin/pfctl/parse.y index 01d7553c..02cae598 100644 --- sbin/pfctl/parse.y +++ sbin/pfctl/parse.y @@ -538,7 +538,7 @@ int parseport(char *, struct range *r, int); %token ANTISPOOF FOR INCLUDE MATCHES %token BITMASK RANDOM SOURCEHASH ROUNDROBIN LEASTSTATES STATICPORT PROBABILITY %token WEIGHT BANDWIDTH FLOWS QUANTUM -%token QUEUE PRIORITY QLIMIT RTABLE RDOMAIN MINIMUM BURST PARENT +%token QUEUE PRIORITY QLIMIT RTABLE RDOMAIN MINIMUM BURST PARENT QUEUEUNITS %token LOAD RULESET_OPTIMIZATION RTABLE RDOMAIN PRIO ONCE DEFAULT DELAY %token STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE %token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW MAXPKTRATE @@ -764,6 +764,18 @@ option : SET REASSEMBLE yesno optnodf { YYERROR; } } + | SET QUEUEUNITS STRING { + if (!strcmp($3, "bytes")) + pf->queue_units_bytes = 1; + else if (!strcmp($3, "bits")) + pf->queue_units_bytes = 0; + else { + yyerror("unknown queue-units \"%s\"", $3); + free($3); + YYERROR; + } + free($3); + } ; syncookie_val : STRING { @@ -4829,6 +4841,26 @@ expand_queue(char *qname, struct node_if *interfaces, struct queue_opts *opts) qspec.flags = opts->flags; qspec.qlimit = opts->qlimit; + /* after opts->flags copy to avoid overwrite */ + if (pf->queue_units_bytes) + qspec.flags |= PFQS_BYTESMODE; + + if (qspec.linkshare.m1.absolute > UINT_MAX || + qspec.linkshare.m2.absolute > UINT_MAX || + qspec.realtime.m1.absolute > UINT_MAX || + qspec.realtime.m2.absolute > UINT_MAX || + qspec.upperlimit.m1.absolute > UINT_MAX || + qspec.upperlimit.m2.absolute > UINT_MAX) { + if (pf->queue_units_bytes) + yyerror("bandwidth value exceeds maximum " + "for bytes mode (~4.29 GB/s = " + "~34.36 Gb/s)"); + else + yyerror("bandwidth value exceeds maximum " + "for bits mode (~4.29 Gb/s)"); + return (1); + } + if (pfctl_add_queue(pf, &qspec)) { yyerror("cannot add queue"); return (1); @@ -5567,6 +5599,7 @@ lookup(char *s) { "qlimit", QLIMIT}, { "quantum", QUANTUM}, { "queue", QUEUE}, + { "queue-units", QUEUEUNITS}, { "quick", QUICK}, { "random", RANDOM}, { "random-id", RANDOMID}, diff --git sbin/pfctl/pfctl_parser.h sbin/pfctl/pfctl_parser.h index d5a61831..f0a6221c 100644 --- sbin/pfctl/pfctl_parser.h +++ sbin/pfctl/pfctl_parser.h @@ -126,6 +126,7 @@ struct pfctl { u_int8_t reass_set; u_int8_t syncookies_set; u_int8_t syncookieswat_set; + u_int8_t queue_units_bytes; }; struct node_if { diff --git sbin/pfctl/pfctl_queue.c sbin/pfctl/pfctl_queue.c index 1f0d4563..6fb30d32 100644 --- sbin/pfctl/pfctl_queue.c +++ sbin/pfctl/pfctl_queue.c @@ -91,6 +91,14 @@ pfctl_show_queues(int dev, const char *iface, int opts, int verbose2) pfctl_print_queue_node(dev, node, opts); } + if (nodes > 0) { + int bytesmode = 0; + node = TAILQ_FIRST(&qnodes); + if (node != NULL && (node->qs.flags & PFQS_BYTESMODE)) + bytesmode = 1; + printf("queue-units %s\n", bytesmode ? "bytes" : "bits"); + } + while (verbose2 && nodes > 0) { printf("\n"); fflush(stdout); diff --git share/man/man5/pf.conf.5 share/man/man5/pf.conf.5 index 3a383b23..9d24aab4 100644 --- share/man/man5/pf.conf.5 +++ share/man/man5/pf.conf.5 @@ -1477,6 +1477,29 @@ The thresholds for entering and leaving syncookie mode can be specified using: set syncookies adaptive (start 25%, end 12%) .Ed .El +.It Cm set queue-units Ar bits | bytes +Sets the unit for all queue bandwidth values. +The default is +.Cm bits , +where bandwidth values represent bits per second. +When set to +.Cm bytes , +all bandwidth values represent bytes per second, +effectively multiplying the usable range by 8. +This allows queue bandwidth specifications up to approximately 34 Gbps +using the existing suffixes +.Cm K , +.Cm M , +and +.Cm G +to represent kilobytes, megabytes, and gigabytes per second. +This option must appear before any queue rules. +.Pp +For example, to configure a 10 Gbps queue: +.Bd -literal -offset indent +set queue-units bytes +queue rootq on em0 bandwidth 1250M max 1250M +.Ed .It Ic set Cm timeout Ar variable value .Bl -tag -width "src.track" -compact .It Cm frag @@ -1639,12 +1662,15 @@ time can be specified: .Pp All .Cm bandwidth -values are specified as bits per second or using the suffixes +values are specified in the units set by +.Cm set queue-units +.Pq default: bits per second . +Suffixes .Cm K , .Cm M , and .Cm G -to represent kilobits, megabits, and gigabits per second, respectively. +represent kilo, mega, and giga units respectively. The value must not exceed the interface bandwidth. .Pp If multiple connections are assigned the same queue, they're not guaranteed diff --git sys/net/hfsc.c sys/net/hfsc.c index 37e9b6bf..9e10578c 100644 --- sys/net/hfsc.c +++ sys/net/hfsc.c @@ -198,7 +198,7 @@ struct hfsc_if { struct hfsc_class *hfsc_class_create(struct hfsc_if *, struct hfsc_sc *, struct hfsc_sc *, struct hfsc_sc *, struct hfsc_class *, int, - int, int); + int, int, int); int hfsc_class_destroy(struct hfsc_if *, struct hfsc_class *); struct hfsc_class *hfsc_nextclass(struct hfsc_class *); @@ -234,8 +234,11 @@ static __inline u_int64_t m2ism(u_int); static __inline u_int64_t d2dx(u_int); static __inline u_int sm2m(u_int64_t); static __inline u_int dx2d(u_int64_t); +static __inline u_int64_t bm2sm(u_int); +static __inline u_int64_t bm2ism(u_int); +static __inline u_int sm2bm(u_int64_t); -void hfsc_sc2isc(struct hfsc_sc *, struct hfsc_internal_sc *); +void hfsc_sc2isc(struct hfsc_sc *, struct hfsc_internal_sc *, int); void hfsc_rtsc_init(struct hfsc_runtime_sc *, struct hfsc_internal_sc *, u_int64_t, u_int64_t); u_int64_t hfsc_rtsc_y2x(struct hfsc_runtime_sc *, u_int64_t); @@ -401,6 +404,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q) struct hfsc_class *cl, *parent, *np = NULL; struct hfsc_sc rtsc, lssc, ulsc; int error = 0; + int bytesmode = (q->flags & PFQS_BYTESMODE) ? 1 : 0; KASSERT(hif != NULL); KASSERT(q->qid != 0); @@ -412,7 +416,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q) if (q->parent_qid == 0 && hif->hif_rootclass == NULL) { np = hfsc_class_create(hif, NULL, NULL, NULL, NULL, - 0, 0, HFSC_ROOT_CLASS | q->qid); + 0, 0, HFSC_ROOT_CLASS | q->qid, bytesmode); if (np == NULL) return (EINVAL); parent = np; @@ -435,7 +439,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q) ulsc.m2 = q->upperlimit.m2.absolute; if ((cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc, - parent, q->qlimit, q->flags, q->qid)) == NULL) { + parent, q->qlimit, q->flags, q->qid, bytesmode)) == NULL) { hfsc_class_destroy(hif, np); return (ENOMEM); } @@ -614,7 +618,7 @@ hfsc_purge(struct ifqueue *ifq, struct mbuf_list *ml) struct hfsc_class * hfsc_class_create(struct hfsc_if *hif, struct hfsc_sc *rsc, struct hfsc_sc *fsc, struct hfsc_sc *usc, struct hfsc_class *parent, - int qlimit, int flags, int qid) + int qlimit, int flags, int qid, int bytesmode) { struct hfsc_class *cl, *p; int i, s; @@ -639,18 +643,18 @@ hfsc_class_create(struct hfsc_if *hif, struct hfsc_sc *rsc, if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) { cl->cl_rsc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK); - hfsc_sc2isc(rsc, cl->cl_rsc); + hfsc_sc2isc(rsc, cl->cl_rsc, bytesmode); hfsc_rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); hfsc_rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); } if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) { cl->cl_fsc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK); - hfsc_sc2isc(fsc, cl->cl_fsc); + hfsc_sc2isc(fsc, cl->cl_fsc, bytesmode); hfsc_rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); } if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) { cl->cl_usc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK); - hfsc_sc2isc(usc, cl->cl_usc); + hfsc_sc2isc(usc, cl->cl_usc, bytesmode); hfsc_rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0); } @@ -1498,15 +1502,58 @@ dx2d(u_int64_t dx) return ((u_int)d); } +/* bytes/sec to scaled slope (PFQS_BYTESMODE: skip /8) */ +static __inline u_int64_t +bm2sm(u_int m) +{ + u_int64_t sm; + + sm = ((u_int64_t)m << SM_SHIFT) / HFSC_FREQ; + return (sm); +} + +/* bytes/sec to inverse scaled slope (PFQS_BYTESMODE: skip *8) */ +static __inline u_int64_t +bm2ism(u_int m) +{ + u_int64_t ism; + + if (m == 0) + ism = HFSC_HT_INFINITY; + else + ism = ((u_int64_t)HFSC_FREQ << ISM_SHIFT) / m; + return (ism); +} + +/* scaled slope to bytes/sec (inverse of bm2sm) */ +static __inline u_int +sm2bm(u_int64_t sm) +{ + u_int64_t b; + + b = (sm * HFSC_FREQ) >> SM_SHIFT; + return ((u_int)b); +} + void -hfsc_sc2isc(struct hfsc_sc *sc, struct hfsc_internal_sc *isc) +hfsc_sc2isc(struct hfsc_sc *sc, struct hfsc_internal_sc *isc, int bytesmode) { - isc->sm1 = m2sm(sc->m1); - isc->ism1 = m2ism(sc->m1); + if (bytesmode) { + isc->sm1 = bm2sm(sc->m1); + isc->ism1 = bm2ism(sc->m1); + } else { + isc->sm1 = m2sm(sc->m1); + isc->ism1 = m2ism(sc->m1); + } isc->dx = d2dx(sc->d); isc->dy = seg_x2y(isc->dx, isc->sm1); - isc->sm2 = m2sm(sc->m2); - isc->ism2 = m2ism(sc->m2); + if (bytesmode) { + isc->sm2 = bm2sm(sc->m2); + isc->ism2 = bm2ism(sc->m2); + } else { + isc->sm2 = m2sm(sc->m2); + isc->ism2 = m2ism(sc->m2); + } } /* @@ -1641,27 +1688,42 @@ hfsc_getclstats(struct hfsc_class_stats *sp, struct hfsc_class *cl) sp->class_handle = cl->cl_handle; if (cl->cl_rsc != NULL) { - sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + if (cl->cl_flags & PFQS_BYTESMODE) { + sp->rsc.m1 = sm2bm(cl->cl_rsc->sm1); + sp->rsc.m2 = sm2bm(cl->cl_rsc->sm2); + } else { + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); + } sp->rsc.d = dx2d(cl->cl_rsc->dx); - sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); } else { sp->rsc.m1 = 0; sp->rsc.d = 0; sp->rsc.m2 = 0; } if (cl->cl_fsc != NULL) { - sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + if (cl->cl_flags & PFQS_BYTESMODE) { + sp->fsc.m1 = sm2bm(cl->cl_fsc->sm1); + sp->fsc.m2 = sm2bm(cl->cl_fsc->sm2); + } else { + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); + } sp->fsc.d = dx2d(cl->cl_fsc->dx); - sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); } else { sp->fsc.m1 = 0; sp->fsc.d = 0; sp->fsc.m2 = 0; } if (cl->cl_usc != NULL) { - sp->usc.m1 = sm2m(cl->cl_usc->sm1); + if (cl->cl_flags & PFQS_BYTESMODE) { + sp->usc.m1 = sm2bm(cl->cl_usc->sm1); + sp->usc.m2 = sm2bm(cl->cl_usc->sm2); + } else { + sp->usc.m1 = sm2m(cl->cl_usc->sm1); + sp->usc.m2 = sm2m(cl->cl_usc->sm2); + } sp->usc.d = dx2d(cl->cl_usc->dx); - sp->usc.m2 = sm2m(cl->cl_usc->sm2); } else { sp->usc.m1 = 0; sp->usc.d = 0; diff --git sys/net/pfvar.h sys/net/pfvar.h index 750fe0ef..61110c84 100644 --- sys/net/pfvar.h +++ sys/net/pfvar.h @@ -1291,12 +1291,14 @@ struct pf_queuespec { #define PFQS_FLOWQUEUE 0x0001 #define PFQS_ROOTCLASS 0x0002 +#define PFQS_BYTESMODE 0x0004 /* bandwidth values are bytes/sec */ #define PFQS_DEFAULT 0x1000 /* maps to HFSC_DEFAULTCLASS */ struct priq_opts { int flags; }; +/* XXX hfsc_opts is unused */ struct hfsc_opts { /* real-time service curve */ u_int rtsc_m1; /* slope of the 1st segment in bps */ diff --git usr.bin/systat/pftop.c usr.bin/systat/pftop.c index 8668bd28..e492a811 100644 --- usr.bin/systat/pftop.c +++ usr.bin/systat/pftop.c @@ -1640,8 +1640,12 @@ print_queue_node(struct pfctl_queue_node *node) * spot as the 'period' in hfsc_class_stats. */ tbprintf("%u", node->qstats.data.period); - } else - tbprintf("%u%c", rate, unit[i]); + } else { + if (node->qs.flags & PFQS_BYTESMODE) + tbprintf("%u%cB", rate, unit[i]); + else + tbprintf("%u%c", rate, unit[i]); + } print_fld_tb(FLD_BANDW); print_fld_str(FLD_SCHED, node->qs.flags & PFQS_FLOWQUEUE ?