Download raw body.
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'
On 2026/03/01 23:48, Andrew Lemin wrote:
> Hi,
>
> The HFSC queue scheduler stores bandwidth in `struct hfsc_sc` using
> `u_int m1` and `m2` (bits/sec). This limits maximum queue bandwidth
> to UINT_MAX = ~4.29 Gbps, which is no longer adequate for 10G/25G
> interfaces.
>
> The most straightforward fix would be widening `hfsc_sc.m1`/`m2` to
> `u_int64_t`, but that changes `struct hfsc_class_stats` which is
> copyout'd via DIOCGETQSTATS. While OpenBSD rebuilds kernel and
> userland together, the ioctl struct change ripples into pfctl, pftop, and
> any third-party tools reading queue stats. It also felt like using
> a sledgehammer.
>
> Instead, this diff adds `set queue-units bytes` to pf.conf. When
> set, K/M/G suffixes mean KB/s, MB/s, GB/s. The kernel HFSC code
> skips the /8 division when converting to internal scaled values
> (which already operate in bytes). A single unused bit (0x0004) in
> the existing `pf_queuespec.flags` field carries the mode through
> the ioctl. No structs change size or layout.
You need to change those tools using your approach too otherwose they're
8x out in the "bytes" case, and this seems a bit of an awkward way to
do it. OpenBSD doesn't have a problem changing ABI where it makes sense.
I don't know if there are other reason to keep as uint, but depending
on that, it would seem simpler to either change the type, or if not
then change the kernel to always use bytes and do the conversion in
userland.
>
> The effective maximum becomes UINT_MAX bytes/sec = ~34.36 Gbps.
> Default is `bits`, so existing configs are unaffected.
>
> Example:
>
> set queue-units bytes
> queue rootq on em0 bandwidth 1250M max 1250M # 10 Gbps
> queue fast parent rootq bandwidth 500M default # 4 Gbps
>
> Also adds validation that rejects bandwidth values exceeding
> UINT_MAX in both modes. Today, `bandwidth 10G` silently truncates
> to ~1.4 Gbps.
>
> Tested on arm64 and amd64 with the pfctl regression suite (pf, selfpf, pfail,
> pfsimple all pass), and manual queue loading on a live system.
>
> Looking for feedback as this is my first kernel patch :)
> I suspect my regression tests need corrections?
>
> diff inline below.
>
> diff --git regress/sbin/pfctl/Makefile regress/sbin/pfctl/Makefile
> index 5dd2c948..85949377 100644
> --- regress/sbin/pfctl/Makefile
> +++ regress/sbin/pfctl/Makefile
> @@ -18,11 +18,11 @@ PFTESTS=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
> 18 19 20 21 22 23 24 25 26 27
> PFTESTS+=28 29 30 31 32 34 35 36 38 39 40 41 44 46 47 48 49 50
> PFTESTS+=52 53 54 55 56 57 60 61 65 66 67 68 69 70 71 72 73
> PFTESTS+=74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
> -PFTESTS+=97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 114
> +PFTESTS+=97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 114 115
> PFFAIL=1 2 3 4 5 6 7 8 11 12 13 14 15 16 17 19 20 23 25 27
> PFFAIL+=30 37 38 39 40 41 42 43 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
> -PFFAIL+=63 64 65 66 67
> -PFSIMPLE=1 2
> +PFFAIL+=63 64 65 66 67 68
> +PFSIMPLE=1 2 3
> PFSETUP=1 4
> PFLOAD=1 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 23 24 25 26 27 28 29
> PFLOAD+=30 31 32 34 36 38 39 40 44 46 47 48 49 54 56 60 61 65 66 67 68 69 70 71
> diff --git regress/sbin/pfctl/pf115.in regress/sbin/pfctl/pf115.in
> new file mode 100644
> index 00000000..894078e1
> --- /dev/null
> +++ regress/sbin/pfctl/pf115.in
> @@ -0,0 +1,3 @@
> +set queue-units bytes
> +queue rootq on lo1000000 bandwidth 1G
> +queue defq parent rootq bandwidth 500M default
> diff --git regress/sbin/pfctl/pf115.ok regress/sbin/pfctl/pf115.ok
> new file mode 100644
> index 00000000..fa29c7a6
> --- /dev/null
> +++ regress/sbin/pfctl/pf115.ok
> @@ -0,0 +1,2 @@
> +queue rootq on lo1000000 bandwidth 1G
> +queue defq parent rootq bandwidth 500M default
> diff --git regress/sbin/pfctl/pfail68.in regress/sbin/pfctl/pfail68.in
> new file mode 100644
> index 00000000..df5df227
> --- /dev/null
> +++ regress/sbin/pfctl/pfail68.in
> @@ -0,0 +1,3 @@
> +set queue-units bytes
> +queue rootq on lo1000000 bandwidth 5G
> +queue defq parent rootq bandwidth 500M default
> diff --git regress/sbin/pfctl/pfail68.ok regress/sbin/pfctl/pfail68.ok
> new file mode 100644
> index 00000000..6f081086
> --- /dev/null
> +++ regress/sbin/pfctl/pfail68.ok
> @@ -0,0 +1 @@
> +stdin:2: bandwidth value exceeds maximum for bytes mode (~4.29 GB/s =
> ~34.36 Gb/s)
> diff --git regress/sbin/pfctl/pfsimple3.in regress/sbin/pfctl/pfsimple3.in
> new file mode 100644
> index 00000000..c654a254
> --- /dev/null
> +++ regress/sbin/pfctl/pfsimple3.in
> @@ -0,0 +1,3 @@
> +set queue-units bytes
> +queue rootq on lo1000000 bandwidth 4G
> +queue defq parent rootq bandwidth 500M default
> diff --git sbin/pfctl/parse.y sbin/pfctl/parse.y
> index 01d7553c..02cae598 100644
> --- sbin/pfctl/parse.y
> +++ sbin/pfctl/parse.y
> @@ -538,7 +538,7 @@ int parseport(char *, struct range *r, int);
> %token ANTISPOOF FOR INCLUDE MATCHES
> %token BITMASK RANDOM SOURCEHASH ROUNDROBIN LEASTSTATES STATICPORT PROBABILITY
> %token WEIGHT BANDWIDTH FLOWS QUANTUM
> -%token QUEUE PRIORITY QLIMIT RTABLE RDOMAIN MINIMUM BURST PARENT
> +%token QUEUE PRIORITY QLIMIT RTABLE RDOMAIN MINIMUM BURST PARENT QUEUEUNITS
> %token LOAD RULESET_OPTIMIZATION RTABLE RDOMAIN PRIO ONCE DEFAULT DELAY
> %token STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE
> %token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW MAXPKTRATE
> @@ -764,6 +764,18 @@ option : SET REASSEMBLE yesno optnodf {
> YYERROR;
> }
> }
> + | SET QUEUEUNITS STRING {
> + if (!strcmp($3, "bytes"))
> + pf->queue_units_bytes = 1;
> + else if (!strcmp($3, "bits"))
> + pf->queue_units_bytes = 0;
> + else {
> + yyerror("unknown queue-units \"%s\"", $3);
> + free($3);
> + YYERROR;
> + }
> + free($3);
> + }
> ;
>
> syncookie_val : STRING {
> @@ -4829,6 +4841,26 @@ expand_queue(char *qname, struct node_if
> *interfaces, struct queue_opts *opts)
> qspec.flags = opts->flags;
> qspec.qlimit = opts->qlimit;
>
> + /* after opts->flags copy to avoid overwrite */
> + if (pf->queue_units_bytes)
> + qspec.flags |= PFQS_BYTESMODE;
> +
> + if (qspec.linkshare.m1.absolute > UINT_MAX ||
> + qspec.linkshare.m2.absolute > UINT_MAX ||
> + qspec.realtime.m1.absolute > UINT_MAX ||
> + qspec.realtime.m2.absolute > UINT_MAX ||
> + qspec.upperlimit.m1.absolute > UINT_MAX ||
> + qspec.upperlimit.m2.absolute > UINT_MAX) {
> + if (pf->queue_units_bytes)
> + yyerror("bandwidth value exceeds maximum "
> + "for bytes mode (~4.29 GB/s = "
> + "~34.36 Gb/s)");
> + else
> + yyerror("bandwidth value exceeds maximum "
> + "for bits mode (~4.29 Gb/s)");
> + return (1);
> + }
> +
> if (pfctl_add_queue(pf, &qspec)) {
> yyerror("cannot add queue");
> return (1);
> @@ -5567,6 +5599,7 @@ lookup(char *s)
> { "qlimit", QLIMIT},
> { "quantum", QUANTUM},
> { "queue", QUEUE},
> + { "queue-units", QUEUEUNITS},
> { "quick", QUICK},
> { "random", RANDOM},
> { "random-id", RANDOMID},
> diff --git sbin/pfctl/pfctl_parser.h sbin/pfctl/pfctl_parser.h
> index d5a61831..f0a6221c 100644
> --- sbin/pfctl/pfctl_parser.h
> +++ sbin/pfctl/pfctl_parser.h
> @@ -126,6 +126,7 @@ struct pfctl {
> u_int8_t reass_set;
> u_int8_t syncookies_set;
> u_int8_t syncookieswat_set;
> + u_int8_t queue_units_bytes;
> };
>
> struct node_if {
> diff --git sbin/pfctl/pfctl_queue.c sbin/pfctl/pfctl_queue.c
> index 1f0d4563..6fb30d32 100644
> --- sbin/pfctl/pfctl_queue.c
> +++ sbin/pfctl/pfctl_queue.c
> @@ -91,6 +91,14 @@ pfctl_show_queues(int dev, const char *iface, int
> opts, int verbose2)
> pfctl_print_queue_node(dev, node, opts);
> }
>
> + if (nodes > 0) {
> + int bytesmode = 0;
> + node = TAILQ_FIRST(&qnodes);
> + if (node != NULL && (node->qs.flags & PFQS_BYTESMODE))
> + bytesmode = 1;
> + printf("queue-units %s\n", bytesmode ? "bytes" : "bits");
> + }
> +
> while (verbose2 && nodes > 0) {
> printf("\n");
> fflush(stdout);
> diff --git share/man/man5/pf.conf.5 share/man/man5/pf.conf.5
> index 3a383b23..9d24aab4 100644
> --- share/man/man5/pf.conf.5
> +++ share/man/man5/pf.conf.5
> @@ -1477,6 +1477,29 @@ The thresholds for entering and leaving
> syncookie mode can be specified using:
> set syncookies adaptive (start 25%, end 12%)
> .Ed
> .El
> +.It Cm set queue-units Ar bits | bytes
> +Sets the unit for all queue bandwidth values.
> +The default is
> +.Cm bits ,
> +where bandwidth values represent bits per second.
> +When set to
> +.Cm bytes ,
> +all bandwidth values represent bytes per second,
> +effectively multiplying the usable range by 8.
> +This allows queue bandwidth specifications up to approximately 34 Gbps
> +using the existing suffixes
> +.Cm K ,
> +.Cm M ,
> +and
> +.Cm G
> +to represent kilobytes, megabytes, and gigabytes per second.
> +This option must appear before any queue rules.
> +.Pp
> +For example, to configure a 10 Gbps queue:
> +.Bd -literal -offset indent
> +set queue-units bytes
> +queue rootq on em0 bandwidth 1250M max 1250M
> +.Ed
> .It Ic set Cm timeout Ar variable value
> .Bl -tag -width "src.track" -compact
> .It Cm frag
> @@ -1639,12 +1662,15 @@ time can be specified:
> .Pp
> All
> .Cm bandwidth
> -values are specified as bits per second or using the suffixes
> +values are specified in the units set by
> +.Cm set queue-units
> +.Pq default: bits per second .
> +Suffixes
> .Cm K ,
> .Cm M ,
> and
> .Cm G
> -to represent kilobits, megabits, and gigabits per second, respectively.
> +represent kilo, mega, and giga units respectively.
> The value must not exceed the interface bandwidth.
> .Pp
> If multiple connections are assigned the same queue, they're not guaranteed
> diff --git sys/net/hfsc.c sys/net/hfsc.c
> index 37e9b6bf..9e10578c 100644
> --- sys/net/hfsc.c
> +++ sys/net/hfsc.c
> @@ -198,7 +198,7 @@ struct hfsc_if {
> struct hfsc_class *hfsc_class_create(struct hfsc_if *,
> struct hfsc_sc *, struct hfsc_sc *,
> struct hfsc_sc *, struct hfsc_class *, int,
> - int, int);
> + int, int, int);
> int hfsc_class_destroy(struct hfsc_if *,
> struct hfsc_class *);
> struct hfsc_class *hfsc_nextclass(struct hfsc_class *);
> @@ -234,8 +234,11 @@ static __inline u_int64_t m2ism(u_int);
> static __inline u_int64_t d2dx(u_int);
> static __inline u_int sm2m(u_int64_t);
> static __inline u_int dx2d(u_int64_t);
> +static __inline u_int64_t bm2sm(u_int);
> +static __inline u_int64_t bm2ism(u_int);
> +static __inline u_int sm2bm(u_int64_t);
>
> -void hfsc_sc2isc(struct hfsc_sc *, struct hfsc_internal_sc *);
> +void hfsc_sc2isc(struct hfsc_sc *, struct hfsc_internal_sc *, int);
> void hfsc_rtsc_init(struct hfsc_runtime_sc *,
> struct hfsc_internal_sc *, u_int64_t, u_int64_t);
> u_int64_t hfsc_rtsc_y2x(struct hfsc_runtime_sc *, u_int64_t);
> @@ -401,6 +404,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q)
> struct hfsc_class *cl, *parent, *np = NULL;
> struct hfsc_sc rtsc, lssc, ulsc;
> int error = 0;
> + int bytesmode = (q->flags & PFQS_BYTESMODE) ? 1 : 0;
>
> KASSERT(hif != NULL);
> KASSERT(q->qid != 0);
> @@ -412,7 +416,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q)
>
> if (q->parent_qid == 0 && hif->hif_rootclass == NULL) {
> np = hfsc_class_create(hif, NULL, NULL, NULL, NULL,
> - 0, 0, HFSC_ROOT_CLASS | q->qid);
> + 0, 0, HFSC_ROOT_CLASS | q->qid, bytesmode);
> if (np == NULL)
> return (EINVAL);
> parent = np;
> @@ -435,7 +439,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q)
> ulsc.m2 = q->upperlimit.m2.absolute;
>
> if ((cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc,
> - parent, q->qlimit, q->flags, q->qid)) == NULL) {
> + parent, q->qlimit, q->flags, q->qid, bytesmode)) == NULL) {
> hfsc_class_destroy(hif, np);
> return (ENOMEM);
> }
> @@ -614,7 +618,7 @@ hfsc_purge(struct ifqueue *ifq, struct mbuf_list *ml)
> struct hfsc_class *
> hfsc_class_create(struct hfsc_if *hif, struct hfsc_sc *rsc,
> struct hfsc_sc *fsc, struct hfsc_sc *usc, struct hfsc_class *parent,
> - int qlimit, int flags, int qid)
> + int qlimit, int flags, int qid, int bytesmode)
> {
> struct hfsc_class *cl, *p;
> int i, s;
> @@ -639,18 +643,18 @@ hfsc_class_create(struct hfsc_if *hif, struct
> hfsc_sc *rsc,
>
> if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) {
> cl->cl_rsc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK);
> - hfsc_sc2isc(rsc, cl->cl_rsc);
> + hfsc_sc2isc(rsc, cl->cl_rsc, bytesmode);
> hfsc_rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0);
> hfsc_rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0);
> }
> if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) {
> cl->cl_fsc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK);
> - hfsc_sc2isc(fsc, cl->cl_fsc);
> + hfsc_sc2isc(fsc, cl->cl_fsc, bytesmode);
> hfsc_rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0);
> }
> if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) {
> cl->cl_usc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK);
> - hfsc_sc2isc(usc, cl->cl_usc);
> + hfsc_sc2isc(usc, cl->cl_usc, bytesmode);
> hfsc_rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0);
> }
>
> @@ -1498,15 +1502,58 @@ dx2d(u_int64_t dx)
> return ((u_int)d);
> }
>
> +/* bytes/sec to scaled slope (PFQS_BYTESMODE: skip /8) */
> +static __inline u_int64_t
> +bm2sm(u_int m)
> +{
> + u_int64_t sm;
> +
> + sm = ((u_int64_t)m << SM_SHIFT) / HFSC_FREQ;
> + return (sm);
> +}
> +
> +/* bytes/sec to inverse scaled slope (PFQS_BYTESMODE: skip *8) */
> +static __inline u_int64_t
> +bm2ism(u_int m)
> +{
> + u_int64_t ism;
> +
> + if (m == 0)
> + ism = HFSC_HT_INFINITY;
> + else
> + ism = ((u_int64_t)HFSC_FREQ << ISM_SHIFT) / m;
> + return (ism);
> +}
> +
> +/* scaled slope to bytes/sec (inverse of bm2sm) */
> +static __inline u_int
> +sm2bm(u_int64_t sm)
> +{
> + u_int64_t b;
> +
> + b = (sm * HFSC_FREQ) >> SM_SHIFT;
> + return ((u_int)b);
> +}
> +
> void
> -hfsc_sc2isc(struct hfsc_sc *sc, struct hfsc_internal_sc *isc)
> +hfsc_sc2isc(struct hfsc_sc *sc, struct hfsc_internal_sc *isc, int bytesmode)
> {
> - isc->sm1 = m2sm(sc->m1);
> - isc->ism1 = m2ism(sc->m1);
> + if (bytesmode) {
> + isc->sm1 = bm2sm(sc->m1);
> + isc->ism1 = bm2ism(sc->m1);
> + } else {
> + isc->sm1 = m2sm(sc->m1);
> + isc->ism1 = m2ism(sc->m1);
> + }
> isc->dx = d2dx(sc->d);
> isc->dy = seg_x2y(isc->dx, isc->sm1);
> - isc->sm2 = m2sm(sc->m2);
> - isc->ism2 = m2ism(sc->m2);
> + if (bytesmode) {
> + isc->sm2 = bm2sm(sc->m2);
> + isc->ism2 = bm2ism(sc->m2);
> + } else {
> + isc->sm2 = m2sm(sc->m2);
> + isc->ism2 = m2ism(sc->m2);
> + }
> }
>
> /*
> @@ -1641,27 +1688,42 @@ hfsc_getclstats(struct hfsc_class_stats *sp,
> struct hfsc_class *cl)
> sp->class_handle = cl->cl_handle;
>
> if (cl->cl_rsc != NULL) {
> - sp->rsc.m1 = sm2m(cl->cl_rsc->sm1);
> + if (cl->cl_flags & PFQS_BYTESMODE) {
> + sp->rsc.m1 = sm2bm(cl->cl_rsc->sm1);
> + sp->rsc.m2 = sm2bm(cl->cl_rsc->sm2);
> + } else {
> + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1);
> + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2);
> + }
> sp->rsc.d = dx2d(cl->cl_rsc->dx);
> - sp->rsc.m2 = sm2m(cl->cl_rsc->sm2);
> } else {
> sp->rsc.m1 = 0;
> sp->rsc.d = 0;
> sp->rsc.m2 = 0;
> }
> if (cl->cl_fsc != NULL) {
> - sp->fsc.m1 = sm2m(cl->cl_fsc->sm1);
> + if (cl->cl_flags & PFQS_BYTESMODE) {
> + sp->fsc.m1 = sm2bm(cl->cl_fsc->sm1);
> + sp->fsc.m2 = sm2bm(cl->cl_fsc->sm2);
> + } else {
> + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1);
> + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2);
> + }
> sp->fsc.d = dx2d(cl->cl_fsc->dx);
> - sp->fsc.m2 = sm2m(cl->cl_fsc->sm2);
> } else {
> sp->fsc.m1 = 0;
> sp->fsc.d = 0;
> sp->fsc.m2 = 0;
> }
> if (cl->cl_usc != NULL) {
> - sp->usc.m1 = sm2m(cl->cl_usc->sm1);
> + if (cl->cl_flags & PFQS_BYTESMODE) {
> + sp->usc.m1 = sm2bm(cl->cl_usc->sm1);
> + sp->usc.m2 = sm2bm(cl->cl_usc->sm2);
> + } else {
> + sp->usc.m1 = sm2m(cl->cl_usc->sm1);
> + sp->usc.m2 = sm2m(cl->cl_usc->sm2);
> + }
> sp->usc.d = dx2d(cl->cl_usc->dx);
> - sp->usc.m2 = sm2m(cl->cl_usc->sm2);
> } else {
> sp->usc.m1 = 0;
> sp->usc.d = 0;
> diff --git sys/net/pfvar.h sys/net/pfvar.h
> index 750fe0ef..61110c84 100644
> --- sys/net/pfvar.h
> +++ sys/net/pfvar.h
> @@ -1291,12 +1291,14 @@ struct pf_queuespec {
>
> #define PFQS_FLOWQUEUE 0x0001
> #define PFQS_ROOTCLASS 0x0002
> +#define PFQS_BYTESMODE 0x0004 /* bandwidth values are bytes/sec */
> #define PFQS_DEFAULT 0x1000 /* maps to HFSC_DEFAULTCLASS */
>
> struct priq_opts {
> int flags;
> };
>
> +/* XXX hfsc_opts is unused */
> struct hfsc_opts {
> /* real-time service curve */
> u_int rtsc_m1; /* slope of the 1st segment in bps */
> diff --git usr.bin/systat/pftop.c usr.bin/systat/pftop.c
> index 8668bd28..e492a811 100644
> --- usr.bin/systat/pftop.c
> +++ usr.bin/systat/pftop.c
> @@ -1640,8 +1640,12 @@ print_queue_node(struct pfctl_queue_node *node)
> * spot as the 'period' in hfsc_class_stats.
> */
> tbprintf("%u", node->qstats.data.period);
> - } else
> - tbprintf("%u%c", rate, unit[i]);
> + } else {
> + if (node->qs.flags & PFQS_BYTESMODE)
> + tbprintf("%u%cB", rate, unit[i]);
> + else
> + tbprintf("%u%c", rate, unit[i]);
> + }
> print_fld_tb(FLD_BANDW);
>
> print_fld_str(FLD_SCHED, node->qs.flags & PFQS_FLOWQUEUE ?
>
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'