Download raw body.
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'
Hi,
The HFSC queue scheduler stores bandwidth in `struct hfsc_sc` using
`u_int m1` and `m2` (bits/sec). This limits maximum queue bandwidth
to UINT_MAX = ~4.29 Gbps, which is no longer adequate for 10G/25G
interfaces.
The most straightforward fix would be widening `hfsc_sc.m1`/`m2` to
`u_int64_t`, but that changes `struct hfsc_class_stats` which is
copyout'd via DIOCGETQSTATS. While OpenBSD rebuilds kernel and
userland together, the ioctl struct change ripples into pfctl, pftop, and
any third-party tools reading queue stats. It also felt like using
a sledgehammer.
Instead, this diff adds `set queue-units bytes` to pf.conf. When
set, K/M/G suffixes mean KB/s, MB/s, GB/s. The kernel HFSC code
skips the /8 division when converting to internal scaled values
(which already operate in bytes). A single unused bit (0x0004) in
the existing `pf_queuespec.flags` field carries the mode through
the ioctl. No structs change size or layout.
The effective maximum becomes UINT_MAX bytes/sec = ~34.36 Gbps.
Default is `bits`, so existing configs are unaffected.
Example:
set queue-units bytes
queue rootq on em0 bandwidth 1250M max 1250M # 10 Gbps
queue fast parent rootq bandwidth 500M default # 4 Gbps
Also adds validation that rejects bandwidth values exceeding
UINT_MAX in both modes. Today, `bandwidth 10G` silently truncates
to ~1.4 Gbps.
Tested on arm64 and amd64 with the pfctl regression suite (pf, selfpf, pfail,
pfsimple all pass), and manual queue loading on a live system.
Looking for feedback as this is my first kernel patch :)
I suspect my regression tests need corrections?
diff inline below.
diff --git regress/sbin/pfctl/Makefile regress/sbin/pfctl/Makefile
index 5dd2c948..85949377 100644
--- regress/sbin/pfctl/Makefile
+++ regress/sbin/pfctl/Makefile
@@ -18,11 +18,11 @@ PFTESTS=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
18 19 20 21 22 23 24 25 26 27
PFTESTS+=28 29 30 31 32 34 35 36 38 39 40 41 44 46 47 48 49 50
PFTESTS+=52 53 54 55 56 57 60 61 65 66 67 68 69 70 71 72 73
PFTESTS+=74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
-PFTESTS+=97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 114
+PFTESTS+=97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 114 115
PFFAIL=1 2 3 4 5 6 7 8 11 12 13 14 15 16 17 19 20 23 25 27
PFFAIL+=30 37 38 39 40 41 42 43 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
-PFFAIL+=63 64 65 66 67
-PFSIMPLE=1 2
+PFFAIL+=63 64 65 66 67 68
+PFSIMPLE=1 2 3
PFSETUP=1 4
PFLOAD=1 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 23 24 25 26 27 28 29
PFLOAD+=30 31 32 34 36 38 39 40 44 46 47 48 49 54 56 60 61 65 66 67 68 69 70 71
diff --git regress/sbin/pfctl/pf115.in regress/sbin/pfctl/pf115.in
new file mode 100644
index 00000000..894078e1
--- /dev/null
+++ regress/sbin/pfctl/pf115.in
@@ -0,0 +1,3 @@
+set queue-units bytes
+queue rootq on lo1000000 bandwidth 1G
+queue defq parent rootq bandwidth 500M default
diff --git regress/sbin/pfctl/pf115.ok regress/sbin/pfctl/pf115.ok
new file mode 100644
index 00000000..fa29c7a6
--- /dev/null
+++ regress/sbin/pfctl/pf115.ok
@@ -0,0 +1,2 @@
+queue rootq on lo1000000 bandwidth 1G
+queue defq parent rootq bandwidth 500M default
diff --git regress/sbin/pfctl/pfail68.in regress/sbin/pfctl/pfail68.in
new file mode 100644
index 00000000..df5df227
--- /dev/null
+++ regress/sbin/pfctl/pfail68.in
@@ -0,0 +1,3 @@
+set queue-units bytes
+queue rootq on lo1000000 bandwidth 5G
+queue defq parent rootq bandwidth 500M default
diff --git regress/sbin/pfctl/pfail68.ok regress/sbin/pfctl/pfail68.ok
new file mode 100644
index 00000000..6f081086
--- /dev/null
+++ regress/sbin/pfctl/pfail68.ok
@@ -0,0 +1 @@
+stdin:2: bandwidth value exceeds maximum for bytes mode (~4.29 GB/s =
~34.36 Gb/s)
diff --git regress/sbin/pfctl/pfsimple3.in regress/sbin/pfctl/pfsimple3.in
new file mode 100644
index 00000000..c654a254
--- /dev/null
+++ regress/sbin/pfctl/pfsimple3.in
@@ -0,0 +1,3 @@
+set queue-units bytes
+queue rootq on lo1000000 bandwidth 4G
+queue defq parent rootq bandwidth 500M default
diff --git sbin/pfctl/parse.y sbin/pfctl/parse.y
index 01d7553c..02cae598 100644
--- sbin/pfctl/parse.y
+++ sbin/pfctl/parse.y
@@ -538,7 +538,7 @@ int parseport(char *, struct range *r, int);
%token ANTISPOOF FOR INCLUDE MATCHES
%token BITMASK RANDOM SOURCEHASH ROUNDROBIN LEASTSTATES STATICPORT PROBABILITY
%token WEIGHT BANDWIDTH FLOWS QUANTUM
-%token QUEUE PRIORITY QLIMIT RTABLE RDOMAIN MINIMUM BURST PARENT
+%token QUEUE PRIORITY QLIMIT RTABLE RDOMAIN MINIMUM BURST PARENT QUEUEUNITS
%token LOAD RULESET_OPTIMIZATION RTABLE RDOMAIN PRIO ONCE DEFAULT DELAY
%token STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE
%token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW MAXPKTRATE
@@ -764,6 +764,18 @@ option : SET REASSEMBLE yesno optnodf {
YYERROR;
}
}
+ | SET QUEUEUNITS STRING {
+ if (!strcmp($3, "bytes"))
+ pf->queue_units_bytes = 1;
+ else if (!strcmp($3, "bits"))
+ pf->queue_units_bytes = 0;
+ else {
+ yyerror("unknown queue-units \"%s\"", $3);
+ free($3);
+ YYERROR;
+ }
+ free($3);
+ }
;
syncookie_val : STRING {
@@ -4829,6 +4841,26 @@ expand_queue(char *qname, struct node_if
*interfaces, struct queue_opts *opts)
qspec.flags = opts->flags;
qspec.qlimit = opts->qlimit;
+ /* after opts->flags copy to avoid overwrite */
+ if (pf->queue_units_bytes)
+ qspec.flags |= PFQS_BYTESMODE;
+
+ if (qspec.linkshare.m1.absolute > UINT_MAX ||
+ qspec.linkshare.m2.absolute > UINT_MAX ||
+ qspec.realtime.m1.absolute > UINT_MAX ||
+ qspec.realtime.m2.absolute > UINT_MAX ||
+ qspec.upperlimit.m1.absolute > UINT_MAX ||
+ qspec.upperlimit.m2.absolute > UINT_MAX) {
+ if (pf->queue_units_bytes)
+ yyerror("bandwidth value exceeds maximum "
+ "for bytes mode (~4.29 GB/s = "
+ "~34.36 Gb/s)");
+ else
+ yyerror("bandwidth value exceeds maximum "
+ "for bits mode (~4.29 Gb/s)");
+ return (1);
+ }
+
if (pfctl_add_queue(pf, &qspec)) {
yyerror("cannot add queue");
return (1);
@@ -5567,6 +5599,7 @@ lookup(char *s)
{ "qlimit", QLIMIT},
{ "quantum", QUANTUM},
{ "queue", QUEUE},
+ { "queue-units", QUEUEUNITS},
{ "quick", QUICK},
{ "random", RANDOM},
{ "random-id", RANDOMID},
diff --git sbin/pfctl/pfctl_parser.h sbin/pfctl/pfctl_parser.h
index d5a61831..f0a6221c 100644
--- sbin/pfctl/pfctl_parser.h
+++ sbin/pfctl/pfctl_parser.h
@@ -126,6 +126,7 @@ struct pfctl {
u_int8_t reass_set;
u_int8_t syncookies_set;
u_int8_t syncookieswat_set;
+ u_int8_t queue_units_bytes;
};
struct node_if {
diff --git sbin/pfctl/pfctl_queue.c sbin/pfctl/pfctl_queue.c
index 1f0d4563..6fb30d32 100644
--- sbin/pfctl/pfctl_queue.c
+++ sbin/pfctl/pfctl_queue.c
@@ -91,6 +91,14 @@ pfctl_show_queues(int dev, const char *iface, int
opts, int verbose2)
pfctl_print_queue_node(dev, node, opts);
}
+ if (nodes > 0) {
+ int bytesmode = 0;
+ node = TAILQ_FIRST(&qnodes);
+ if (node != NULL && (node->qs.flags & PFQS_BYTESMODE))
+ bytesmode = 1;
+ printf("queue-units %s\n", bytesmode ? "bytes" : "bits");
+ }
+
while (verbose2 && nodes > 0) {
printf("\n");
fflush(stdout);
diff --git share/man/man5/pf.conf.5 share/man/man5/pf.conf.5
index 3a383b23..9d24aab4 100644
--- share/man/man5/pf.conf.5
+++ share/man/man5/pf.conf.5
@@ -1477,6 +1477,29 @@ The thresholds for entering and leaving
syncookie mode can be specified using:
set syncookies adaptive (start 25%, end 12%)
.Ed
.El
+.It Cm set queue-units Ar bits | bytes
+Sets the unit for all queue bandwidth values.
+The default is
+.Cm bits ,
+where bandwidth values represent bits per second.
+When set to
+.Cm bytes ,
+all bandwidth values represent bytes per second,
+effectively multiplying the usable range by 8.
+This allows queue bandwidth specifications up to approximately 34 Gbps
+using the existing suffixes
+.Cm K ,
+.Cm M ,
+and
+.Cm G
+to represent kilobytes, megabytes, and gigabytes per second.
+This option must appear before any queue rules.
+.Pp
+For example, to configure a 10 Gbps queue:
+.Bd -literal -offset indent
+set queue-units bytes
+queue rootq on em0 bandwidth 1250M max 1250M
+.Ed
.It Ic set Cm timeout Ar variable value
.Bl -tag -width "src.track" -compact
.It Cm frag
@@ -1639,12 +1662,15 @@ time can be specified:
.Pp
All
.Cm bandwidth
-values are specified as bits per second or using the suffixes
+values are specified in the units set by
+.Cm set queue-units
+.Pq default: bits per second .
+Suffixes
.Cm K ,
.Cm M ,
and
.Cm G
-to represent kilobits, megabits, and gigabits per second, respectively.
+represent kilo, mega, and giga units respectively.
The value must not exceed the interface bandwidth.
.Pp
If multiple connections are assigned the same queue, they're not guaranteed
diff --git sys/net/hfsc.c sys/net/hfsc.c
index 37e9b6bf..9e10578c 100644
--- sys/net/hfsc.c
+++ sys/net/hfsc.c
@@ -198,7 +198,7 @@ struct hfsc_if {
struct hfsc_class *hfsc_class_create(struct hfsc_if *,
struct hfsc_sc *, struct hfsc_sc *,
struct hfsc_sc *, struct hfsc_class *, int,
- int, int);
+ int, int, int);
int hfsc_class_destroy(struct hfsc_if *,
struct hfsc_class *);
struct hfsc_class *hfsc_nextclass(struct hfsc_class *);
@@ -234,8 +234,11 @@ static __inline u_int64_t m2ism(u_int);
static __inline u_int64_t d2dx(u_int);
static __inline u_int sm2m(u_int64_t);
static __inline u_int dx2d(u_int64_t);
+static __inline u_int64_t bm2sm(u_int);
+static __inline u_int64_t bm2ism(u_int);
+static __inline u_int sm2bm(u_int64_t);
-void hfsc_sc2isc(struct hfsc_sc *, struct hfsc_internal_sc *);
+void hfsc_sc2isc(struct hfsc_sc *, struct hfsc_internal_sc *, int);
void hfsc_rtsc_init(struct hfsc_runtime_sc *,
struct hfsc_internal_sc *, u_int64_t, u_int64_t);
u_int64_t hfsc_rtsc_y2x(struct hfsc_runtime_sc *, u_int64_t);
@@ -401,6 +404,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q)
struct hfsc_class *cl, *parent, *np = NULL;
struct hfsc_sc rtsc, lssc, ulsc;
int error = 0;
+ int bytesmode = (q->flags & PFQS_BYTESMODE) ? 1 : 0;
KASSERT(hif != NULL);
KASSERT(q->qid != 0);
@@ -412,7 +416,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q)
if (q->parent_qid == 0 && hif->hif_rootclass == NULL) {
np = hfsc_class_create(hif, NULL, NULL, NULL, NULL,
- 0, 0, HFSC_ROOT_CLASS | q->qid);
+ 0, 0, HFSC_ROOT_CLASS | q->qid, bytesmode);
if (np == NULL)
return (EINVAL);
parent = np;
@@ -435,7 +439,7 @@ hfsc_pf_addqueue(void *arg, struct pf_queuespec *q)
ulsc.m2 = q->upperlimit.m2.absolute;
if ((cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc,
- parent, q->qlimit, q->flags, q->qid)) == NULL) {
+ parent, q->qlimit, q->flags, q->qid, bytesmode)) == NULL) {
hfsc_class_destroy(hif, np);
return (ENOMEM);
}
@@ -614,7 +618,7 @@ hfsc_purge(struct ifqueue *ifq, struct mbuf_list *ml)
struct hfsc_class *
hfsc_class_create(struct hfsc_if *hif, struct hfsc_sc *rsc,
struct hfsc_sc *fsc, struct hfsc_sc *usc, struct hfsc_class *parent,
- int qlimit, int flags, int qid)
+ int qlimit, int flags, int qid, int bytesmode)
{
struct hfsc_class *cl, *p;
int i, s;
@@ -639,18 +643,18 @@ hfsc_class_create(struct hfsc_if *hif, struct
hfsc_sc *rsc,
if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) {
cl->cl_rsc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK);
- hfsc_sc2isc(rsc, cl->cl_rsc);
+ hfsc_sc2isc(rsc, cl->cl_rsc, bytesmode);
hfsc_rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0);
hfsc_rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0);
}
if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) {
cl->cl_fsc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK);
- hfsc_sc2isc(fsc, cl->cl_fsc);
+ hfsc_sc2isc(fsc, cl->cl_fsc, bytesmode);
hfsc_rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0);
}
if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) {
cl->cl_usc = pool_get(&hfsc_internal_sc_pl, PR_WAITOK);
- hfsc_sc2isc(usc, cl->cl_usc);
+ hfsc_sc2isc(usc, cl->cl_usc, bytesmode);
hfsc_rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0);
}
@@ -1498,15 +1502,58 @@ dx2d(u_int64_t dx)
return ((u_int)d);
}
+/* bytes/sec to scaled slope (PFQS_BYTESMODE: skip /8) */
+static __inline u_int64_t
+bm2sm(u_int m)
+{
+ u_int64_t sm;
+
+ sm = ((u_int64_t)m << SM_SHIFT) / HFSC_FREQ;
+ return (sm);
+}
+
+/* bytes/sec to inverse scaled slope (PFQS_BYTESMODE: skip *8) */
+static __inline u_int64_t
+bm2ism(u_int m)
+{
+ u_int64_t ism;
+
+ if (m == 0)
+ ism = HFSC_HT_INFINITY;
+ else
+ ism = ((u_int64_t)HFSC_FREQ << ISM_SHIFT) / m;
+ return (ism);
+}
+
+/* scaled slope to bytes/sec (inverse of bm2sm) */
+static __inline u_int
+sm2bm(u_int64_t sm)
+{
+ u_int64_t b;
+
+ b = (sm * HFSC_FREQ) >> SM_SHIFT;
+ return ((u_int)b);
+}
+
void
-hfsc_sc2isc(struct hfsc_sc *sc, struct hfsc_internal_sc *isc)
+hfsc_sc2isc(struct hfsc_sc *sc, struct hfsc_internal_sc *isc, int bytesmode)
{
- isc->sm1 = m2sm(sc->m1);
- isc->ism1 = m2ism(sc->m1);
+ if (bytesmode) {
+ isc->sm1 = bm2sm(sc->m1);
+ isc->ism1 = bm2ism(sc->m1);
+ } else {
+ isc->sm1 = m2sm(sc->m1);
+ isc->ism1 = m2ism(sc->m1);
+ }
isc->dx = d2dx(sc->d);
isc->dy = seg_x2y(isc->dx, isc->sm1);
- isc->sm2 = m2sm(sc->m2);
- isc->ism2 = m2ism(sc->m2);
+ if (bytesmode) {
+ isc->sm2 = bm2sm(sc->m2);
+ isc->ism2 = bm2ism(sc->m2);
+ } else {
+ isc->sm2 = m2sm(sc->m2);
+ isc->ism2 = m2ism(sc->m2);
+ }
}
/*
@@ -1641,27 +1688,42 @@ hfsc_getclstats(struct hfsc_class_stats *sp,
struct hfsc_class *cl)
sp->class_handle = cl->cl_handle;
if (cl->cl_rsc != NULL) {
- sp->rsc.m1 = sm2m(cl->cl_rsc->sm1);
+ if (cl->cl_flags & PFQS_BYTESMODE) {
+ sp->rsc.m1 = sm2bm(cl->cl_rsc->sm1);
+ sp->rsc.m2 = sm2bm(cl->cl_rsc->sm2);
+ } else {
+ sp->rsc.m1 = sm2m(cl->cl_rsc->sm1);
+ sp->rsc.m2 = sm2m(cl->cl_rsc->sm2);
+ }
sp->rsc.d = dx2d(cl->cl_rsc->dx);
- sp->rsc.m2 = sm2m(cl->cl_rsc->sm2);
} else {
sp->rsc.m1 = 0;
sp->rsc.d = 0;
sp->rsc.m2 = 0;
}
if (cl->cl_fsc != NULL) {
- sp->fsc.m1 = sm2m(cl->cl_fsc->sm1);
+ if (cl->cl_flags & PFQS_BYTESMODE) {
+ sp->fsc.m1 = sm2bm(cl->cl_fsc->sm1);
+ sp->fsc.m2 = sm2bm(cl->cl_fsc->sm2);
+ } else {
+ sp->fsc.m1 = sm2m(cl->cl_fsc->sm1);
+ sp->fsc.m2 = sm2m(cl->cl_fsc->sm2);
+ }
sp->fsc.d = dx2d(cl->cl_fsc->dx);
- sp->fsc.m2 = sm2m(cl->cl_fsc->sm2);
} else {
sp->fsc.m1 = 0;
sp->fsc.d = 0;
sp->fsc.m2 = 0;
}
if (cl->cl_usc != NULL) {
- sp->usc.m1 = sm2m(cl->cl_usc->sm1);
+ if (cl->cl_flags & PFQS_BYTESMODE) {
+ sp->usc.m1 = sm2bm(cl->cl_usc->sm1);
+ sp->usc.m2 = sm2bm(cl->cl_usc->sm2);
+ } else {
+ sp->usc.m1 = sm2m(cl->cl_usc->sm1);
+ sp->usc.m2 = sm2m(cl->cl_usc->sm2);
+ }
sp->usc.d = dx2d(cl->cl_usc->dx);
- sp->usc.m2 = sm2m(cl->cl_usc->sm2);
} else {
sp->usc.m1 = 0;
sp->usc.d = 0;
diff --git sys/net/pfvar.h sys/net/pfvar.h
index 750fe0ef..61110c84 100644
--- sys/net/pfvar.h
+++ sys/net/pfvar.h
@@ -1291,12 +1291,14 @@ struct pf_queuespec {
#define PFQS_FLOWQUEUE 0x0001
#define PFQS_ROOTCLASS 0x0002
+#define PFQS_BYTESMODE 0x0004 /* bandwidth values are bytes/sec */
#define PFQS_DEFAULT 0x1000 /* maps to HFSC_DEFAULTCLASS */
struct priq_opts {
int flags;
};
+/* XXX hfsc_opts is unused */
struct hfsc_opts {
/* real-time service curve */
u_int rtsc_m1; /* slope of the 1st segment in bps */
diff --git usr.bin/systat/pftop.c usr.bin/systat/pftop.c
index 8668bd28..e492a811 100644
--- usr.bin/systat/pftop.c
+++ usr.bin/systat/pftop.c
@@ -1640,8 +1640,12 @@ print_queue_node(struct pfctl_queue_node *node)
* spot as the 'period' in hfsc_class_stats.
*/
tbprintf("%u", node->qstats.data.period);
- } else
- tbprintf("%u%c", rate, unit[i]);
+ } else {
+ if (node->qs.flags & PFQS_BYTESMODE)
+ tbprintf("%u%cB", rate, unit[i]);
+ else
+ tbprintf("%u%c", rate, unit[i]);
+ }
print_fld_tb(FLD_BANDW);
print_fld_str(FLD_SCHED, node->qs.flags & PFQS_FLOWQUEUE ?
PF queue bandwidth now works upto ~34.36 Gbps with 'set queue-units bytes'