From: Claudio Jeker
Subject: bgpd: introduce chash and start using it
To: tech@openbsd.org
Date: Mon, 15 Sep 2025 14:49:39 +0200

bgpd uses a lot of RB trees and while they are great they come with a big
cost, especially for objects that do not require an order. This diff
introduces CHASH which is a variation of swisstables (that's why it is
called CH). The hash table is built like the RBT macros and is close to a
drop-in replacement. It scales up and down nicely but I have not yet spent
an excessive amount of time tuning the last bit of performance out of it.

This diff adds the chash functions and then uses them for a few objects in
the RDE. The goal is to move more over to this, especially parts of the
Adj-RIB-Out handling where the RB lookups burn a lot of CPU time.

I tested this a fair bit and it seems to behave, but this is a big change
and more testing is needed.
-- 
:wq Claudio

Index: Makefile
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/Makefile,v
diff -u -p -r1.42 Makefile
--- Makefile	26 Feb 2025 15:49:56 -0000	1.42
+++ Makefile	31 Mar 2025 14:14:20 -0000
@@ -3,6 +3,7 @@
 PROG=	bgpd
 SRCS=	bgpd.c
 SRCS+=	carp.c
+SRCS+=	chash.c
 SRCS+=	config.c
 SRCS+=	control.c
 SRCS+=	flowspec.c
Index: chash.c
===================================================================
RCS file: chash.c
diff -N chash.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ chash.c	8 Sep 2025 12:29:40 -0000
@@ -0,0 +1,876 @@
+/*	$OpenBSD$ */
+/*
+ * Copyright (c) 2025 Claudio Jeker
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "chash.h"
+
+/*
+ * CH hash table: a cache optimized hash table
+ *
+ * The CH hash table is split into an extendible hashing part and sub
+ * tables; each sub table is split into CH_H2_SIZE groups holding 7
+ * elements each. The sub tables are open addressed hash tables similar
+ * to swiss tables but the groups use only 7 slots and not 16. Also the
+ * meta data is part of each group. On 64bit archs a group uses 64 bytes.
+ *
+ * The hash is split in three parts:
+ *  o H1: variable size hash used by the extendible hashing table
+ *  o H2: fixed size hash to select a group in a sub table.
+ *  o H3: 1 byte hash to select an element in a group.
+ * H1 uses the top bits of the hash, H2 and H3 the bottom bits.
+ *
+ * Unlike other unordered sets this hash table does not allow a key to
+ * be inserted more than once.
+ */
+
+#define CH_MAX_LOAD	875	/* in per-mille */
+
+#define CH_H3_BITS	8
+#define CH_H3_SIZE	(1 << CH_H3_BITS)
+#define CH_H3_MASK	(CH_H3_SIZE - 1)
+
+#define CH_H2_SHIFT	CH_H3_BITS
+#define CH_H2_BITS	9
+#define CH_H2_SIZE	(1 << CH_H2_BITS)
+#define CH_H2_MASK	(CH_H2_SIZE - 1)
+
+#define CH_H1_BITS	(64 - CH_H2_BITS - CH_H3_BITS)
+
+#define CH_H1(h, l)	((l) == 0 ? 0 : (h) >> (64 - (l)))
+#define CH_H2(h)	((h >> CH_H2_SHIFT) & CH_H2_MASK)
+#define CH_H3(h)	(h & CH_H3_MASK)
+
+struct ch_group {
+	uint64_t	 cg_meta;
+	/*
+	 * cg_meta is actually split like this:
+	 *	uint8_t	cg_flags;
+	 *	uint8_t	cg_hashes[7];
+	 */
+	void		*cg_data[7];
+};
+#define CH_EVER_FULL	0x80
+#define CH_SLOT_MASK	0x7f
+#define CH_FLAGS_SHIFT	56
+
+struct ch_meta {
+	uint32_t	cs_num_elm;
+	uint32_t	cs_num_tomb;
+	uint32_t	cs_num_ever_full;
+	uint32_t	cs_local_level;
+};
+
+/*
+ * API to work with the cg_meta field of a ch_group:
+ * cg_meta_set_flags: Set bits in flags section, return true if already set.
+ * cg_meta_clear_flags: Clear bits in flags section, return true if cleared.
+ * cg_meta_get_flags: Return flags section as uint8_t.
+ * cg_meta_check_flags: Return true if flag is set in flags section.
+ * cg_meta_set_hash: Set one of the 7 hash bytes according to slot and hash.
+ * ch_meta_locate: find possible candidates.
+ */
+static inline int
+cg_meta_set_flags(struct ch_group *g, uint8_t flag)
+{
+	uint64_t f = flag, oldf;
+	oldf = g->cg_meta & f << CH_FLAGS_SHIFT;
+	g->cg_meta |= f << CH_FLAGS_SHIFT;
+	return oldf != 0;
+}
+
+static inline int
+cg_meta_clear_flags(struct ch_group *g, uint8_t flag)
+{
+	uint64_t f = flag, oldf;
+	oldf = g->cg_meta & f << CH_FLAGS_SHIFT;
+	g->cg_meta &= ~(f << CH_FLAGS_SHIFT);
+	return oldf != 0;
+}
+
+static inline uint8_t
+cg_meta_get_flags(const struct ch_group *g)
+{
+	return g->cg_meta >> CH_FLAGS_SHIFT;
+}
+
+static inline int
+cg_meta_check_flags(const struct ch_group *g, uint8_t flag)
+{
+	uint64_t f = flag;
+	return (g->cg_meta & f << CH_FLAGS_SHIFT) != 0;
+}
+
+static inline void
+cg_meta_set_hash(struct ch_group *g, int slot, uint8_t hash)
+{
+	((uint8_t *)&g->cg_meta)[slot] = hash;
+}
+
+/*
+ * Find possible candidates in the group where both the mask matches with
+ * the hash of the slot and where the flag bit for the slot is set.
+ * Additionally set CH_EVER_FULL in the return value if it is set in
+ * the group metadata.
+ */
+static uint8_t
+ch_meta_locate(struct ch_group *g, uint64_t mask)
+{
+	uint64_t lookup;
+	uint8_t flags, i, hits;
+
+	lookup = g->cg_meta ^ mask;
+	flags = cg_meta_get_flags(g);
+	hits = flags & CH_EVER_FULL;
+
+	for (i = 0; i < 7; i++) {
+		if (((lookup >> i * 8) & CH_H3_MASK) == 0)
+			hits |= flags & (1 << i);
+	}
+	return hits;
+}
+
+/*
+ * API to work with fixed size sub tables.
+ * Each sub table is an array of struct ch_group with CH_H2_SIZE elements.
+ *
+ * To find an element the group metadata is used to see if there is a
+ * hit. If there was no hit but the CH_EVER_FULL flag is set the lookup
+ * will continue and look at the next element until the needle is found
+ * or there is a group where CH_EVER_FULL is unset.
+ *
+ * On insert if a group overflows CH_EVER_FULL is set. The flag will not be
+ * cleared again.
+ * On remove a data slot in the group is freed but depending on the
+ * CH_EVER_FULL flag the slot is considered a tombstone (if set) or considered
+ * free (not set).
+ *
+ * Sub tables are automatically split in two when the maximum load factor is
+ * reached. If the fill factor drops below a threshold then buddy tables
+ * may be joined back together.
+ */
+
+/* Return load factor (including tombstones) of a sub table in per mille. */
+static int
+ch_sub_loadfactor(struct ch_meta *m)
+{
+	uint32_t max = CH_H2_SIZE * 7;
+	uint32_t used = m->cs_num_elm + m->cs_num_tomb;
+
+	return used * 1000 / max;
+}
+
+/* Return fill factor (only active elements) of a sub table in per mille. */
+static int
+ch_sub_fillfactor(struct ch_meta *m)
+{
+	uint32_t max = CH_H2_SIZE * 7;
+	uint32_t used = m->cs_num_elm;
+
+	return used * 1000 / max;
+}
+
+/*
+ * Insert element into the hash table. If an equal element is already present
+ * in the table return the pointer to that element else return NULL.
+ */
+static void *
+ch_sub_insert(const struct ch_type *type, struct ch_group *table,
+    struct ch_meta *meta, uint64_t h, void *elm)
+{
+	uint64_t mask;
+	uint32_t bucket = CH_H2(h);
+	int i;
+	uint8_t empties, hits, ins_i;
+	struct ch_group *g = &table[bucket], *ins_g = NULL;
+
+	memset(&mask, CH_H3(h), sizeof(mask));
+	while (1) {
+		/* first check if object already present */
+		hits = ch_meta_locate(g, mask);
+		for (i = 0; i < 7; i++) {
+			if (hits & (1 << i)) {
+				if (type->t_equal(g->cg_data[i], elm))
+					return g->cg_data[i];
+			}
+		}
+		/* at the same time remember the first empty spot */
+		empties = ~cg_meta_get_flags(g) & CH_SLOT_MASK;
+		if (ins_g == NULL && (i = ffs(empties)) != 0) {
+			ins_g = g;
+			ins_i = i - 1;
+		} else if (empties == 0) {
+			/* overflow, mark group as full */
+			if (cg_meta_set_flags(g, CH_EVER_FULL) == 0)
+				meta->cs_num_ever_full++;
+		}
+		/* check if lookup is finished and obj is not present */
+		if (cg_meta_check_flags(g, CH_EVER_FULL) == 0)
+			break;
+		g = &table[++bucket & CH_H2_MASK];
+	}
+
+	/*
+	 * insert new object and adjust accounting.
+	 * If CH_EVER_FULL is set then it was a tombstone and not an empty slot.
+	 */
+	ins_g->cg_data[ins_i] = elm;
+	cg_meta_set_hash(ins_g, ins_i, h & CH_H3_MASK);
+	cg_meta_set_flags(ins_g, 1 << ins_i);
+	if (cg_meta_check_flags(ins_g, CH_EVER_FULL))
+		meta->cs_num_tomb--;
+	meta->cs_num_elm++;
+
+	return NULL;
+}
+
+/*
+ * Lookup needle in the sub table and if found, remove the entry and return its
+ * pointer else return NULL.
+ */
+static void *
+ch_sub_remove(const struct ch_type *type, struct ch_group *table,
+    struct ch_meta *meta, uint64_t h, const void *needle)
+{
+	uint64_t mask;
+	uint32_t bucket = CH_H2(h);
+	int i;
+	uint8_t hits;
+	struct ch_group *g = &table[bucket];
+
+	memset(&mask, CH_H3(h), sizeof(mask));
+	while (1) {
+		hits = ch_meta_locate(g, mask);
+		for (i = 0; i < 7; i++) {
+			if (hits & (1 << i)) {
+				/* most probably a hit */
+				if (type->t_equal(g->cg_data[i], needle)) {
+					void *elm = g->cg_data[i];
+					g->cg_data[i] = NULL;
+					cg_meta_set_hash(g, i, 0);
+					cg_meta_clear_flags(g, 1 << i);
+					if (hits & CH_EVER_FULL)
+						meta->cs_num_tomb++;
+					meta->cs_num_elm--;
+					return elm;
+				}
+			}
+		}
+		if ((hits & CH_EVER_FULL) == 0)
+			return NULL;
+		g = &table[++bucket & CH_H2_MASK];
+	}
+}
+
+/*
+ * Lookup needle in the sub table and if found return its pointer
+ * else return NULL.
+ */
+static void *
+ch_sub_find(const struct ch_type *type, struct ch_group *table, uint64_t h,
+    const void *needle)
+{
+	uint64_t mask;
+	uint32_t bucket = CH_H2(h);
+	int i;
+	uint8_t hits;
+	struct ch_group *g = &table[bucket];
+
+	memset(&mask, CH_H3(h), sizeof(mask));
+	while (1) {
+		hits = ch_meta_locate(g, mask);
+		for (i = 0; i < 7; i++) {
+			if (hits & (1 << i)) {
+				/* most probably a hit */
+				if (type->t_equal(g->cg_data[i], needle))
+					return g->cg_data[i];
+			}
+		}
+		if ((hits & CH_EVER_FULL) == 0)
+			return NULL;
+		g = &table[++bucket & CH_H2_MASK];
+	}
+}
+
+/*
+ * Lookup hash in the sub table and if a possible match is found pass it
+ * to the cmp callback to verify.
+ */
+static void *
+ch_sub_locate(const struct ch_type *type, struct ch_group *table, uint64_t h,
+    int (*cmp)(const void *, void *), void *arg)
+{
+	uint64_t mask;
+	uint32_t bucket = CH_H2(h);
+	int i;
+	uint8_t hits;
+	struct ch_group *g = &table[bucket];
+
+	memset(&mask, CH_H3(h), sizeof(mask));
+	while (1) {
+		hits = ch_meta_locate(g, mask);
+		for (i = 0; i < 7; i++) {
+			if (hits & (1 << i)) {
+				/* most probably a hit */
+				if (cmp(g->cg_data[i], arg))
+					return g->cg_data[i];
+			}
+		}
+		if ((hits & CH_EVER_FULL) == 0)
+			return NULL;
+		g = &table[++bucket & CH_H2_MASK];
+	}
+}
+
+/*
+ * Start of sub table iterator, reset the set and grp indices and locate
+ * first element in sub table. Return element or NULL if table is empty.
+ */
+static void *
+ch_sub_first(const struct ch_type *type, struct ch_group *table,
+    struct ch_iter *iter)
+{
+	struct ch_group *g;
+	uint32_t n;
+	uint8_t elms;
+
+	for (n = 0; n < CH_H2_SIZE; n++) {
+		g = &table[n];
+		elms = cg_meta_get_flags(g) & CH_SLOT_MASK;
+		if (elms == 0)
+			continue;
+		iter->ci_set_idx = n;
+		iter->ci_grp_idx = ffs(elms);
+		return g->cg_data[iter->ci_grp_idx - 1];
+	}
+	iter->ci_set_idx = n;
+	return NULL;
+}
+
+/*
+ * Get next element of a sub table based on the iterator indices.
+ * Return element or NULL if the rest of the table is empty.
+ */
+static void *
+ch_sub_next(const struct ch_type *type, struct ch_group *table,
+    struct ch_iter *iter)
+{
+	struct ch_group *g;
+	uint32_t n;
+	uint8_t elms;
+
+	for (n = iter->ci_set_idx; n < CH_H2_SIZE; n++) {
+		g = &table[n];
+		elms = cg_meta_get_flags(g) & CH_SLOT_MASK;
+		elms &= CH_SLOT_MASK << iter->ci_grp_idx;
+		if (elms == 0) {
+			iter->ci_grp_idx = 0;
+			continue;
+		}
+		iter->ci_set_idx = n;
+		iter->ci_grp_idx = ffs(elms);
+		return g->cg_data[iter->ci_grp_idx - 1];
+	}
+	iter->ci_set_idx = n;
+	return NULL;
+}
+
+/*
+ * Split table from into two new tables low and high. Update the meta data
+ * accordingly. The cs_local_level of low and high will be 1 higher than that
+ * of the from table.
+ */
+static int
+ch_sub_split(const struct ch_type *type, struct ch_group *from,
+    struct ch_group *low, struct ch_group *high, struct ch_meta *frommeta,
+    struct ch_meta *lowmeta, struct ch_meta *highmeta)
+{
+	struct ch_group *g = &from[0];
+	uint32_t n;
+	uint8_t elms, i;
+
+	lowmeta->cs_local_level = frommeta->cs_local_level + 1;
+	highmeta->cs_local_level = frommeta->cs_local_level + 1;
+
+	for (n = 0; n < CH_H2_SIZE; n++, g++) {
+		elms = cg_meta_get_flags(g) & CH_SLOT_MASK;
+		if (elms == 0)
+			continue;
+		for (i = 0; i < 7; i++) {
+			if (elms & (1 << i)) {
+				void *v;
+				uint64_t h = type->t_hash(g->cg_data[i]);
+				if (CH_H1(h, lowmeta->cs_local_level) & 1)
+					v = ch_sub_insert(type, high, highmeta,
+					    h, g->cg_data[i]);
+				else
+					v = ch_sub_insert(type, low, lowmeta,
+					    h, g->cg_data[i]);
+				if (v != NULL) {
+					errno = EINVAL;
+					return -1;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Merge all active elements of one sub group into the destination table.
+ * Return 0 on success, -1 on failure.
+ */
+static int
+ch_sub_merge_one(const struct ch_type *type, struct ch_group *table,
+    struct ch_meta *meta, const struct ch_group *from)
+{
+	uint8_t elms, i;
+
+	elms = cg_meta_get_flags(from) & CH_SLOT_MASK;
+	if (elms == 0)
+		return 0;
+	for (i = 0; i < 7; i++) {
+		if (elms & (1 << i)) {
+			void *v;
+			uint64_t h = type->t_hash(from->cg_data[i]);
+			v = ch_sub_insert(type, table, meta, h,
+			    from->cg_data[i]);
+			if (v != NULL)	/* how could there be a dup? */
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Merge sub tables from and buddy into the new table to.
+ * Returns 0 on success and -1 on failure, keeping from and buddy as is.
+ */
+static int
+ch_sub_merge(const struct ch_type *type, struct ch_group *to,
+    struct ch_group *from, struct ch_group *buddy, struct ch_meta *tometa,
+    struct ch_meta *frommeta, struct ch_meta *buddymeta)
+{
+	struct ch_group *g = &from[0];
+	struct ch_group *b = &buddy[0];
+	uint32_t n;
+
+	tometa->cs_local_level = frommeta->cs_local_level - 1;
+
+	for (n = 0; n < CH_H2_SIZE; n++, g++, b++) {
+		if (ch_sub_merge_one(type, to, tometa, g) == -1)
+			return -1;
+		if (ch_sub_merge_one(type, to, tometa, b) == -1)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+ch_sub_alloc(struct ch_group **table, struct ch_meta **meta)
+{
+	if ((*table = calloc(CH_H2_SIZE, sizeof(**table))) == NULL)
+		return -1;
+	if ((*meta = calloc(1, sizeof(**meta))) == NULL) {
+		free(*table);
+		return -1;
+	}
+	return 0;
+}
+
+static void
+ch_sub_free(struct ch_group *table, struct ch_meta *meta)
+{
+	free(table);
+	free(meta);
+}
+
+/*
+ * Double the extendible hash table in size, updating all pointers
+ * accordingly. Return 0 on success, -1 on failure and set errno.
+ */
+static int
+ch_table_resize(struct ch_table *t)
+{
+	struct ch_group **tables;
+	struct ch_meta **metas;
+	uint64_t oldsize = 1ULL << t->ch_level;
+	uint64_t newsize = 1ULL << (t->ch_level + 1);
+	int64_t idx;
+
+	if (t->ch_level + 1 >= CH_H1_BITS) {
+		errno = E2BIG;
+		return -1;
+	}
+
+	if (t->ch_tables == NULL) {
+		oldsize = 0;
+		newsize = 1;
+	}
+
+	tables = reallocarray(t->ch_tables, newsize, sizeof(*tables));
+	if (tables == NULL)
+		return -1;
+	/* the old array may already be freed, keep the table consistent */
+	t->ch_tables = tables;
+	metas = reallocarray(t->ch_metas, newsize, sizeof(*metas));
+	if (metas == NULL)
+		return -1;
+	t->ch_metas = metas;
+
+	for (idx = oldsize - 1; idx >= 0; idx--) {
+		tables[idx * 2] = tables[idx];
+		tables[idx * 2 + 1] = tables[idx];
+		metas[idx * 2] = metas[idx];
+		metas[idx * 2 + 1] = metas[idx];
+	}
+
+	if (oldsize != 0)
+		t->ch_level++;
+	return 0;
+}
+
+/*
+ * Set table pointers from idx to the required number of elements
+ * depending on the table ch_level and the sub table cs_local_level.
+ * Idx only includes the bits covered by the sub table cs_local_level.
+ */
+static void
+ch_table_fill(struct ch_table *t, uint64_t idx, struct ch_group *table,
+    struct ch_meta *meta)
+{
+	uint64_t cnt, i;
+
+	idx <<= (t->ch_level - meta->cs_local_level);
+	cnt = 1ULL << (t->ch_level - meta->cs_local_level);
+
+	for (i = 0; i < cnt; i++) {
+		t->ch_tables[idx + i] = table;
+		t->ch_metas[idx + i] = meta;
+	}
+}
+
+/*
+ * Return the buddy sub group for the table with idx and local_level.
+ * The buddy page must have the same local level to be a buddy.
+ */
+static struct ch_group *
+ch_table_buddy(struct ch_table *t, uint64_t idx, uint32_t local_level,
+    struct ch_meta **meta)
+{
+	struct ch_meta *m;
+
+	/* the single root table can't be merged */
+	if (local_level == 0)
+		return NULL;
+
+	idx ^= 1ULL << (t->ch_level - local_level);
+
+	m = t->ch_metas[idx];
+	/* can only merge buddies at same level */
+	if (m->cs_local_level == local_level) {
+		*meta = m;
+		return t->ch_tables[idx];
+	}
+	return NULL;
+}
+
+/*
+ * Grow the hash table by splitting a sub group and if needed by doubling
+ * the extendible hash of the primary table.
+ * Return 0 on success, -1 on failure and set errno.
+ */
+static int
+ch_table_grow(const struct ch_type *type, struct ch_table *t, uint64_t h,
+    struct ch_group *table, struct ch_meta *meta)
+{
+	struct ch_group *left, *right;
+	struct ch_meta *leftmeta, *rightmeta;
+	uint64_t idx;
+
+	/* check if the extendible hashing table needs to grow */
+	if (meta->cs_local_level == t->ch_level) {
+		if (ch_table_resize(t) == -1)
+			return -1;
+	}
+
+	/* allocate new sub tables */
+	if (ch_sub_alloc(&left, &leftmeta) == -1)
+		return -1;
+	if (ch_sub_alloc(&right, &rightmeta) == -1) {
+		ch_sub_free(left, leftmeta);
+		return -1;
+	}
+
+	/* split up the old table into the two new ones */
+	if (ch_sub_split(type, table, left, right,
+	    meta, leftmeta, rightmeta) == -1) {
+		ch_sub_free(right, rightmeta);
+		ch_sub_free(left, leftmeta);
+		return -1;
+	}
+
+	/*
+	 * Insert new tables into the extendible hash table.
+	 * Calculate index based on the H1 hash of the old sub table
+	 * but shift it by one bit since the new tables use one more bit.
+	 */
+	idx = CH_H1(h, meta->cs_local_level) << 1;
+	ch_table_fill(t, idx, left, leftmeta);
+	ch_table_fill(t, idx | 1, right, rightmeta);
+	ch_sub_free(table, meta);
+
+	return 0;
+}
+
+/*
+ * Try to compact two sub groups back together, adjusting the extendible
+ * hash. Compaction only happens if there is a buddy page (page with same
+ * local level on the alternate slot) and the result of the merge is below
+ * 2 / 3 of the maximum load factor.
+ * Return 0 on success, -1 if no compaction is possible.
+ */
+static int
+ch_table_compact(const struct ch_type *type, struct ch_table *t, uint64_t h,
+    struct ch_group *table, struct ch_meta *meta)
+{
+	struct ch_group *buddy, *to;
+	struct ch_meta *buddymeta, *tometa;
+	uint64_t idx;
+
+	idx = CH_H1(h, t->ch_level);
+	buddy = ch_table_buddy(t, idx, meta->cs_local_level, &buddymeta);
+	if (buddy == NULL ||
+	    ch_sub_fillfactor(meta) + ch_sub_fillfactor(buddymeta) >
+	    CH_MAX_LOAD * 2 / 3)
+		return -1;
+
+	/* allocate new sub tables */
+	if (ch_sub_alloc(&to, &tometa) == -1)
+		return -1;
+
+	/* merge the table and buddy into to. */
+	if (ch_sub_merge(type, to, table, buddy, tometa, meta, buddymeta) ==
+	    -1) {
+		ch_sub_free(to, tometa);
+		return -1;
+	}
+
+	/*
+	 * Update table in the extendible hash table, which overwrites
+	 * all entries of the buddy.
+	 */
+	idx = CH_H1(h, tometa->cs_local_level);
+	ch_table_fill(t, idx, to, tometa);
+	ch_sub_free(buddy, buddymeta);
+	ch_sub_free(table, meta);
+
+	return 0;
+}
+
+/*
+ * Public API used by the macros defined in chash.h.
+ */
+int
+_ch_init(const struct ch_type *type, struct ch_table *t)
+{
+	struct ch_group *table;
+	struct ch_meta *meta;
+
+	t->ch_level = 0;
+	t->ch_num_elm = 0;
+	if (ch_sub_alloc(&table, &meta) == -1)
+		return -1;
+
+	if (ch_table_resize(t) == -1) {
+		ch_sub_free(table, meta);
+		return -1;
+	}
+	ch_table_fill(t, 0, table, meta);
+
+	return 0;
+}
+
+void
+_ch_destroy(const struct ch_type *type, struct ch_table *t)
+{
+	uint64_t idx, max = 1ULL << t->ch_level;
+	struct ch_group *table = NULL;
+
+	for (idx = 0; idx < max; idx++) {
+		if (table == t->ch_tables[idx])
+			continue;
+
+		table = t->ch_tables[idx];
+		ch_sub_free(t->ch_tables[idx], t->ch_metas[idx]);
+	}
+	free(t->ch_tables);
+	free(t->ch_metas);
+	memset(t, 0, sizeof(*t));
+}
+
+void *
+_ch_insert(const struct ch_type *type, struct ch_table *t, uint64_t h,
+    void *elm)
+{
+	struct ch_group *table;
+	struct ch_meta *meta;
+	void *v;
+	uint64_t idx;
+
+	if (t->ch_tables == NULL)
+		if (_ch_init(type, t) == -1)
+			return CH_INS_FAILED;
+
+	idx = CH_H1(h, t->ch_level);
+	table = t->ch_tables[idx];
+	meta = t->ch_metas[idx];
+
+	if (ch_sub_loadfactor(meta) >= CH_MAX_LOAD) {
+		if (ch_table_grow(type, t, h, table, meta) == -1)
+			return CH_INS_FAILED;
+		/* refetch data after resize */
+		idx = CH_H1(h, t->ch_level);
+		table = t->ch_tables[idx];
+		meta = t->ch_metas[idx];
+	}
+
+	v = ch_sub_insert(type, table, meta, h, elm);
+	if (v == NULL)
+		t->ch_num_elm++;
+	return v;
+}
+
+void *
+_ch_remove(const struct ch_type *type, struct ch_table *t, uint64_t h,
+    const void *needle)
+{
+	struct ch_group *table;
+	struct ch_meta *meta;
+	void *v;
+	uint64_t idx;
+
+	if (t->ch_tables == NULL)
+		return NULL;
+
+	idx = CH_H1(h, t->ch_level);
+	table = t->ch_tables[idx];
+	meta = t->ch_metas[idx];
+
+	v = ch_sub_remove(type, table, meta, h, needle);
+	if (v != NULL) {
+		t->ch_num_elm--;
+
+		while (ch_sub_fillfactor(meta) <= CH_MAX_LOAD / 4) {
+			if (ch_table_compact(type, t, h, table, meta) == -1)
+				break;
+
+			/* refetch data after compaction */
+			table = t->ch_tables[idx];
+			meta = t->ch_metas[idx];
+		}
+	}
+	return v;
+}
+
+void *
+_ch_find(const struct ch_type *type, struct ch_table *t, uint64_t h,
+    const void *needle)
+{
+	struct ch_group *table;
+	uint64_t idx;
+
+	if (t->ch_tables == NULL)
+		return NULL;
+
+	idx = CH_H1(h, t->ch_level);
+	table = t->ch_tables[idx];
+
+	return ch_sub_find(type, table, h, needle);
+}
+
+void *
+_ch_locate(const struct ch_type *type, struct ch_table *t, uint64_t h,
+    int (*cmp)(const void *, void *), void *arg)
+{
+	struct ch_group *table;
+	uint64_t idx;
+
+	if (t->ch_tables == NULL)
+		return NULL;
+
+	idx = CH_H1(h, t->ch_level);
+	table = t->ch_tables[idx];
+
+	return ch_sub_locate(type, table, h, cmp, arg);
+}
+
+void *
+_ch_first(const struct ch_type *type, struct ch_table *t, struct ch_iter *it)
+{
+	struct ch_group *table;
+	uint64_t idx;
+
+	if (t->ch_tables == NULL)
+		return NULL;
+
+	idx = it->ci_ext_idx = 0;
+	table = t->ch_tables[idx];
+
+	return ch_sub_first(type, table, it);
+}
+
+void *
+_ch_next(const struct ch_type *type, struct ch_table *t, struct ch_iter *it)
+{
+	struct ch_group *table;
+	uint64_t idx, max;
+	void *v;
+
+	if (t->ch_tables == NULL)
+		return NULL;
+
+	max = 1ULL << t->ch_level;
+	idx = it->ci_ext_idx;
+	if (idx >= max)
+		return NULL;
+
+	table = t->ch_tables[idx];
+	v = ch_sub_next(type, table, it);
+	if (v != NULL)
+		return v;
+
+	/* find next sub table */
+	for (idx++; idx < max; idx++) {
+		if (table != t->ch_tables[idx])
+			break;
+	}
+	if (idx >= max)
+		return NULL;
+	/* start next sub table */
+	it->ci_ext_idx = idx;
+	table = t->ch_tables[idx];
+	return ch_sub_first(type, table, it);
+}
Index: chash.h
===================================================================
RCS file: chash.h
diff -N chash.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ chash.h	28 Aug 2025 14:46:50 -0000
@@ -0,0 +1,167 @@
+/*	$OpenBSD$ */
+/*
+ * Copyright (c) 2025 Claudio Jeker
+ * Copyright (c) 2016 David Gwynne
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __CHASH_H__
+#define __CHASH_H__
+
+struct ch_type {
+	int		(*t_equal)(const void *, const void *);
+	uint64_t	(*t_hash)(const void *);
+};
+
+struct ch_group;
+struct ch_meta;
+
+struct ch_table {
+	struct ch_group	**ch_tables;
+	struct ch_meta	**ch_metas;
+	uint32_t	  ch_level;
+	uint32_t	  ch_num_elm;
+};
+
+#define CH_HEAD(_name, _type)						\
+struct _name {								\
+	struct ch_table ch_table;					\
+}
+
+struct ch_iter {
+	uint64_t	ci_ext_idx;
+	uint32_t	ci_set_idx;
+	uint32_t	ci_grp_idx;
+};
+
+int	 _ch_init(const struct ch_type *, struct ch_table *);
+void	 _ch_destroy(const struct ch_type *, struct ch_table *);
+void	*_ch_insert(const struct ch_type *, struct ch_table *, uint64_t, void *);
+void	*_ch_remove(const struct ch_type *, struct ch_table *, uint64_t,
+	    const void *);
+void	*_ch_find(const struct ch_type *, struct ch_table *, uint64_t,
+	    const void *);
+void	*_ch_locate(const struct ch_type *, struct ch_table *, uint64_t,
+	    int (*)(const void *, void *), void *);
+void	*_ch_first(const struct ch_type *, struct ch_table *, struct ch_iter *);
+void	*_ch_next(const struct ch_type *, struct ch_table *, struct ch_iter *);
+
+#define CH_INS_FAILED ((void *)-1)
+
+#define CH_INITIALIZER(_head) { 0 }
+
+#define CH_PROTOTYPE(_name, _type, _hash)				\
+extern const struct ch_type *const _name##_CH_TYPE;			\
+									\
+__unused static inline int						\
+_name##_CH_INIT(struct _name *head)					\
+{									\
+	return _ch_init(_name##_CH_TYPE, &head->ch_table);		\
+}									\
+									\
+__unused static inline void						\
+_name##_CH_DESTROY(struct _name *head)					\
+{									\
+	_ch_destroy(_name##_CH_TYPE, &head->ch_table);			\
+}									\
+									\
+__unused static inline int						\
+_name##_CH_INSERT(struct _name *head, struct _type *elm, struct _type **prev) \
+{									\
+	struct _type *p;						\
+	uint64_t h;							\
+	h = _hash(elm);							\
+	p = _ch_insert(_name##_CH_TYPE, &head->ch_table, h, elm);	\
+	if (p == CH_INS_FAILED)						\
+		return -1;						\
+	if (prev != NULL)						\
+		*prev = p;						\
+	return (p == NULL);						\
+}									\
+									\
+__unused static inline struct _type *					\
+_name##_CH_REMOVE(struct _name *head, struct _type *elm)		\
+{									\
+	uint64_t h;							\
+	h = _hash(elm);							\
+	return _ch_remove(_name##_CH_TYPE, &head->ch_table, h, elm);	\
+}									\
+									\
+__unused static inline struct _type *					\
+_name##_CH_FIND(struct _name *head, const struct _type *key)		\
+{									\
+	uint64_t h;							\
+	h = _hash(key);							\
+	return _ch_find(_name##_CH_TYPE, &head->ch_table, h, key);	\
+}									\
+									\
+__unused static inline struct _type *					\
+_name##_CH_LOCATE(struct _name *head, uint64_t hash,			\
+    int (*cmp)(const void *, void *), void *arg)			\
+{									\
+	return _ch_locate(_name##_CH_TYPE, &head->ch_table, hash,	\
+	    cmp, arg);							\
+}									\
+									\
+__unused static inline struct _type *					\
+_name##_CH_FIRST(struct _name *head, struct ch_iter *iter)		\
+{									\
+	return _ch_first(_name##_CH_TYPE, &head->ch_table, iter);	\
+}									\
+									\
+__unused static inline struct _type *					\
+_name##_CH_NEXT(struct _name *head, struct ch_iter *iter)		\
+{									\
+	return _ch_next(_name##_CH_TYPE, &head->ch_table, iter);	\
+}
+
+#define CH_GENERATE(_name, _type, _eq, _hash)				\
+static int								\
+_name##_CH_EQUAL(const void *lptr, const void *rptr)			\
+{									\
+	const struct _type *l = lptr, *r = rptr;			\
+	return _eq(l, r);						\
+}									\
+									\
+static uint64_t								\
+_name##_CH_HASH(const void *ptr)					\
+{									\
+	const struct _type *obj = ptr;					\
+	return _hash(obj);						\
+}									\
+									\
+static const struct ch_type _name##_CH_INFO = {				\
+	.t_equal = _name##_CH_EQUAL,					\
+	.t_hash = _name##_CH_HASH,					\
+};									\
+const struct ch_type *const _name##_CH_TYPE = &_name##_CH_INFO
+
+#define CH_INIT(_name, _head)		_name##_CH_INIT(_head)
+#define CH_INSERT(_name, _head, _elm, _prev)				\
+	_name##_CH_INSERT(_head, _elm, _prev)
+#define CH_REMOVE(_name, _head, _elm)	_name##_CH_REMOVE(_head, _elm)
+#define CH_FIND(_name, _head, _key)	_name##_CH_FIND(_head, _key)
+#define CH_LOCATE(_name, _head, _h, _cmp, _a)				\
+	_name##_CH_LOCATE(_head, _h, _cmp, _a)
+#define CH_FIRST(_name, _head, _iter)	_name##_CH_FIRST(_head, _iter)
+#define CH_NEXT(_name, _head, _iter)	_name##_CH_NEXT(_head, _iter)
+
+#define CH_FOREACH(_e, _name, _head, _iter)				\
+	for ((_e) = CH_FIRST(_name, (_head), (_iter));			\
+	    (_e) != NULL;						\
+	    (_e) = CH_NEXT(_name, (_head), (_iter)))
+
+#endif
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
diff -u -p -r1.656 rde.c
--- rde.c	4 Jun 2025 09:12:34 -0000	1.656
+++ rde.c	15 Sep 2025 11:52:53 -0000
@@ -212,6 +212,9 @@ rde_main(int debug, int verbose)
 	TAILQ_INIT(out_rules);
 	pt_init();
+	attr_init();
+	path_init();
+	communities_init();
 	peer_init(out_rules);
 
 	/* make sure the default RIBs are setup */
@@ -4779,8 +4782,6 @@ rde_shutdown(void)
 	/* now check everything */
 	rib_shutdown();
 	nexthop_shutdown();
-	path_shutdown();
-	attr_shutdown();
 	pt_shutdown();
 }
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
diff -u -p -r1.316 rde.h
--- rde.h	4 Jun 2025 09:12:34 -0000	1.316
+++ rde.h	15 Sep 2025 11:52:02 -0000
@@ -163,7 +163,7 @@ enum attrtypes {
 #define ATTR_WELL_KNOWN	ATTR_TRANSITIVE
 
 struct attr {
-	RB_ENTRY(attr)			 entry;
+	uint64_t			 hash;
 	u_char				*data;
 	int				 refcnt;
 	uint16_t			 len;
@@ -172,17 +172,18 @@ struct attr {
 };
 
 struct rde_community {
-	RB_ENTRY(rde_community)	entry;
-	int			size;
-	int			nentries;
-	int			flags;
-	int			refcnt;
+	uint64_t		hash;
+	int			size;
+	int			nentries;
+	int			flags;
+	int			refcnt;
 	struct community	*communities;
 };
 
 #define PARTIAL_COMMUNITIES		0x01
 #define PARTIAL_LARGE_COMMUNITIES	0x02
 #define PARTIAL_EXT_COMMUNITIES		0x04
+#define PARTIAL_DIRTY			0x08
 
 #define	F_ATTR_ORIGIN		0x00001
 #define	F_ATTR_ASPATH		0x00002
@@ -208,7 +209,7 @@ struct rde_community {
 #define DEFAULT_LPREF		100
 
 struct rde_aspath {
-	RB_ENTRY(rde_aspath)		 entry;
+	uint64_t			 hash;
 	struct attr			**others;
 	struct aspath			*aspath;
 	struct rde_aspa_state		 aspa_state;
@@ -223,6 +224,9 @@ struct rde_aspath {
 	uint8_t				 others_len;
 	uint8_t				 aspa_generation;
 };
+#define PATH_HASHOFF		offsetof(struct rde_aspath, med)
+#define PATH_HASHSTART(x)	((const uint8_t *)(x) + PATH_HASHOFF)
+#define PATH_HASHSIZE		(sizeof(struct rde_aspath) - PATH_HASHOFF)
 
 enum nexthop_state {
 	NEXTHOP_LOOKUP,
@@ -397,12 +401,13 @@ RB_PROTOTYPE(peer_tree, rde_peer, entry,
 
 /* rde_attr.c */
 int		 attr_writebuf(struct ibuf *, uint8_t, uint8_t, void *, uint16_t);
-void		 attr_shutdown(void);
+void		 attr_init(void);
 int		 attr_optadd(struct rde_aspath *, uint8_t, uint8_t, void *, uint16_t);
 struct attr	*attr_optget(const struct rde_aspath *, uint8_t);
 void		 attr_copy(struct rde_aspath *, const struct rde_aspath *);
-int		 attr_compare(struct rde_aspath *, struct rde_aspath *);
+int		 attr_equal(const struct rde_aspath *,
+		    const struct rde_aspath *);
 void		 attr_freeall(struct rde_aspath *);
 void		 attr_free(struct rde_aspath *, struct attr *);
@@ -413,7 +418,7 @@ u_char	*aspath_deflate(u_char *, uint16
 void		 aspath_merge(struct rde_aspath *, struct attr *);
 uint32_t	 aspath_neighbor(struct aspath *);
 int		 aspath_loopfree(struct aspath *, uint32_t);
-int		 aspath_compare(struct aspath *, struct aspath *);
+int		 aspath_compare(const struct aspath *, const struct aspath *);
 int		 aspath_match(struct aspath *, struct filter_as *, uint32_t);
 u_char		*aspath_prepend(struct aspath *, uint32_t, int, uint16_t *);
 u_char		*aspath_override(struct aspath *, uint32_t, uint32_t,
@@ -452,12 +457,13 @@ int community_large_add(struct rde_commu
 int		 community_ext_add(struct rde_community *, int, int, struct ibuf *);
 int		 community_writebuf(struct rde_community *, uint8_t, int, struct ibuf *);
 
-void		 communities_shutdown(void);
+void		 communities_init(void);
 struct rde_community	*communities_lookup(struct rde_community *);
 struct rde_community	*communities_link(struct rde_community *);
 void		 communities_unlink(struct rde_community *);
-int		 communities_equal(struct rde_community *, struct rde_community *);
+int		 communities_equal(const struct rde_community *,
+		    const struct rde_community *);
 void		 communities_copy(struct rde_community *, struct rde_community *);
 void		 communities_clean(struct rde_community *);
@@ -575,7 +581,7 @@ re_rib(struct rib_entry *re)
 	return rib_byid(re->rib_id);
 }
 
-void		 path_shutdown(void);
+void		 path_init(void);
 struct rde_aspath	*path_copy(struct rde_aspath *, const struct rde_aspath *);
 struct rde_aspath	*path_prep(struct rde_aspath *);
 struct rde_aspath	*path_get(void);
Index: rde_attr.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_attr.c,v
diff -u -p -r1.137 rde_attr.c
--- rde_attr.c	14 Apr 2025 11:49:39 -0000	1.137
+++ rde_attr.c	15 Sep 2025 11:52:24 -0000
@@ -22,12 +22,14 @@
 #include <netinet/in.h>
 
 #include <limits.h>
+#include <siphash.h>
 #include <stdlib.h>
 
 #include "bgpd.h"
 #include "rde.h"
 #include "log.h"
+#include "chash.h"
 
 int
 attr_writebuf(struct ibuf *buf, uint8_t flags, uint8_t type, void *data,
@@ -55,21 +57,36 @@
 }
 
 /* optional attribute specific functions */
-struct attr	*attr_alloc(uint8_t, uint8_t, void *, uint16_t);
-struct attr	*attr_lookup(uint8_t, uint8_t, void *, uint16_t);
-void		 attr_put(struct attr *);
+static struct attr	*attr_alloc(uint8_t, uint8_t, void *, uint16_t, uint64_t);
+static struct attr	*attr_lookup(uint8_t, uint8_t, void *, uint16_t, uint64_t);
+static void		 attr_put(struct attr *);
 
-static inline int	 attr_diff(struct attr *, struct attr *);
+static SIPHASH_KEY attrkey;
 
-RB_HEAD(attr_tree, attr)	attrtable = RB_INITIALIZER(&attr);
-RB_GENERATE_STATIC(attr_tree, attr, entry, attr_diff);
+static inline uint64_t
+attr_hash(const struct attr *a)
+{
+	return a->hash;
+}
+
+static uint64_t
+attr_calc_hash(uint8_t type, const void *data, uint16_t len)
+{
+	SIPHASH_CTX ctx;
+
+	SipHash24_Init(&ctx, &attrkey);
+	SipHash24_Update(&ctx, data, len);
+	SipHash24_Update(&ctx, &type, sizeof(type));
+	return SipHash24_End(&ctx);
+}
+
+CH_HEAD(attr_tree, attr)	attrtable = CH_INITIALIZER(&attrtable);
+CH_PROTOTYPE(attr_tree, attr, attr_hash);
 
 void
-attr_shutdown(void)
+attr_init(void)
 {
-	if (!RB_EMPTY(&attrtable))
-		log_warnx("%s: free non-free attr table", __func__);
+	arc4random_buf(&attrkey, sizeof(attrkey));
 }
 
 int
@@ -79,6 +96,7 @@ attr_optadd(struct rde_aspath *asp, uint
 	uint8_t l;
 	struct attr *a, *t;
 	void *p;
+	uint64_t h;
 
 	/* attribute allowed only once */
 	for (l = 0; l < asp->others_len; l++) {
@@ -91,8 +109,9 @@
 	}
 
 	/* known optional attributes were validated previously */
-	if ((a = attr_lookup(flags, type, data, len)) == NULL)
-		a = attr_alloc(flags, type, data, len);
+	h = attr_calc_hash(type, data, len);
+	if ((a = attr_lookup(flags, type, data, len, h)) == NULL)
+		a = attr_alloc(flags, type, data, len, h);
 
 	/* add attribute to the table but first bump refcnt */
 	a->refcnt++;
@@ -169,57 +188,30 @@ attr_copy(struct rde_aspath *t, const st
 }
 
 static inline int
-attr_diff(struct attr *oa, struct attr *ob)
+attr_eq(const struct attr *oa, const struct attr *ob)
 {
-	int r;
-
-	if (ob == NULL)
-		return (1);
-	if (oa == NULL)
-		return (-1);
-	if (oa->flags > ob->flags)
-		return (1);
-	if (oa->flags < ob->flags)
-		return (-1);
-	if (oa->type > ob->type)
-		return (1);
-	if (oa->type < ob->type)
-		return (-1);
-	if (oa->len > ob->len)
-		return (1);
-	if (oa->len < ob->len)
-		return (-1);
-	if (oa->len == 0)
-		return (0);
-	r = memcmp(oa->data, ob->data, oa->len);
-	if (r > 0)
-		return (1);
-	if (r < 0)
-		return (-1);
-	return (0);
+	if (oa->hash != ob->hash)
+		return 0;
+	if (oa->type != ob->type)
+		return 0;
+	if (oa->flags != ob->flags)
+		return 0;
+	if (oa->len != ob->len)
+		return 0;
+	return (oa->len == 0 || memcmp(oa->data, ob->data, oa->len) == 0);
 }
 
 int
-attr_compare(struct rde_aspath *a, struct rde_aspath *b)
+attr_equal(const struct rde_aspath *a, const struct rde_aspath *b)
 {
-	uint8_t l, min;
+	uint8_t l;
 
-	min = a->others_len < b->others_len ? a->others_len : b->others_len;
-	for (l = 0; l < min; l++)
+	if (a->others_len != b->others_len)
+		return (0);
+	for (l = 0; l < a->others_len; l++)
 		if (a->others[l] != b->others[l])
-			return (attr_diff(a->others[l], b->others[l]));
-
-	if (a->others_len < b->others_len) {
-		for (; l < b->others_len; l++)
-			if (b->others[l] != NULL)
-				return (-1);
-	} else if (a->others_len > b->others_len) {
-		for (; l < a->others_len; l++)
-			if (a->others[l] != NULL)
-				return (1);
-	}
-
-	return (0);
+			return (0);
+	return (1);
 }
 
 void
@@ -253,7 +245,7 @@ attr_freeall(struct rde_aspath *asp)
 }
 
 struct attr *
-attr_alloc(uint8_t flags, uint8_t type, void *data, uint16_t len)
+attr_alloc(uint8_t flags, uint8_t type, void *data, uint16_t len, uint64_t hash)
 {
 	struct attr *a;
 
@@ -276,14 +268,17 @@
 	} else
 		a->data = NULL;
 
-	if (RB_INSERT(attr_tree, &attrtable, a) != NULL)
+	a->hash = hash;
+
+	if (CH_INSERT(attr_tree, &attrtable, a, NULL) != 1)
 		fatalx("corrupted attr tree");
 
 	return (a);
 }
 
 struct attr *
-attr_lookup(uint8_t flags, uint8_t type, void *data, uint16_t len)
+attr_lookup(uint8_t flags, uint8_t type, void *data, uint16_t len,
+    uint64_t hash)
 {
 	struct attr	needle;
 
@@ -293,7 +288,9 @@ attr_lookup(uint8_t flags, uint8_t type,
 	needle.type = type;
 	needle.len = len;
 	needle.data = data;
-	return RB_FIND(attr_tree, &attrtable, &needle);
+	needle.hash = hash;
+
+	return CH_FIND(attr_tree, &attrtable, &needle);
 }
 
 void
@@ -308,7 +305,7 @@ attr_put(struct attr *a)
 		return;
 
 	/* unlink */
-	RB_REMOVE(attr_tree, &attrtable, a);
+	CH_REMOVE(attr_tree, &attrtable, a);
 
 	if (a->len != 0)
 		rdemem.attr_dcnt--;
@@ -318,6 +315,8 @@
 	free(a);
 }
 
+CH_GENERATE(attr_tree, attr, attr_eq, attr_hash);
+
 /* aspath specific functions */
 
 static uint16_t	aspath_count(const void *, uint16_t);
@@ -327,7 +326,7 @@ static void aspath_countcopy(struct asp
 	uint16_t, int);
 
 int
-aspath_compare(struct aspath *a1, struct aspath *a2)
+aspath_compare(const struct aspath *a1, const struct aspath *a2)
 {
 	int r;
Index: rde_community.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_community.c,v
diff -u -p -r1.16 rde_community.c
--- rde_community.c	10 Sep 2024 08:53:20 -0000	1.16
+++ rde_community.c	28 Aug 2025 09:34:10 -0000
@@ -19,12 +19,14 @@
 #include <netinet/in.h>
 
 #include <limits.h>
+#include <siphash.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "bgpd.h"
 #include "rde.h"
 #include "log.h"
+#include "chash.h"
 
 static int
 apply_flag(uint32_t in, uint8_t flag, struct rde_peer *peer, uint32_t *out,
@@ -225,9 +227,10 @@ insert_community(struct rde_community *c
 		struct community *new;
 		int newsize = comm->size + 8;
 
-		if ((new = recallocarray(comm->communities, comm->size,
-		    newsize, sizeof(struct community))) == NULL)
+		if ((new = reallocarray(comm->communities, newsize,
+		    sizeof(struct community))) == NULL)
 			fatal(__func__);
+		memset(&new[comm->size], 0, sizeof(struct community) * 8);
 		comm->communities = new;
 		comm->size = newsize;
 	}
@@ -250,6 +253,7 @@ insert_community(struct rde_community *c
 	/* insert community at slot l */
 	comm->communities[l] = *c;
 	comm->nentries++;
+	comm->flags |= PARTIAL_DIRTY;
 }
 
 static int
@@ -391,6 +395,7 @@ struct rde_peer *peer)
 			    (char *)(comm->communities + comm->nentries) -
 			    (char *)(match + 1));
 			comm->nentries--;
+			comm->flags |= PARTIAL_DIRTY;
 			return;
 		} else {
 			if (fc2c(fc, peer, &test, &mask) == -1)
@@ -403,6 +408,7 @@ struct rde_peer *peer)
 			    comm->communities + l + 1,
 			    (comm->nentries - l - 1) * sizeof(test));
 			comm->nentries--;
+			comm->flags |= PARTIAL_DIRTY;
 			continue;
 		}
 		l++;
@@ -620,32 +626,44 @@ community_writebuf(struct rde_community
 
 /*
  * Global RIB cache for communities
 */
-static inline int
-communities_compare(struct rde_community *a, struct rde_community *b)
-{
-	if (a->nentries != b->nentries)
-		return a->nentries > b->nentries ? 1 : -1;
-	if (a->flags != b->flags)
-		return a->flags > b->flags ? 1 : -1;
+static SIPHASH_KEY commkey;
 
-	return memcmp(a->communities, b->communities,
-	    a->nentries * sizeof(struct community));
+static inline uint64_t
+communities_hash(const struct rde_community *comm)
+{
+	return comm->hash;
 }
 
-RB_HEAD(comm_tree, rde_community)	commtable = RB_INITIALIZER(&commtable);
-RB_GENERATE_STATIC(comm_tree, rde_community, entry, communities_compare);
+static inline void
+communities_calc_hash(struct rde_community *comm)
+{
+	SIPHASH_CTX ctx;
+
+	if (comm->flags & PARTIAL_DIRTY) {
+		comm->flags &= ~PARTIAL_DIRTY;
+		SipHash24_Init(&ctx, &commkey);
+		SipHash24_Update(&ctx, comm->communities,
+		    comm->nentries * sizeof(struct community));
+		SipHash24_Update(&ctx, &comm->flags, sizeof(comm->flags));
+		comm->hash = SipHash24_End(&ctx);
+	}
+}
+
+CH_HEAD(comm_tree, rde_community)	commtable = CH_INITIALIZER(&commtable);
+CH_PROTOTYPE(comm_tree, rde_community, communities_hash);
+CH_GENERATE(comm_tree, rde_community, communities_equal, communities_hash);
 
 void
-communities_shutdown(void)
+communities_init(void)
 {
-	if (!RB_EMPTY(&commtable))
-		log_warnx("%s: free non-free table", __func__);
+	arc4random_buf(&commkey, sizeof(commkey));
 }
 
 struct rde_community *
 communities_lookup(struct rde_community *comm)
 {
-	return RB_FIND(comm_tree, &commtable, comm);
+	communities_calc_hash(comm);
+	return CH_FIND(comm_tree, &commtable, comm);
 }
 
 struct rde_community *
@@ -656,8 +674,12 @@ communities_link(struct rde_community *c
 	if ((n = malloc(sizeof(*n))) == NULL)
 		fatal(__func__);
 	communities_copy(n, comm);
+	communities_calc_hash(comm);
 
-	if ((f = RB_INSERT(comm_tree, &commtable, n)) != NULL) {
+	switch (CH_INSERT(comm_tree, &commtable, n, &f)) {
+	case -1:
+		fatal(__func__);
+	case 0:
 		log_warnx("duplicate communities collection inserted");
 		free(n->communities);
 		free(n);
@@ -678,7 +700,7 @@ communities_unlink(struct rde_community
 	if (comm->refcnt != 1)
 		fatalx("%s: unlinking still referenced communities",
 		    __func__);
 
-	RB_REMOVE(comm_tree, &commtable, comm);
+	CH_REMOVE(comm_tree, &commtable, comm);
 
 	rdemem.comm_size -= comm->size;
 	rdemem.comm_nmemb -= comm->nentries;
@@ -693,7 +715,7 @@ communities_unlink(struct rde_community
 * otherwise returns zero.
 */
 int
-communities_equal(struct rde_community *a, struct rde_community *b)
+communities_equal(const struct rde_community *a, const struct rde_community *b)
 {
 	if (a->nentries != b->nentries)
 		return 0;
@@ -718,6 +740,7 @@ communities_copy(struct rde_community *t
 	to->size = from->nentries;
 	to->nentries = from->nentries;
 	to->flags = from->flags;
+	to->hash = from->hash;
 
 	if (to->nentries == 0)
 		return;
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
diff -u -p -r1.269 rde_rib.c
--- rde_rib.c	24 Apr 2025 20:17:49 -0000	1.269
+++ rde_rib.c	15 Sep 2025 12:22:43 -0000
@@ -20,6 +20,7 @@
 #include <sys/queue.h>
 #include <limits.h>
+#include <siphash.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -27,6 +28,7 @@
 #include "bgpd.h"
 #include "rde.h"
 #include "log.h"
+#include "chash.h"
 
 /*
  * BGP RIB -- Routing Information Base
@@ -596,63 +598,64 @@ rib_dump_subtree(uint16_t id, struct bgp
 
 /* path specific functions */
 
-static struct rde_aspath	*path_lookup(struct rde_aspath *);
+static struct rde_aspath	*path_getcache(struct rde_aspath *);
 static void	 path_link(struct rde_aspath *);
 static void	 path_unlink(struct rde_aspath *);
 
-static inline int
-path_compare(struct rde_aspath *a, struct rde_aspath *b)
+static SIPHASH_KEY pathkey;
+
+static inline uint64_t
+path_hash(const struct rde_aspath *p)
 {
-	int r;
+	return p->hash;
+}
+
+static uint64_t
+path_calc_hash(const struct rde_aspath *p)
+{
+	SIPHASH_CTX ctx;
+
+	SipHash24_Init(&ctx, &pathkey);
+	SipHash24_Update(&ctx, PATH_HASHSTART(p), PATH_HASHSIZE);
+	SipHash24_Update(&ctx, aspath_dump(p->aspath),
+	    aspath_length(p->aspath));
+	return SipHash24_End(&ctx);
+}
 
+static inline int
+path_equal(const struct rde_aspath *a, const struct rde_aspath *b)
+{
 	if (a == NULL && b == NULL)
-		return (0);
-	else if (b == NULL)
 		return (1);
+	else if (b == NULL)
+		return (0);
 	else if (a == NULL)
-		return (-1);
-	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
-		return (1);
-	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
-		return (-1);
-	if (a->origin > b->origin)
-		return (1);
-	if (a->origin < b->origin)
-		return (-1);
-	if (a->med > b->med)
-		return (1);
-	if (a->med < b->med)
-		return (-1);
-	if (a->lpref > b->lpref)
-		return (1);
-	if (a->lpref < b->lpref)
-		return (-1);
-	if (a->weight > b->weight)
-		return (1);
-	if (a->weight < b->weight)
-		return (-1);
-	if (a->rtlabelid > b->rtlabelid)
-		return (1);
-	if (a->rtlabelid < b->rtlabelid)
-		return (-1);
-	if (a->pftableid > b->pftableid)
-		return (1);
-	if (a->pftableid < b->pftableid)
-		return (-1);
+		return (0);
 
-	/* no need to check aspa_state or aspa_generation */
+	if ((a->flags & ~F_ATTR_LINKED) != (b->flags & ~F_ATTR_LINKED))
+		return (0);
+	if (a->origin != b->origin)
+		return (0);
+	if (a->med != b->med)
+		return (0);
+	if (a->lpref != b->lpref)
+		return (0);
+	if (a->weight != b->weight)
+		return (0);
+	if (a->rtlabelid != b->rtlabelid)
+		return (0);
+	if (a->pftableid != b->pftableid)
+		return (0);
 
-	r = aspath_compare(a->aspath, b->aspath);
-	if (r > 0)
-		return (1);
-	if (r < 0)
-		return (-1);
+	/* no need to check aspa_state or aspa_generation */
 
-	return (attr_compare(a, b));
+	if (aspath_compare(a->aspath, b->aspath) != 0)
+		return (0);
+	return (attr_equal(a, b));
 }
 
-RB_HEAD(path_tree, rde_aspath)	pathtable = RB_INITIALIZER(&pathtable);
-RB_GENERATE_STATIC(path_tree, rde_aspath, entry, path_compare);
+CH_HEAD(path_tree, rde_aspath)	pathtable = CH_INITIALIZER(&pathtable);
+CH_PROTOTYPE(path_tree, rde_aspath, path_hash);
 
 static inline struct rde_aspath *
 path_ref(struct rde_aspath *asp)
@@ -679,16 +682,23 @@ path_unref(struct rde_aspath *asp)
 }
 
 void
-path_shutdown(void)
+path_init(void)
 {
-	if (!RB_EMPTY(&pathtable))
-		log_warnx("path_free: free non-free table");
+	arc4random_buf(&pathkey, sizeof(pathkey));
 }
 
 static struct rde_aspath *
-path_lookup(struct rde_aspath *aspath)
+path_getcache(struct rde_aspath *aspath)
 {
-	return (RB_FIND(path_tree, &pathtable, aspath));
+	struct rde_aspath *asp;
+
+	aspath->hash = path_calc_hash(aspath);
+	if ((asp = CH_FIND(path_tree, &pathtable, aspath)) == NULL) {
+		/* Path not available, create and link a new one. */
+		asp = path_copy(path_get(), aspath);
+		path_link(asp);
+	}
+	return asp;
 }
 
 /*
@@ -698,7 +708,7 @@
 static void
 path_link(struct rde_aspath *asp)
 {
-	if (RB_INSERT(path_tree, &pathtable, asp) != NULL)
+	if (CH_INSERT(path_tree, &pathtable, asp, NULL) != 1)
 		fatalx("%s: already linked object", __func__);
 	asp->flags |= F_ATTR_LINKED;
 }
@@ -717,7 +727,7 @@
 	if (asp->refcnt != 0)
 		fatalx("%s: still holds references", __func__);
 
-	RB_REMOVE(path_tree, &pathtable, asp);
+	CH_REMOVE(path_tree, &pathtable, asp);
 	asp->flags &= ~F_ATTR_LINKED;
 
 	path_put(asp);
@@ -734,6 +744,7 @@
 
 	dst->refcnt = 0;
 	dst->flags = src->flags & ~F_ATTR_LINKED;
+	dst->hash = src->hash;
 	dst->med = src->med;
 	dst->lpref = src->lpref;
 	dst->weight = src->weight;
@@ -798,6 +809,8 @@
 	free(asp);
 }
 
+CH_GENERATE(path_tree, rde_aspath, path_equal, path_hash);
+
 /* prefix specific functions */
 
 static int	prefix_add(struct bgpd_addr *, int, struct rib *,
@@ -985,7 +998,7 @@ prefix_update(struct rib *rib, struct rd
 	if (prefix_nexthop(p) == state->nexthop &&
 	    prefix_nhflags(p) == state->nhflags &&
 	    communities_equal(ncomm, prefix_communities(p)) &&
-	    path_compare(nasp, prefix_aspath(p)) == 0) {
+	    path_equal(nasp, prefix_aspath(p))) {
 		/* no change, update last change */
 		p->lastchange = getmonotime();
 		p->validation_state = state->vstate;
@@ -1002,11 +1015,7 @@
 	 * In both cases lookup the new aspath to make sure it is not
 	 * already in the RIB.
 	 */
-	if ((asp = path_lookup(nasp)) == NULL) {
-		/* Path not available, create and link a new one. */
-		asp = path_copy(path_get(), nasp);
-		path_link(asp);
-	}
+	asp = path_getcache(nasp);
 
 	if ((comm = communities_lookup(ncomm)) == NULL) {
 		/* Communities not available, create and link a new one. */
 		comm = communities_link(ncomm);
@@ -1153,11 +1162,8 @@ prefix_flowspec_update(struct rde_peer *
 	nasp = &state->aspath;
 	ncomm = &state->communities;
 
-	if ((asp = path_lookup(nasp)) == NULL) {
-		/* Path not available, create and link a new one. */
-		asp = path_copy(path_get(), nasp);
-		path_link(asp);
-	}
+	asp = path_getcache(nasp);
+
 	if ((comm = communities_lookup(ncomm)) == NULL) {
 		/* Communities not available, create and link a new one. */
 		comm = communities_link(ncomm);
@@ -1271,7 +1277,7 @@ prefix_adjout_update(struct prefix *p, s
 	    prefix_nexthop(p) == state->nexthop &&
 	    communities_equal(&state->communities,
 	    prefix_communities(p)) &&
-	    path_compare(&state->aspath, prefix_aspath(p)) == 0) {
+	    path_equal(&state->aspath, prefix_aspath(p))) {
 		/* nothing changed */
 		p->validation_state = state->vstate;
 		p->lastchange = getmonotime();
@@ -1306,11 +1312,7 @@
 		fatalx("%s: RB index invariant violated", __func__);
 	}
 
-	if ((asp = path_lookup(&state->aspath)) == NULL) {
-		/* Path not available, create and link a new one. */
-		asp = path_copy(path_get(), &state->aspath);
-		path_link(asp);
-	}
+	asp = path_getcache(&state->aspath);
 
 	if ((comm = communities_lookup(&state->communities)) == NULL) {
 		/* Communities not available, create and link a new one. */
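
For illustration, here is a minimal usage sketch of the CH API as defined
in chash.h above. This sketch is not part of the diff; struct obj, obj_eq(),
obj_hash(), objkey and obj_add() are made-up names, and the SipHash keying
follows the pattern used for attrkey in rde_attr.c (objkey would be filled
with arc4random_buf() at startup):

	struct obj {
		uint64_t	hash;	/* cached hash, like struct attr */
		int		key;
	};

	static SIPHASH_KEY objkey;

	static inline uint64_t
	obj_hash(const struct obj *o)
	{
		return o->hash;
	}

	static inline int
	obj_eq(const struct obj *a, const struct obj *b)
	{
		return a->hash == b->hash && a->key == b->key;
	}

	CH_HEAD(obj_set, obj)	objtable = CH_INITIALIZER(&objtable);
	CH_PROTOTYPE(obj_set, obj, obj_hash);
	CH_GENERATE(obj_set, obj, obj_eq, obj_hash);

	static int
	obj_add(struct obj *o)
	{
		struct obj *dup;

		o->hash = SipHash24(&objkey, &o->key, sizeof(o->key));
		switch (CH_INSERT(obj_set, &objtable, o, &dup)) {
		case -1:
			return -1;	/* allocation failure */
		case 0:
			return 0;	/* equal element, dup points to it */
		default:
			return 1;	/* inserted, dup is NULL */
		}
	}

Iteration works the same way as in the RDE users:

	struct ch_iter iter;
	struct obj *e;

	CH_FOREACH(e, obj_set, &objtable, &iter)
		/* visits each element in no particular order */;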