From: Mark Kettenis Subject: Re: gprof: Profiling a multi-threaded application (revived) To: Yuichiro NAITO Cc: tech@openbsd.org, deraadt@openbsd.org, guenther@openbsd.org, mpi@openbsd.org Date: Thu, 12 Jun 2025 16:12:30 +0200 > Date: Fri, 06 Jun 2025 17:43:10 +0900 (JST) > From: Yuichiro NAITO Hi Yuichiro, This is changing the interface between libc and libpthread. This is tricky. Minimally this needs libc and libpthread minor bumps. But it is more complicated than that and I have trouble wrapping my head around exactly how that works again. guenther@ should know more. > > Small update to my patch. > > From: Yuichiro NAITO > Subject: Re: gprof: Profiling a multi-threaded application (revived) > Date: Thu, 05 Jun 2025 18:27:20 +0900 (JST) > > > I updated the multi-thread profiling patch through discussion with > > Theo de Raadt. I'll list the improvements. > > > > 1. Allocate kcount buffer memory when a thread starts. > > > > The allocation logic becomes simple and also reduces the patch size. > > This needs the '_gmon_alloc' symbol to be exported for the libpthread. > > > > 2. Allocate buffer memory on a single mmap(2) call. > > > > This is the same way as Theo's commit. > > https://github.com/OpenBSD/src/commit/4ea76518007ed1dc99d51b18368b53ba7b11fd17 > > I misunderstood Theo's intention about pointer calculation. I should > hold the return value of mmap in a char * variable, so that I don't > need code-width cast. > > > 3. Mark 'static' to '_gmonfree', '_gmoninuse', '_gmonkey' variables. > > > > These are used only in the 'gmon.c'. > > > > 4. Refactor 'gmonparam' lookup code in the __mcount function. > > > > Reduce the patch size. > > > > diff --git a/lib/libc/Symbols.list b/lib/libc/Symbols.list > index 6eb2cca93e5..d5d5497cf93 100644 > --- a/lib/libc/Symbols.list > +++ b/lib/libc/Symbols.list > @@ -880,6 +880,7 @@ _mcleanup > _monstartup > moncontrol > monstartup > +_gmon_alloc > > /* hash */ > MD5Data > diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c > index 99fd42d7824..93176edd11d 100644 > --- a/lib/libc/gmon/gmon.c > +++ b/lib/libc/gmon/gmon.c > @@ -42,12 +42,21 @@ > > struct gmonparam _gmonparam = { GMON_PROF_OFF }; > > +#include > +#include > + > +static SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree); > +static SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse); > +_THREAD_PRIVATE_MUTEX(_gmonlock); > +static pthread_key_t _gmonkey; > + > static int s_scale; > /* see profil(2) where this is describe (incorrectly) */ > #define SCALE_1_TO_1 0x10000L > > #define ERR(s) write(STDERR_FILENO, s, sizeof(s)) > > +PROTO_NORMAL(_gmon_alloc); > PROTO_NORMAL(moncontrol); > PROTO_DEPRECATED(monstartup); > > @@ -62,6 +71,19 @@ monstartup(u_long lowpc, u_long highpc) > #define PAGEMASK (PAGESIZE - 1) > #define PAGEROUND(x) (((x) + (PAGEMASK)) & ~PAGEMASK) > > +static void > +_gmon_destructor(void *arg) > +{ > + struct gmonparam *p = arg, *q, **prev; > + > + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); > + SLIST_REMOVE(&_gmoninuse, p, gmonparam, list); > + SLIST_INSERT_HEAD(&_gmonfree, p, list); > + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); > + > + pthread_setspecific(_gmonkey, NULL); > +} > + > void > _monstartup(u_long lowpc, u_long highpc) > { > @@ -130,12 +152,166 @@ _monstartup(u_long lowpc, u_long highpc) > else > p->dirfd = -1; > > + pthread_key_create(&_gmonkey, _gmon_destructor); > + > moncontrol(1); > > if (p->dirfd != -1) > close(p->dirfd); > } > > +struct gmonparam * > +_gmon_alloc(void) > +{ > + struct gmonparam *p; > + char *a; > + > + if (_gmonparam.state == GMON_PROF_OFF) > + return NULL; > + > + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); > + p = SLIST_FIRST(&_gmonfree); > + if (p != NULL) { > + SLIST_REMOVE_HEAD(&_gmonfree, list); > + SLIST_INSERT_HEAD(&_gmoninuse, p, list); > + } else { > + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); > + a = mmap(NULL, > + _ALIGN(sizeof(*p)) + _ALIGN(_gmonparam.fromssize) + > + _ALIGN(_gmonparam.tossize), > + PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); > + if (a == MAP_FAILED) { > + pthread_setspecific(_gmonkey, NULL); > + ERR("_gmon_alloc: out of memory\n"); > + return NULL; > + } > + p = (struct gmonparam *)a; > + *p = _gmonparam; > + p->kcount = NULL; > + p->kcountsize = 0; > + p->froms = (void *)(a + _ALIGN(sizeof(*p))); > + p->tos = (void *)(a + _ALIGN(sizeof(*p)) + > + _ALIGN(_gmonparam.fromssize)); > + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); > + SLIST_INSERT_HEAD(&_gmoninuse, p, list); > + } > + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); > + pthread_setspecific(_gmonkey, p); > + > + return p; > +} > +DEF_WEAK(_gmon_alloc); > + > +static void > +_gmon_merge_two(struct gmonparam *p, struct gmonparam *q) > +{ > + u_long fromindex, selfpc, endfrom; > + u_short *frompcindex, qtoindex, toindex; > + long count; > + struct tostruct *top; > + > + endfrom = (q->fromssize / sizeof(*q->froms)); > + for (fromindex = 0; fromindex < endfrom; fromindex++) { > + if (q->froms[fromindex] == 0) > + continue; > + for (qtoindex = q->froms[fromindex]; qtoindex != 0; > + qtoindex = q->tos[qtoindex].link) { > + selfpc = q->tos[qtoindex].selfpc; > + count = q->tos[qtoindex].count; > + /* cribbed from mcount */ > + frompcindex = &p->froms[fromindex]; > + toindex = *frompcindex; > + if (toindex == 0) { > + /* > + * first time traversing this arc > + */ > + toindex = ++p->tos[0].link; > + if (toindex >= p->tolimit) > + /* halt further profiling */ > + goto overflow; > + > + *frompcindex = (u_short)toindex; > + top = &p->tos[(size_t)toindex]; > + top->selfpc = selfpc; > + top->count = count; > + top->link = 0; > + goto done; > + } > + top = &p->tos[(size_t)toindex]; > + if (top->selfpc == selfpc) { > + /* > + * arc at front of chain; usual case. > + */ > + top->count+= count; > + goto done; > + } > + /* > + * have to go looking down chain for it. > + * top points to what we are looking at, > + * we know it is not at the head of the chain. > + */ > + for (; /* goto done */; ) { > + if (top->link == 0) { > + /* > + * top is end of the chain and > + * none of the chain had > + * top->selfpc == selfpc. so > + * we allocate a new tostruct > + * and link it to the head of > + * the chain. > + */ > + toindex = ++p->tos[0].link; > + if (toindex >= p->tolimit) > + goto overflow; > + top = &p->tos[(size_t)toindex]; > + top->selfpc = selfpc; > + top->count = count; > + top->link = *frompcindex; > + *frompcindex = (u_short)toindex; > + goto done; > + } > + /* > + * otherwise, check the next arc on the chain. > + */ > + top = &p->tos[top->link]; > + if (top->selfpc == selfpc) { > + /* > + * there it is. > + * add to its count. > + */ > + top->count += count; > + goto done; > + } > + > + } > + > + done: ; > + } > + > + } > +overflow: ; > + > +} > + > +static void > +_gmon_merge(void) > +{ > + struct gmonparam *q; > + > + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); > + > + SLIST_FOREACH(q, &_gmonfree, list) > + _gmon_merge_two(&_gmonparam, q); > + > + SLIST_FOREACH(q, &_gmoninuse, list) { > + q->state = GMON_PROF_OFF; > + _gmon_merge_two(&_gmonparam, q); > + } > + > + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); > +} > + > + > void > _mcleanup(void) > { > @@ -179,6 +355,9 @@ _mcleanup(void) > p->kcount, p->kcountsize); > write(log, dbuf, strlen(dbuf)); > #endif > + > + _gmon_merge(); > + > hdr = (struct gmonhdr *)p->outbuf; > hdr->lpc = p->lowpc; > hdr->hpc = p->highpc; > diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c > index 6846c10c857..f5ecb3f7cdc 100644 > --- a/lib/libc/gmon/mcount.c > +++ b/lib/libc/gmon/mcount.c > @@ -31,6 +31,11 @@ > #include > #include > > +#ifndef _KERNEL > +#include > +#include > +#endif > + > /* > * mcount is called on entry to each function compiled with the profiling > * switch set. _mcount(), which is declared in a machine-dependent way > @@ -65,13 +70,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc) > if ((p = curcpu()->ci_gmon) == NULL) > return; > #else > - p = &_gmonparam; > + p = (__isthreaded) ? > + ((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam; > #endif > /* > * check that we are profiling > * and that we aren't recursively invoked. > */ > - if (p->state != GMON_PROF_ON) > + if (p == NULL || p->state != GMON_PROF_ON) > return; > #ifdef _KERNEL > MCOUNT_ENTER; > diff --git a/lib/libc/include/thread_private.h b/lib/libc/include/thread_private.h > index 1ec10711615..6b8164bab43 100644 > --- a/lib/libc/include/thread_private.h > +++ b/lib/libc/include/thread_private.h > @@ -5,6 +5,9 @@ > #ifndef _THREAD_PRIVATE_H_ > #define _THREAD_PRIVATE_H_ > > +#include > +#include > + > extern int __isthreaded; > > #define _MALLOC_MUTEXES 32 > @@ -390,6 +393,7 @@ struct pthread { > > /* cancel received in a delayed cancel block? */ > int delayed_cancel; > + struct gmonparam *gmonparam; > }; > /* flags in pthread->flags */ > #define THREAD_DONE 0x001 > diff --git a/lib/libc/thread/rthread.c b/lib/libc/thread/rthread.c > index ae74aceb51b..f8116e8a0e0 100644 > --- a/lib/libc/thread/rthread.c > +++ b/lib/libc/thread/rthread.c > @@ -21,6 +21,7 @@ > > #include > #include > +#include > > #include > #include > @@ -103,6 +104,9 @@ _rthread_init(void) > } > > _threads_inited = 1; > + > + /* Ignore errors. NULL is OK for a non-profiling case. */ > + thread->gmonparam = _gmon_alloc(); > } > > /* > diff --git a/lib/librthread/rthread.c b/lib/librthread/rthread.c > index 53f1d9ca82b..3389ea6baa9 100644 > --- a/lib/librthread/rthread.c > +++ b/lib/librthread/rthread.c > @@ -25,6 +25,7 @@ > #include > #pragma weak _DYNAMIC > #endif > +#include > > #include > #include > @@ -393,6 +394,10 @@ pthread_create(pthread_t *threadp, const pthread_attr_t *attr, > > /* we're going to be multi-threaded real soon now */ > __isthreaded = 1; > + > + /* Ignore errors. NULL is OK for a non-profiling case. */ > + thread->gmonparam = _gmon_alloc(); > + > rc = __tfork_thread(¶m, sizeof(param), _rthread_start, thread); > if (rc != -1) { > /* success */ > diff --git a/sys/lib/libkern/mcount.c b/sys/lib/libkern/mcount.c > index e020dede07a..59b815a02d5 100644 > --- a/sys/lib/libkern/mcount.c > +++ b/sys/lib/libkern/mcount.c > @@ -33,6 +33,11 @@ > #include > #include > > +#ifndef _KERNEL > +#include > +#include > +#endif > + > /* > * mcount is called on entry to each function compiled with the profiling > * switch set. _mcount(), which is declared in a machine-dependent way > @@ -67,13 +72,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc) > if ((p = curcpu()->ci_gmon) == NULL) > return; > #else > - p = &_gmonparam; > + p = (__isthreaded) ? > + ((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam; > #endif > /* > * check that we are profiling > * and that we aren't recursively invoked. > */ > - if (p->state != GMON_PROF_ON) > + if (p == NULL || p->state != GMON_PROF_ON) > return; > #ifdef _KERNEL > MCOUNT_ENTER; > diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h > index baaf7743be5..a29bfa849d9 100644 > --- a/sys/sys/gmon.h > +++ b/sys/sys/gmon.h > @@ -35,6 +35,7 @@ > #ifndef _SYS_GMON_H_ > #define _SYS_GMON_H_ > > +#include > #include > > /* > @@ -141,6 +142,7 @@ struct gmonparam { > size_t outbuflen; > void *rawarcs; > int dirfd; > + SLIST_ENTRY(gmonparam) list; > }; > > /* > @@ -174,6 +176,7 @@ void _monstartup(u_long, u_long); > void moncontrol(int); > /* XXX remove end of may 2025 */ > void monstartup(u_long, u_long); > +struct gmonparam *_gmon_alloc(void); > __END_DECLS > > #endif /* !_KERNEL */ > > -- > Yuichiro NAITO (naito.yuichiro@gmail.com) > >