From: Yuichiro NAITO Subject: Re: gprof: Profiling a multi-threaded application (revived) To: tech@openbsd.org, deraadt@openbsd.org, guenther@openbsd.org, mpi@openbsd.org Date: Fri, 06 Jun 2025 17:43:10 +0900 Small update to my patch. From: Yuichiro NAITO Subject: Re: gprof: Profiling a multi-threaded application (revived) Date: Thu, 05 Jun 2025 18:27:20 +0900 (JST) > I updated the multi-thread profiling patch through discussion with > Theo de Raadt. I'll list the improvements. > > 1. Allocate kcount buffer memory when a thread starts. > > The allocation logic becomes simple and also reduces the patch size. > This needs the '_gmon_alloc' symbol to be exported for the libpthread. > > 2. Allocate buffer memory on a single mmap(2) call. > > This is the same way as Theo's commit. > https://github.com/OpenBSD/src/commit/4ea76518007ed1dc99d51b18368b53ba7b11fd17 I misunderstood Theo's intention about pointer calculation. I should hold the return value of mmap in a char * variable, so that I don't need code-width cast. > 3. Mark 'static' to '_gmonfree', '_gmoninuse', '_gmonkey' variables. > > These are used only in the 'gmon.c'. > > 4. Refactor 'gmonparam' lookup code in the __mcount function. > > Reduce the patch size. > diff --git a/lib/libc/Symbols.list b/lib/libc/Symbols.list index 6eb2cca93e5..d5d5497cf93 100644 --- a/lib/libc/Symbols.list +++ b/lib/libc/Symbols.list @@ -880,6 +880,7 @@ _mcleanup _monstartup moncontrol monstartup +_gmon_alloc /* hash */ MD5Data diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c index 99fd42d7824..93176edd11d 100644 --- a/lib/libc/gmon/gmon.c +++ b/lib/libc/gmon/gmon.c @@ -42,12 +42,21 @@ struct gmonparam _gmonparam = { GMON_PROF_OFF }; +#include +#include + +static SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree); +static SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse); +_THREAD_PRIVATE_MUTEX(_gmonlock); +static pthread_key_t _gmonkey; + static int s_scale; /* see profil(2) where this is describe (incorrectly) */ #define SCALE_1_TO_1 0x10000L #define ERR(s) write(STDERR_FILENO, s, sizeof(s)) +PROTO_NORMAL(_gmon_alloc); PROTO_NORMAL(moncontrol); PROTO_DEPRECATED(monstartup); @@ -62,6 +71,19 @@ monstartup(u_long lowpc, u_long highpc) #define PAGEMASK (PAGESIZE - 1) #define PAGEROUND(x) (((x) + (PAGEMASK)) & ~PAGEMASK) +static void +_gmon_destructor(void *arg) +{ + struct gmonparam *p = arg, *q, **prev; + + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); + SLIST_REMOVE(&_gmoninuse, p, gmonparam, list); + SLIST_INSERT_HEAD(&_gmonfree, p, list); + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); + + pthread_setspecific(_gmonkey, NULL); +} + void _monstartup(u_long lowpc, u_long highpc) { @@ -130,12 +152,166 @@ _monstartup(u_long lowpc, u_long highpc) else p->dirfd = -1; + pthread_key_create(&_gmonkey, _gmon_destructor); + moncontrol(1); if (p->dirfd != -1) close(p->dirfd); } +struct gmonparam * +_gmon_alloc(void) +{ + struct gmonparam *p; + char *a; + + if (_gmonparam.state == GMON_PROF_OFF) + return NULL; + + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); + p = SLIST_FIRST(&_gmonfree); + if (p != NULL) { + SLIST_REMOVE_HEAD(&_gmonfree, list); + SLIST_INSERT_HEAD(&_gmoninuse, p, list); + } else { + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); + a = mmap(NULL, + _ALIGN(sizeof(*p)) + _ALIGN(_gmonparam.fromssize) + + _ALIGN(_gmonparam.tossize), + PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if (a == MAP_FAILED) { + pthread_setspecific(_gmonkey, NULL); + ERR("_gmon_alloc: out of memory\n"); + return NULL; + } + p = (struct gmonparam *)a; + *p = _gmonparam; + p->kcount = NULL; + p->kcountsize = 0; + p->froms = (void *)(a + _ALIGN(sizeof(*p))); + p->tos = (void *)(a + _ALIGN(sizeof(*p)) + + _ALIGN(_gmonparam.fromssize)); + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); + SLIST_INSERT_HEAD(&_gmoninuse, p, list); + } + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); + pthread_setspecific(_gmonkey, p); + + return p; +} +DEF_WEAK(_gmon_alloc); + +static void +_gmon_merge_two(struct gmonparam *p, struct gmonparam *q) +{ + u_long fromindex, selfpc, endfrom; + u_short *frompcindex, qtoindex, toindex; + long count; + struct tostruct *top; + + endfrom = (q->fromssize / sizeof(*q->froms)); + for (fromindex = 0; fromindex < endfrom; fromindex++) { + if (q->froms[fromindex] == 0) + continue; + for (qtoindex = q->froms[fromindex]; qtoindex != 0; + qtoindex = q->tos[qtoindex].link) { + selfpc = q->tos[qtoindex].selfpc; + count = q->tos[qtoindex].count; + /* cribbed from mcount */ + frompcindex = &p->froms[fromindex]; + toindex = *frompcindex; + if (toindex == 0) { + /* + * first time traversing this arc + */ + toindex = ++p->tos[0].link; + if (toindex >= p->tolimit) + /* halt further profiling */ + goto overflow; + + *frompcindex = (u_short)toindex; + top = &p->tos[(size_t)toindex]; + top->selfpc = selfpc; + top->count = count; + top->link = 0; + goto done; + } + top = &p->tos[(size_t)toindex]; + if (top->selfpc == selfpc) { + /* + * arc at front of chain; usual case. + */ + top->count+= count; + goto done; + } + /* + * have to go looking down chain for it. + * top points to what we are looking at, + * we know it is not at the head of the chain. + */ + for (; /* goto done */; ) { + if (top->link == 0) { + /* + * top is end of the chain and + * none of the chain had + * top->selfpc == selfpc. so + * we allocate a new tostruct + * and link it to the head of + * the chain. + */ + toindex = ++p->tos[0].link; + if (toindex >= p->tolimit) + goto overflow; + top = &p->tos[(size_t)toindex]; + top->selfpc = selfpc; + top->count = count; + top->link = *frompcindex; + *frompcindex = (u_short)toindex; + goto done; + } + /* + * otherwise, check the next arc on the chain. + */ + top = &p->tos[top->link]; + if (top->selfpc == selfpc) { + /* + * there it is. + * add to its count. + */ + top->count += count; + goto done; + } + + } + + done: ; + } + + } +overflow: ; + +} + +static void +_gmon_merge(void) +{ + struct gmonparam *q; + + _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock); + + SLIST_FOREACH(q, &_gmonfree, list) + _gmon_merge_two(&_gmonparam, q); + + SLIST_FOREACH(q, &_gmoninuse, list) { + q->state = GMON_PROF_OFF; + _gmon_merge_two(&_gmonparam, q); + } + + _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock); +} + + void _mcleanup(void) { @@ -179,6 +355,9 @@ _mcleanup(void) p->kcount, p->kcountsize); write(log, dbuf, strlen(dbuf)); #endif + + _gmon_merge(); + hdr = (struct gmonhdr *)p->outbuf; hdr->lpc = p->lowpc; hdr->hpc = p->highpc; diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c index 6846c10c857..f5ecb3f7cdc 100644 --- a/lib/libc/gmon/mcount.c +++ b/lib/libc/gmon/mcount.c @@ -31,6 +31,11 @@ #include #include +#ifndef _KERNEL +#include +#include +#endif + /* * mcount is called on entry to each function compiled with the profiling * switch set. _mcount(), which is declared in a machine-dependent way @@ -65,13 +70,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc) if ((p = curcpu()->ci_gmon) == NULL) return; #else - p = &_gmonparam; + p = (__isthreaded) ? + ((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam; #endif /* * check that we are profiling * and that we aren't recursively invoked. */ - if (p->state != GMON_PROF_ON) + if (p == NULL || p->state != GMON_PROF_ON) return; #ifdef _KERNEL MCOUNT_ENTER; diff --git a/lib/libc/include/thread_private.h b/lib/libc/include/thread_private.h index 1ec10711615..6b8164bab43 100644 --- a/lib/libc/include/thread_private.h +++ b/lib/libc/include/thread_private.h @@ -5,6 +5,9 @@ #ifndef _THREAD_PRIVATE_H_ #define _THREAD_PRIVATE_H_ +#include +#include + extern int __isthreaded; #define _MALLOC_MUTEXES 32 @@ -390,6 +393,7 @@ struct pthread { /* cancel received in a delayed cancel block? */ int delayed_cancel; + struct gmonparam *gmonparam; }; /* flags in pthread->flags */ #define THREAD_DONE 0x001 diff --git a/lib/libc/thread/rthread.c b/lib/libc/thread/rthread.c index ae74aceb51b..f8116e8a0e0 100644 --- a/lib/libc/thread/rthread.c +++ b/lib/libc/thread/rthread.c @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -103,6 +104,9 @@ _rthread_init(void) } _threads_inited = 1; + + /* Ignore errors. NULL is OK for a non-profiling case. */ + thread->gmonparam = _gmon_alloc(); } /* diff --git a/lib/librthread/rthread.c b/lib/librthread/rthread.c index 53f1d9ca82b..3389ea6baa9 100644 --- a/lib/librthread/rthread.c +++ b/lib/librthread/rthread.c @@ -25,6 +25,7 @@ #include #pragma weak _DYNAMIC #endif +#include #include #include @@ -393,6 +394,10 @@ pthread_create(pthread_t *threadp, const pthread_attr_t *attr, /* we're going to be multi-threaded real soon now */ __isthreaded = 1; + + /* Ignore errors. NULL is OK for a non-profiling case. */ + thread->gmonparam = _gmon_alloc(); + rc = __tfork_thread(¶m, sizeof(param), _rthread_start, thread); if (rc != -1) { /* success */ diff --git a/sys/lib/libkern/mcount.c b/sys/lib/libkern/mcount.c index e020dede07a..59b815a02d5 100644 --- a/sys/lib/libkern/mcount.c +++ b/sys/lib/libkern/mcount.c @@ -33,6 +33,11 @@ #include #include +#ifndef _KERNEL +#include +#include +#endif + /* * mcount is called on entry to each function compiled with the profiling * switch set. _mcount(), which is declared in a machine-dependent way @@ -67,13 +72,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc) if ((p = curcpu()->ci_gmon) == NULL) return; #else - p = &_gmonparam; + p = (__isthreaded) ? + ((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam; #endif /* * check that we are profiling * and that we aren't recursively invoked. */ - if (p->state != GMON_PROF_ON) + if (p == NULL || p->state != GMON_PROF_ON) return; #ifdef _KERNEL MCOUNT_ENTER; diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h index baaf7743be5..a29bfa849d9 100644 --- a/sys/sys/gmon.h +++ b/sys/sys/gmon.h @@ -35,6 +35,7 @@ #ifndef _SYS_GMON_H_ #define _SYS_GMON_H_ +#include #include /* @@ -141,6 +142,7 @@ struct gmonparam { size_t outbuflen; void *rawarcs; int dirfd; + SLIST_ENTRY(gmonparam) list; }; /* @@ -174,6 +176,7 @@ void _monstartup(u_long, u_long); void moncontrol(int); /* XXX remove end of may 2025 */ void monstartup(u_long, u_long); +struct gmonparam *_gmon_alloc(void); __END_DECLS #endif /* !_KERNEL */ -- Yuichiro NAITO (naito.yuichiro@gmail.com)