Download raw body.
gprof: Profiling a multi-threaded application (revived)
Hi, thanks for reviewing my patch.
From: Mark Kettenis <mark.kettenis@xs4all.nl>
Subject: Re: gprof: Profiling a multi-threaded application (revived)
Date: Thu, 12 Jun 2025 16:53:22 +0200
>> From: "Theo de Raadt" <deraadt@openbsd.org>
>> Date: Thu, 12 Jun 2025 08:16:31 -0600
>>
>> Mark Kettenis <mark.kettenis@xs4all.nl> wrote:
>>
>> > > Date: Fri, 06 Jun 2025 17:43:10 +0900 (JST)
>> > > From: Yuichiro NAITO <naito.yuichiro@gmail.com>
>> >
>> > Hi Yuichiro,
>> >
>> > This is changing the interface between libc and libpthread. This is
>> > tricky. Minimally this needs libc and libpthread minor bumps. But it
>> > is more complicated than that and I have trouble wrapping my head
>> > around exactly how that works again. guenther@ should know more.
>>
>> I am not concerned, because profiling only works in static binaries.
>
> The problem is the following bit of code:
>
> + /* Ignore errors. NULL is OK for a non-profiling case. */
> + thread->gmonparam = _gmon_alloc();
>
> which is executed unconditionally. That gmonparam member lives in a
> struct whose stotage is allocated by libc. So if you end up combining
> an old libc that doesn't allocate the storage for the gmonparam member
> with a new libpthread that writes to it you'll overwrite something
> else.
>
> That said, in that case you'd already lose because you have an
> unresolved reference to _gmon_alloc(), so your binary wouldn't run in
> the first place.
>
>> In dynamic binaries, I don't believe any of this code is active because
>> gcrt0 doesn't get linked in to create the conditions.
>>
>> As for the cranks. It adds a symbol between the two libraries, so it
>> needs minor cranks. But libc needs a major bump for removal of a
>> different symbol, so we'll just major bump both libraries. And that
>> ends any concern there, I think.
>
> I think a major bump for both would indeed be necessary.
> Unfortunately libpthread major bumps are something we haven't been
> able to do "right" in the past. In principle we'd need to do bump all
> the libs linked against libpthread. In base that means the xenocara
> libs, but I don't think we have a list. But those only really get
> used by ports and so we can just declare a flag day where we tell
> people some ports may not run until they run pkg_add -u?
What do you think about splitting my patch into two parts?
First, we update libc only and bump the libc major. At this point,
we add just the _gmon_alloc function and export the symbol. No one
calls the _gmon_alloc function; it prepares the following updates.
And then wait for the next release that will update xenocara and
all the packages to link to the new libc.
After the new release, we update the remaining part of gmon and
libpthread, and also bump the libpthread major. We don't need to
bump libc major for the second patch. Because it doesn't add
new functions.
This step-by-step approach avoids the combination problem between
libc and libpthread. I think the libpthread bump happened in the
OpenBSD history. We can follow the same manner as before.
Here is the first patch.
diff --git a/lib/libc/Symbols.list b/lib/libc/Symbols.list
index 6eb2cca93e5..d5d5497cf93 100644
--- a/lib/libc/Symbols.list
+++ b/lib/libc/Symbols.list
@@ -880,6 +880,7 @@ _mcleanup
_monstartup
moncontrol
monstartup
+_gmon_alloc
/* hash */
MD5Data
diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
index 99fd42d7824..54e89aef2c6 100644
--- a/lib/libc/gmon/gmon.c
+++ b/lib/libc/gmon/gmon.c
@@ -42,12 +42,21 @@
struct gmonparam _gmonparam = { GMON_PROF_OFF };
+#include <pthread.h>
+#include <thread_private.h>
+
+static SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
+static SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
+_THREAD_PRIVATE_MUTEX(_gmonlock);
+static pthread_key_t _gmonkey;
+
static int s_scale;
/* see profil(2) where this is describe (incorrectly) */
#define SCALE_1_TO_1 0x10000L
#define ERR(s) write(STDERR_FILENO, s, sizeof(s))
+PROTO_NORMAL(_gmon_alloc);
PROTO_NORMAL(moncontrol);
PROTO_DEPRECATED(monstartup);
@@ -136,6 +145,48 @@ _monstartup(u_long lowpc, u_long highpc)
close(p->dirfd);
}
+struct gmonparam *
+_gmon_alloc(void)
+{
+ struct gmonparam *p;
+ char *a;
+
+ if (_gmonparam.state == GMON_PROF_OFF)
+ return NULL;
+
+ _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+ p = SLIST_FIRST(&_gmonfree);
+ if (p != NULL) {
+ SLIST_REMOVE_HEAD(&_gmonfree, list);
+ SLIST_INSERT_HEAD(&_gmoninuse, p, list);
+ } else {
+ _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+ a = mmap(NULL,
+ _ALIGN(sizeof(*p)) + _ALIGN(_gmonparam.fromssize) +
+ _ALIGN(_gmonparam.tossize),
+ PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
+ if (a == MAP_FAILED) {
+ pthread_setspecific(_gmonkey, NULL);
+ ERR("_gmon_alloc: out of memory\n");
+ return NULL;
+ }
+ p = (struct gmonparam *)a;
+ *p = _gmonparam;
+ p->kcount = NULL;
+ p->kcountsize = 0;
+ p->froms = (void *)(a + _ALIGN(sizeof(*p)));
+ p->tos = (void *)(a + _ALIGN(sizeof(*p)) +
+ _ALIGN(_gmonparam.fromssize));
+ _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+ SLIST_INSERT_HEAD(&_gmoninuse, p, list);
+ }
+ _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+ pthread_setspecific(_gmonkey, p);
+
+ return p;
+}
+DEF_WEAK(_gmon_alloc);
+
void
_mcleanup(void)
{
diff --git a/lib/libc/shlib_version b/lib/libc/shlib_version
index 7a749b0c915..98af1dc86b6 100644
--- a/lib/libc/shlib_version
+++ b/lib/libc/shlib_version
@@ -1,4 +1,4 @@
-major=100
-minor=3
+major=101
+minor=0
# note: If changes were made to include/thread_private.h or if system calls
# were added/changed then librthread/shlib_version must also be updated.
diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h
index baaf7743be5..a29bfa849d9 100644
--- a/sys/sys/gmon.h
+++ b/sys/sys/gmon.h
@@ -35,6 +35,7 @@
#ifndef _SYS_GMON_H_
#define _SYS_GMON_H_
+#include <sys/queue.h>
#include <machine/profile.h>
/*
@@ -141,6 +142,7 @@ struct gmonparam {
size_t outbuflen;
void *rawarcs;
int dirfd;
+ SLIST_ENTRY(gmonparam) list;
};
/*
@@ -174,6 +176,7 @@ void _monstartup(u_long, u_long);
void moncontrol(int);
/* XXX remove end of may 2025 */
void monstartup(u_long, u_long);
+struct gmonparam *_gmon_alloc(void);
__END_DECLS
#endif /* !_KERNEL */
And the second one.
diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
index 54e89aef2c6..1924476c667 100644
--- a/lib/libc/gmon/gmon.c
+++ b/lib/libc/gmon/gmon.c
@@ -71,6 +71,19 @@ monstartup(u_long lowpc, u_long highpc)
#define PAGEMASK (PAGESIZE - 1)
#define PAGEROUND(x) (((x) + (PAGEMASK)) & ~PAGEMASK)
+static void
+_gmon_destructor(void *arg)
+{
+ struct gmonparam *p = arg, *q, **prev;
+
+ _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+ SLIST_REMOVE(&_gmoninuse, p, gmonparam, list);
+ SLIST_INSERT_HEAD(&_gmonfree, p, list);
+ _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+
+ pthread_setspecific(_gmonkey, NULL);
+}
+
void
_monstartup(u_long lowpc, u_long highpc)
{
@@ -139,6 +152,8 @@ _monstartup(u_long lowpc, u_long highpc)
else
p->dirfd = -1;
+ pthread_key_create(&_gmonkey, _gmon_destructor);
+
moncontrol(1);
if (p->dirfd != -1)
@@ -187,6 +202,115 @@ _gmon_alloc(void)
}
DEF_WEAK(_gmon_alloc);
+static void
+_gmon_merge_two(struct gmonparam *p, struct gmonparam *q)
+{
+ u_long fromindex, selfpc, endfrom;
+ u_short *frompcindex, qtoindex, toindex;
+ long count;
+ struct tostruct *top;
+
+ endfrom = (q->fromssize / sizeof(*q->froms));
+ for (fromindex = 0; fromindex < endfrom; fromindex++) {
+ if (q->froms[fromindex] == 0)
+ continue;
+ for (qtoindex = q->froms[fromindex]; qtoindex != 0;
+ qtoindex = q->tos[qtoindex].link) {
+ selfpc = q->tos[qtoindex].selfpc;
+ count = q->tos[qtoindex].count;
+ /* cribbed from mcount */
+ frompcindex = &p->froms[fromindex];
+ toindex = *frompcindex;
+ if (toindex == 0) {
+ /*
+ * first time traversing this arc
+ */
+ toindex = ++p->tos[0].link;
+ if (toindex >= p->tolimit)
+ /* halt further profiling */
+ goto overflow;
+
+ *frompcindex = (u_short)toindex;
+ top = &p->tos[(size_t)toindex];
+ top->selfpc = selfpc;
+ top->count = count;
+ top->link = 0;
+ goto done;
+ }
+ top = &p->tos[(size_t)toindex];
+ if (top->selfpc == selfpc) {
+ /*
+ * arc at front of chain; usual case.
+ */
+ top->count+= count;
+ goto done;
+ }
+ /*
+ * have to go looking down chain for it.
+ * top points to what we are looking at,
+ * we know it is not at the head of the chain.
+ */
+ for (; /* goto done */; ) {
+ if (top->link == 0) {
+ /*
+ * top is end of the chain and
+ * none of the chain had
+ * top->selfpc == selfpc. so
+ * we allocate a new tostruct
+ * and link it to the head of
+ * the chain.
+ */
+ toindex = ++p->tos[0].link;
+ if (toindex >= p->tolimit)
+ goto overflow;
+ top = &p->tos[(size_t)toindex];
+ top->selfpc = selfpc;
+ top->count = count;
+ top->link = *frompcindex;
+ *frompcindex = (u_short)toindex;
+ goto done;
+ }
+ /*
+ * otherwise, check the next arc on the chain.
+ */
+ top = &p->tos[top->link];
+ if (top->selfpc == selfpc) {
+ /*
+ * there it is.
+ * add to its count.
+ */
+ top->count += count;
+ goto done;
+ }
+
+ }
+
+ done: ;
+ }
+
+ }
+overflow: ;
+
+}
+
+static void
+_gmon_merge(void)
+{
+ struct gmonparam *q;
+
+ _THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+
+ SLIST_FOREACH(q, &_gmonfree, list)
+ _gmon_merge_two(&_gmonparam, q);
+
+ SLIST_FOREACH(q, &_gmoninuse, list) {
+ q->state = GMON_PROF_OFF;
+ _gmon_merge_two(&_gmonparam, q);
+ }
+
+ _THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+}
+
void
_mcleanup(void)
{
@@ -230,6 +354,9 @@ _mcleanup(void)
p->kcount, p->kcountsize);
write(log, dbuf, strlen(dbuf));
#endif
+
+ _gmon_merge();
+
hdr = (struct gmonhdr *)p->outbuf;
hdr->lpc = p->lowpc;
hdr->hpc = p->highpc;
diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c
index 6846c10c857..f5ecb3f7cdc 100644
--- a/lib/libc/gmon/mcount.c
+++ b/lib/libc/gmon/mcount.c
@@ -31,6 +31,11 @@
#include <sys/types.h>
#include <sys/gmon.h>
+#ifndef _KERNEL
+#include <thread_private.h>
+#include <tib.h>
+#endif
+
/*
* mcount is called on entry to each function compiled with the profiling
* switch set. _mcount(), which is declared in a machine-dependent way
@@ -65,13 +70,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
if ((p = curcpu()->ci_gmon) == NULL)
return;
#else
- p = &_gmonparam;
+ p = (__isthreaded) ?
+ ((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam;
#endif
/*
* check that we are profiling
* and that we aren't recursively invoked.
*/
- if (p->state != GMON_PROF_ON)
+ if (p == NULL || p->state != GMON_PROF_ON)
return;
#ifdef _KERNEL
MCOUNT_ENTER;
diff --git a/lib/libc/include/thread_private.h b/lib/libc/include/thread_private.h
index 1ec10711615..6b8164bab43 100644
--- a/lib/libc/include/thread_private.h
+++ b/lib/libc/include/thread_private.h
@@ -5,6 +5,9 @@
#ifndef _THREAD_PRIVATE_H_
#define _THREAD_PRIVATE_H_
+#include <sys/types.h>
+#include <sys/gmon.h>
+
extern int __isthreaded;
#define _MALLOC_MUTEXES 32
@@ -390,6 +393,7 @@ struct pthread {
/* cancel received in a delayed cancel block? */
int delayed_cancel;
+ struct gmonparam *gmonparam;
};
/* flags in pthread->flags */
#define THREAD_DONE 0x001
diff --git a/lib/libc/thread/rthread.c b/lib/libc/thread/rthread.c
index ae74aceb51b..f8116e8a0e0 100644
--- a/lib/libc/thread/rthread.c
+++ b/lib/libc/thread/rthread.c
@@ -21,6 +21,7 @@
#include <sys/types.h>
#include <sys/atomic.h>
+#include <sys/gmon.h>
#include <pthread.h>
#include <stdlib.h>
@@ -103,6 +104,9 @@ _rthread_init(void)
}
_threads_inited = 1;
+
+ /* Ignore errors. NULL is OK for a non-profiling case. */
+ thread->gmonparam = _gmon_alloc();
}
/*
diff --git a/lib/librthread/rthread.c b/lib/librthread/rthread.c
index 53f1d9ca82b..3389ea6baa9 100644
--- a/lib/librthread/rthread.c
+++ b/lib/librthread/rthread.c
@@ -25,6 +25,7 @@
#include <elf.h>
#pragma weak _DYNAMIC
#endif
+#include <sys/gmon.h>
#include <stdlib.h>
#include <unistd.h>
@@ -393,6 +394,10 @@ pthread_create(pthread_t *threadp, const pthread_attr_t *attr,
/* we're going to be multi-threaded real soon now */
__isthreaded = 1;
+
+ /* Ignore errors. NULL is OK for a non-profiling case. */
+ thread->gmonparam = _gmon_alloc();
+
rc = __tfork_thread(¶m, sizeof(param), _rthread_start, thread);
if (rc != -1) {
/* success */
diff --git a/lib/librthread/shlib_version b/lib/librthread/shlib_version
index 72e5894f74e..295c96b24e9 100644
--- a/lib/librthread/shlib_version
+++ b/lib/librthread/shlib_version
@@ -1,2 +1,2 @@
-major=27
-minor=1
+major=28
+minor=0
diff --git a/sys/lib/libkern/mcount.c b/sys/lib/libkern/mcount.c
index e020dede07a..59b815a02d5 100644
--- a/sys/lib/libkern/mcount.c
+++ b/sys/lib/libkern/mcount.c
@@ -33,6 +33,11 @@
#include <sys/param.h>
#include <sys/gmon.h>
+#ifndef _KERNEL
+#include <thread_private.h>
+#include <tib.h>
+#endif
+
/*
* mcount is called on entry to each function compiled with the profiling
* switch set. _mcount(), which is declared in a machine-dependent way
@@ -67,13 +72,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
if ((p = curcpu()->ci_gmon) == NULL)
return;
#else
- p = &_gmonparam;
+ p = (__isthreaded) ?
+ ((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam;
#endif
/*
* check that we are profiling
* and that we aren't recursively invoked.
*/
- if (p->state != GMON_PROF_ON)
+ if (p == NULL || p->state != GMON_PROF_ON)
return;
#ifdef _KERNEL
MCOUNT_ENTER;
--
Yuichiro NAITO (naito.yuichiro@gmail.com)
gprof: Profiling a multi-threaded application (revived)