Index | Thread | Search

From:
Yuichiro NAITO <naito.yuichiro@gmail.com>
Subject:
Re: gprof: Profiling a multi-threaded application (revived)
To:
tech@openbsd.org, deraadt@openbsd.org, guenther@openbsd.org, mpi@openbsd.org
Date:
Thu, 05 Jun 2025 18:27:20 +0900

Download raw body.

Thread
I updated the multi-thread profiling patch through discussion with
Theo de Raadt. I'll list the improvements.

1. Allocate kcount buffer memory when a thread starts.

  The allocation logic becomes simple and also reduces the patch size.
  This needs the '_gmon_alloc' symbol to be exported for the libpthread.

2. Allocate buffer memory on a single mmap(2) call.

  This is the same way as Theo's commit.
  https://github.com/OpenBSD/src/commit/4ea76518007ed1dc99d51b18368b53ba7b11fd17
3. Mark 'static' to '_gmonfree', '_gmoninuse', '_gmonkey' variables.

  These are used only in the 'gmon.c'.

4. Refactor 'gmonparam' lookup code in the __mcount function.

  Reduce the patch size.
	    
diff --git a/lib/libc/Symbols.list b/lib/libc/Symbols.list
index 6eb2cca93e5..d5d5497cf93 100644
--- a/lib/libc/Symbols.list
+++ b/lib/libc/Symbols.list
@@ -880,6 +880,7 @@ _mcleanup
 _monstartup
 moncontrol
 monstartup
+_gmon_alloc
 
 /* hash */
 MD5Data
diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
index 99fd42d7824..c9089ac768c 100644
--- a/lib/libc/gmon/gmon.c
+++ b/lib/libc/gmon/gmon.c
@@ -42,12 +42,21 @@
 
 struct gmonparam _gmonparam = { GMON_PROF_OFF };
 
+#include <pthread.h>
+#include <thread_private.h>
+
+static SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
+static SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
+_THREAD_PRIVATE_MUTEX(_gmonlock);
+static pthread_key_t _gmonkey;
+
 static int	s_scale;
 /* see profil(2) where this is describe (incorrectly) */
 #define		SCALE_1_TO_1	0x10000L
 
 #define ERR(s) write(STDERR_FILENO, s, sizeof(s))
 
+PROTO_NORMAL(_gmon_alloc);
 PROTO_NORMAL(moncontrol);
 PROTO_DEPRECATED(monstartup);
 
@@ -62,6 +71,19 @@ monstartup(u_long lowpc, u_long highpc)
 #define PAGEMASK	(PAGESIZE - 1)
 #define PAGEROUND(x)	(((x) + (PAGEMASK)) & ~PAGEMASK)
 
+static void
+_gmon_destructor(void *arg)
+{
+	struct gmonparam *p = arg, *q, **prev;
+
+	_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+	SLIST_REMOVE(&_gmoninuse, p, gmonparam, list);
+	SLIST_INSERT_HEAD(&_gmonfree, p, list);
+	_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+
+	pthread_setspecific(_gmonkey, NULL);
+}
+
 void
 _monstartup(u_long lowpc, u_long highpc)
 {
@@ -130,12 +152,164 @@ _monstartup(u_long lowpc, u_long highpc)
 	else
 		p->dirfd = -1;
 
+	pthread_key_create(&_gmonkey, _gmon_destructor);
+
 	moncontrol(1);
 
 	if (p->dirfd != -1)
 		close(p->dirfd);
 }
 
+struct gmonparam *
+_gmon_alloc(void)
+{
+	struct gmonparam *p;
+
+	if (_gmonparam.state == GMON_PROF_OFF)
+		return NULL;
+
+	_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+	p = SLIST_FIRST(&_gmonfree);
+	if (p != NULL) {
+		SLIST_REMOVE_HEAD(&_gmonfree, list);
+		SLIST_INSERT_HEAD(&_gmoninuse, p, list);
+	} else {
+		_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+		p = mmap(NULL,
+			 _ALIGN(sizeof(*p)) + _ALIGN(_gmonparam.fromssize) +
+			 _ALIGN(_gmonparam.tossize),
+			 PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
+		if (p == MAP_FAILED) {
+			pthread_setspecific(_gmonkey, NULL);
+			ERR("_gmon_alloc: out of memory\n");
+			return NULL;
+		}
+		*p = _gmonparam;
+		p->kcount = NULL;
+		p->kcountsize = 0;
+		p->froms = (void *)((char *)p + _ALIGN(sizeof(*p)));
+		p->tos = (void *)((char *)p + _ALIGN(sizeof(*p)) +
+				  _ALIGN(_gmonparam.fromssize));
+		_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+		SLIST_INSERT_HEAD(&_gmoninuse, p, list);
+	}
+	_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+	pthread_setspecific(_gmonkey, p);
+
+	return p;
+}
+DEF_WEAK(_gmon_alloc);
+
+static void
+_gmon_merge_two(struct gmonparam *p, struct gmonparam *q)
+{
+	u_long fromindex, selfpc, endfrom;
+	u_short *frompcindex, qtoindex, toindex;
+	long count;
+	struct tostruct *top;
+
+	endfrom = (q->fromssize / sizeof(*q->froms));
+	for (fromindex = 0; fromindex < endfrom; fromindex++) {
+		if (q->froms[fromindex] == 0)
+			continue;
+		for (qtoindex = q->froms[fromindex]; qtoindex != 0;
+		     qtoindex = q->tos[qtoindex].link) {
+			selfpc = q->tos[qtoindex].selfpc;
+			count = q->tos[qtoindex].count;
+			/* cribbed from mcount */
+			frompcindex = &p->froms[fromindex];
+			toindex = *frompcindex;
+			if (toindex == 0) {
+				/*
+				 *      first time traversing this arc
+				 */
+				toindex = ++p->tos[0].link;
+				if (toindex >= p->tolimit)
+					/* halt further profiling */
+					goto overflow;
+
+				*frompcindex = (u_short)toindex;
+				top = &p->tos[(size_t)toindex];
+				top->selfpc = selfpc;
+				top->count = count;
+				top->link = 0;
+				goto done;
+			}
+			top = &p->tos[(size_t)toindex];
+			if (top->selfpc == selfpc) {
+				/*
+				 * arc at front of chain; usual case.
+				 */
+				top->count+= count;
+				goto done;
+			}
+			/*
+			 * have to go looking down chain for it.
+			 * top points to what we are looking at,
+			 * we know it is not at the head of the chain.
+			 */
+			for (; /* goto done */; ) {
+				if (top->link == 0) {
+					/*
+					 * top is end of the chain and
+					 * none of the chain had
+					 * top->selfpc == selfpc.  so
+					 * we allocate a new tostruct
+					 * and link it to the head of
+					 * the chain.
+					 */
+					toindex = ++p->tos[0].link;
+					if (toindex >= p->tolimit)
+						goto overflow;
+					top = &p->tos[(size_t)toindex];
+					top->selfpc = selfpc;
+					top->count = count;
+					top->link = *frompcindex;
+					*frompcindex = (u_short)toindex;
+					goto done;
+				}
+				/*
+				 * otherwise, check the next arc on the chain.
+				 */
+				top = &p->tos[top->link];
+				if (top->selfpc == selfpc) {
+					/*
+					 * there it is.
+					 * add to its count.
+					 */
+					top->count += count;
+					goto done;
+				}
+
+			}
+
+		done: ;
+		}
+
+	}
+overflow: ;
+
+}
+
+static void
+_gmon_merge(void)
+{
+	struct gmonparam *q;
+
+	_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
+
+	SLIST_FOREACH(q, &_gmonfree, list)
+		_gmon_merge_two(&_gmonparam, q);
+
+	SLIST_FOREACH(q, &_gmoninuse, list) {
+		q->state = GMON_PROF_OFF;
+		_gmon_merge_two(&_gmonparam, q);
+	}
+
+	_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
+}
+
+
 void
 _mcleanup(void)
 {
@@ -179,6 +353,9 @@ _mcleanup(void)
 	    p->kcount, p->kcountsize);
 	write(log, dbuf, strlen(dbuf));
 #endif
+
+	_gmon_merge();
+
 	hdr = (struct gmonhdr *)p->outbuf;
 	hdr->lpc = p->lowpc;
 	hdr->hpc = p->highpc;
diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c
index 6846c10c857..f5ecb3f7cdc 100644
--- a/lib/libc/gmon/mcount.c
+++ b/lib/libc/gmon/mcount.c
@@ -31,6 +31,11 @@
 #include <sys/types.h>
 #include <sys/gmon.h>
 
+#ifndef _KERNEL
+#include <thread_private.h>
+#include <tib.h>
+#endif
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -65,13 +70,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
 	if ((p = curcpu()->ci_gmon) == NULL)
 		return;
 #else
-	p = &_gmonparam;
+	p = (__isthreaded) ?
+		((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam;
 #endif
 	/*
 	 * check that we are profiling
 	 * and that we aren't recursively invoked.
 	 */
-	if (p->state != GMON_PROF_ON)
+	if (p == NULL || p->state != GMON_PROF_ON)
 		return;
 #ifdef _KERNEL
 	MCOUNT_ENTER;
diff --git a/lib/libc/include/thread_private.h b/lib/libc/include/thread_private.h
index 1ec10711615..6b8164bab43 100644
--- a/lib/libc/include/thread_private.h
+++ b/lib/libc/include/thread_private.h
@@ -5,6 +5,9 @@
 #ifndef _THREAD_PRIVATE_H_
 #define _THREAD_PRIVATE_H_
 
+#include <sys/types.h>
+#include <sys/gmon.h>
+
 extern int __isthreaded;
 
 #define _MALLOC_MUTEXES 32
@@ -390,6 +393,7 @@ struct pthread {
 
 	/* cancel received in a delayed cancel block? */
 	int delayed_cancel;
+	struct gmonparam *gmonparam;
 };
 /* flags in pthread->flags */
 #define	THREAD_DONE		0x001
diff --git a/lib/libc/thread/rthread.c b/lib/libc/thread/rthread.c
index ae74aceb51b..f8116e8a0e0 100644
--- a/lib/libc/thread/rthread.c
+++ b/lib/libc/thread/rthread.c
@@ -21,6 +21,7 @@
 
 #include <sys/types.h>
 #include <sys/atomic.h>
+#include <sys/gmon.h>
 
 #include <pthread.h>
 #include <stdlib.h>
@@ -103,6 +104,9 @@ _rthread_init(void)
 	}
 
 	_threads_inited = 1;
+
+	/* Ignore errors. NULL is OK for a non-profiling case. */
+	thread->gmonparam = _gmon_alloc();
 }
 
 /*
diff --git a/lib/librthread/rthread.c b/lib/librthread/rthread.c
index 53f1d9ca82b..3389ea6baa9 100644
--- a/lib/librthread/rthread.c
+++ b/lib/librthread/rthread.c
@@ -25,6 +25,7 @@
 #include <elf.h>
 #pragma weak _DYNAMIC
 #endif
+#include <sys/gmon.h>
 
 #include <stdlib.h>
 #include <unistd.h>
@@ -393,6 +394,10 @@ pthread_create(pthread_t *threadp, const pthread_attr_t *attr,
 
 	/* we're going to be multi-threaded real soon now */
 	__isthreaded = 1;
+
+	/* Ignore errors. NULL is OK for a non-profiling case. */
+	thread->gmonparam = _gmon_alloc();
+
 	rc = __tfork_thread(&param, sizeof(param), _rthread_start, thread);
 	if (rc != -1) {
 		/* success */
diff --git a/sys/lib/libkern/mcount.c b/sys/lib/libkern/mcount.c
index e020dede07a..59b815a02d5 100644
--- a/sys/lib/libkern/mcount.c
+++ b/sys/lib/libkern/mcount.c
@@ -33,6 +33,11 @@
 #include <sys/param.h>
 #include <sys/gmon.h>
 
+#ifndef _KERNEL
+#include <thread_private.h>
+#include <tib.h>
+#endif
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -67,13 +72,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
 	if ((p = curcpu()->ci_gmon) == NULL)
 		return;
 #else
-	p = &_gmonparam;
+	p = (__isthreaded) ?
+		((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam;
 #endif
 	/*
 	 * check that we are profiling
 	 * and that we aren't recursively invoked.
 	 */
-	if (p->state != GMON_PROF_ON)
+	if (p == NULL || p->state != GMON_PROF_ON)
 		return;
 #ifdef _KERNEL
 	MCOUNT_ENTER;
diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h
index baaf7743be5..a29bfa849d9 100644
--- a/sys/sys/gmon.h
+++ b/sys/sys/gmon.h
@@ -35,6 +35,7 @@
 #ifndef _SYS_GMON_H_
 #define _SYS_GMON_H_
 
+#include <sys/queue.h>
 #include <machine/profile.h>
 
 /*
@@ -141,6 +142,7 @@ struct gmonparam {
 	size_t		outbuflen;
 	void		*rawarcs;
 	int		dirfd;
+	SLIST_ENTRY(gmonparam)	list;
 };
 
 /*
@@ -174,6 +176,7 @@ void	_monstartup(u_long, u_long);
 void	moncontrol(int);
 /* XXX remove end of may 2025 */
 void	monstartup(u_long, u_long);
+struct gmonparam *_gmon_alloc(void);
 __END_DECLS
 
 #endif /* !_KERNEL */

-- 
Yuichiro NAITO (naito.yuichiro@gmail.com)