Index | Thread | Search

From:
Mark Kettenis <mark.kettenis@xs4all.nl>
Subject:
Re: gprof: Profiling a multi-threaded application (revived)
To:
Yuichiro NAITO <naito.yuichiro@gmail.com>
Cc:
tech@openbsd.org, deraadt@openbsd.org, guenther@openbsd.org, mpi@openbsd.org
Date:
Thu, 12 Jun 2025 16:12:30 +0200

Download raw body.

Thread
> Date: Fri, 06 Jun 2025 17:43:10 +0900 (JST)
> From: Yuichiro NAITO <naito.yuichiro@gmail.com>

Hi Yuichiro,

This is changing the interface between libc and libpthread.  This is
tricky.  Minimally this needs libc and libpthread minor bumps.  But it
is more complicated than that and I have trouble wrapping my head
around exactly how that works again.  guenther@ should know more.

> 
> Small update to my patch.
> 
> From: Yuichiro NAITO <naito.yuichiro@gmail.com>
> Subject: Re: gprof: Profiling a multi-threaded application (revived)
> Date: Thu, 05 Jun 2025 18:27:20 +0900 (JST)
> 
> > I updated the multi-thread profiling patch through discussion with
> > Theo de Raadt. I'll list the improvements.
> > 
> > 1. Allocate kcount buffer memory when a thread starts.
> > 
> >   The allocation logic becomes simple and also reduces the patch size.
> >   This needs the '_gmon_alloc' symbol to be exported for the libpthread.
> > 
> > 2. Allocate buffer memory on a single mmap(2) call.
> > 
> >   This is the same way as Theo's commit.
> >   https://github.com/OpenBSD/src/commit/4ea76518007ed1dc99d51b18368b53ba7b11fd17
> 
> I misunderstood Theo's intention about pointer calculation. I should
> hold the return value of mmap in a char * variable, so that I don't
> need code-width cast.
> 
> > 3. Mark 'static' to '_gmonfree', '_gmoninuse', '_gmonkey' variables.
> > 
> >   These are used only in the 'gmon.c'.
> > 
> > 4. Refactor 'gmonparam' lookup code in the __mcount function.
> > 
> >   Reduce the patch size.
> > 	    
> 
> diff --git a/lib/libc/Symbols.list b/lib/libc/Symbols.list
> index 6eb2cca93e5..d5d5497cf93 100644
> --- a/lib/libc/Symbols.list
> +++ b/lib/libc/Symbols.list
> @@ -880,6 +880,7 @@ _mcleanup
>  _monstartup
>  moncontrol
>  monstartup
> +_gmon_alloc
>  
>  /* hash */
>  MD5Data
> diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
> index 99fd42d7824..93176edd11d 100644
> --- a/lib/libc/gmon/gmon.c
> +++ b/lib/libc/gmon/gmon.c
> @@ -42,12 +42,21 @@
>  
>  struct gmonparam _gmonparam = { GMON_PROF_OFF };
>  
> +#include <pthread.h>
> +#include <thread_private.h>
> +
> +static SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
> +static SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
> +_THREAD_PRIVATE_MUTEX(_gmonlock);
> +static pthread_key_t _gmonkey;
> +
>  static int	s_scale;
>  /* see profil(2) where this is describe (incorrectly) */
>  #define		SCALE_1_TO_1	0x10000L
>  
>  #define ERR(s) write(STDERR_FILENO, s, sizeof(s))
>  
> +PROTO_NORMAL(_gmon_alloc);
>  PROTO_NORMAL(moncontrol);
>  PROTO_DEPRECATED(monstartup);
>  
> @@ -62,6 +71,19 @@ monstartup(u_long lowpc, u_long highpc)
>  #define PAGEMASK	(PAGESIZE - 1)
>  #define PAGEROUND(x)	(((x) + (PAGEMASK)) & ~PAGEMASK)
>  
> +static void
> +_gmon_destructor(void *arg)
> +{
> +	struct gmonparam *p = arg, *q, **prev;
> +
> +	_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
> +	SLIST_REMOVE(&_gmoninuse, p, gmonparam, list);
> +	SLIST_INSERT_HEAD(&_gmonfree, p, list);
> +	_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
> +
> +	pthread_setspecific(_gmonkey, NULL);
> +}
> +
>  void
>  _monstartup(u_long lowpc, u_long highpc)
>  {
> @@ -130,12 +152,166 @@ _monstartup(u_long lowpc, u_long highpc)
>  	else
>  		p->dirfd = -1;
>  
> +	pthread_key_create(&_gmonkey, _gmon_destructor);
> +
>  	moncontrol(1);
>  
>  	if (p->dirfd != -1)
>  		close(p->dirfd);
>  }
>  
> +struct gmonparam *
> +_gmon_alloc(void)
> +{
> +	struct gmonparam *p;
> +	char *a;
> +
> +	if (_gmonparam.state == GMON_PROF_OFF)
> +		return NULL;
> +
> +	_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
> +	p = SLIST_FIRST(&_gmonfree);
> +	if (p != NULL) {
> +		SLIST_REMOVE_HEAD(&_gmonfree, list);
> +		SLIST_INSERT_HEAD(&_gmoninuse, p, list);
> +	} else {
> +		_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
> +		a = mmap(NULL,
> +			 _ALIGN(sizeof(*p)) + _ALIGN(_gmonparam.fromssize) +
> +			 _ALIGN(_gmonparam.tossize),
> +			 PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
> +		if (a == MAP_FAILED) {
> +			pthread_setspecific(_gmonkey, NULL);
> +			ERR("_gmon_alloc: out of memory\n");
> +			return NULL;
> +		}
> +		p = (struct gmonparam *)a;
> +		*p = _gmonparam;
> +		p->kcount = NULL;
> +		p->kcountsize = 0;
> +		p->froms = (void *)(a + _ALIGN(sizeof(*p)));
> +		p->tos = (void *)(a + _ALIGN(sizeof(*p)) +
> +		    _ALIGN(_gmonparam.fromssize));
> +		_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
> +		SLIST_INSERT_HEAD(&_gmoninuse, p, list);
> +	}
> +	_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
> +	pthread_setspecific(_gmonkey, p);
> +
> +	return p;
> +}
> +DEF_WEAK(_gmon_alloc);
> +
> +static void
> +_gmon_merge_two(struct gmonparam *p, struct gmonparam *q)
> +{
> +	u_long fromindex, selfpc, endfrom;
> +	u_short *frompcindex, qtoindex, toindex;
> +	long count;
> +	struct tostruct *top;
> +
> +	endfrom = (q->fromssize / sizeof(*q->froms));
> +	for (fromindex = 0; fromindex < endfrom; fromindex++) {
> +		if (q->froms[fromindex] == 0)
> +			continue;
> +		for (qtoindex = q->froms[fromindex]; qtoindex != 0;
> +		     qtoindex = q->tos[qtoindex].link) {
> +			selfpc = q->tos[qtoindex].selfpc;
> +			count = q->tos[qtoindex].count;
> +			/* cribbed from mcount */
> +			frompcindex = &p->froms[fromindex];
> +			toindex = *frompcindex;
> +			if (toindex == 0) {
> +				/*
> +				 *      first time traversing this arc
> +				 */
> +				toindex = ++p->tos[0].link;
> +				if (toindex >= p->tolimit)
> +					/* halt further profiling */
> +					goto overflow;
> +
> +				*frompcindex = (u_short)toindex;
> +				top = &p->tos[(size_t)toindex];
> +				top->selfpc = selfpc;
> +				top->count = count;
> +				top->link = 0;
> +				goto done;
> +			}
> +			top = &p->tos[(size_t)toindex];
> +			if (top->selfpc == selfpc) {
> +				/*
> +				 * arc at front of chain; usual case.
> +				 */
> +				top->count+= count;
> +				goto done;
> +			}
> +			/*
> +			 * have to go looking down chain for it.
> +			 * top points to what we are looking at,
> +			 * we know it is not at the head of the chain.
> +			 */
> +			for (; /* goto done */; ) {
> +				if (top->link == 0) {
> +					/*
> +					 * top is end of the chain and
> +					 * none of the chain had
> +					 * top->selfpc == selfpc.  so
> +					 * we allocate a new tostruct
> +					 * and link it to the head of
> +					 * the chain.
> +					 */
> +					toindex = ++p->tos[0].link;
> +					if (toindex >= p->tolimit)
> +						goto overflow;
> +					top = &p->tos[(size_t)toindex];
> +					top->selfpc = selfpc;
> +					top->count = count;
> +					top->link = *frompcindex;
> +					*frompcindex = (u_short)toindex;
> +					goto done;
> +				}
> +				/*
> +				 * otherwise, check the next arc on the chain.
> +				 */
> +				top = &p->tos[top->link];
> +				if (top->selfpc == selfpc) {
> +					/*
> +					 * there it is.
> +					 * add to its count.
> +					 */
> +					top->count += count;
> +					goto done;
> +				}
> +
> +			}
> +
> +		done: ;
> +		}
> +
> +	}
> +overflow: ;
> +
> +}
> +
> +static void
> +_gmon_merge(void)
> +{
> +	struct gmonparam *q;
> +
> +	_THREAD_PRIVATE_MUTEX_LOCK(_gmonlock);
> +
> +	SLIST_FOREACH(q, &_gmonfree, list)
> +		_gmon_merge_two(&_gmonparam, q);
> +
> +	SLIST_FOREACH(q, &_gmoninuse, list) {
> +		q->state = GMON_PROF_OFF;
> +		_gmon_merge_two(&_gmonparam, q);
> +	}
> +
> +	_THREAD_PRIVATE_MUTEX_UNLOCK(_gmonlock);
> +}
> +
> +
>  void
>  _mcleanup(void)
>  {
> @@ -179,6 +355,9 @@ _mcleanup(void)
>  	    p->kcount, p->kcountsize);
>  	write(log, dbuf, strlen(dbuf));
>  #endif
> +
> +	_gmon_merge();
> +
>  	hdr = (struct gmonhdr *)p->outbuf;
>  	hdr->lpc = p->lowpc;
>  	hdr->hpc = p->highpc;
> diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c
> index 6846c10c857..f5ecb3f7cdc 100644
> --- a/lib/libc/gmon/mcount.c
> +++ b/lib/libc/gmon/mcount.c
> @@ -31,6 +31,11 @@
>  #include <sys/types.h>
>  #include <sys/gmon.h>
>  
> +#ifndef _KERNEL
> +#include <thread_private.h>
> +#include <tib.h>
> +#endif
> +
>  /*
>   * mcount is called on entry to each function compiled with the profiling
>   * switch set.  _mcount(), which is declared in a machine-dependent way
> @@ -65,13 +70,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
>  	if ((p = curcpu()->ci_gmon) == NULL)
>  		return;
>  #else
> -	p = &_gmonparam;
> +	p = (__isthreaded) ?
> +		((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam;
>  #endif
>  	/*
>  	 * check that we are profiling
>  	 * and that we aren't recursively invoked.
>  	 */
> -	if (p->state != GMON_PROF_ON)
> +	if (p == NULL || p->state != GMON_PROF_ON)
>  		return;
>  #ifdef _KERNEL
>  	MCOUNT_ENTER;
> diff --git a/lib/libc/include/thread_private.h b/lib/libc/include/thread_private.h
> index 1ec10711615..6b8164bab43 100644
> --- a/lib/libc/include/thread_private.h
> +++ b/lib/libc/include/thread_private.h
> @@ -5,6 +5,9 @@
>  #ifndef _THREAD_PRIVATE_H_
>  #define _THREAD_PRIVATE_H_
>  
> +#include <sys/types.h>
> +#include <sys/gmon.h>
> +
>  extern int __isthreaded;
>  
>  #define _MALLOC_MUTEXES 32
> @@ -390,6 +393,7 @@ struct pthread {
>  
>  	/* cancel received in a delayed cancel block? */
>  	int delayed_cancel;
> +	struct gmonparam *gmonparam;
>  };
>  /* flags in pthread->flags */
>  #define	THREAD_DONE		0x001
> diff --git a/lib/libc/thread/rthread.c b/lib/libc/thread/rthread.c
> index ae74aceb51b..f8116e8a0e0 100644
> --- a/lib/libc/thread/rthread.c
> +++ b/lib/libc/thread/rthread.c
> @@ -21,6 +21,7 @@
>  
>  #include <sys/types.h>
>  #include <sys/atomic.h>
> +#include <sys/gmon.h>
>  
>  #include <pthread.h>
>  #include <stdlib.h>
> @@ -103,6 +104,9 @@ _rthread_init(void)
>  	}
>  
>  	_threads_inited = 1;
> +
> +	/* Ignore errors. NULL is OK for a non-profiling case. */
> +	thread->gmonparam = _gmon_alloc();
>  }
>  
>  /*
> diff --git a/lib/librthread/rthread.c b/lib/librthread/rthread.c
> index 53f1d9ca82b..3389ea6baa9 100644
> --- a/lib/librthread/rthread.c
> +++ b/lib/librthread/rthread.c
> @@ -25,6 +25,7 @@
>  #include <elf.h>
>  #pragma weak _DYNAMIC
>  #endif
> +#include <sys/gmon.h>
>  
>  #include <stdlib.h>
>  #include <unistd.h>
> @@ -393,6 +394,10 @@ pthread_create(pthread_t *threadp, const pthread_attr_t *attr,
>  
>  	/* we're going to be multi-threaded real soon now */
>  	__isthreaded = 1;
> +
> +	/* Ignore errors. NULL is OK for a non-profiling case. */
> +	thread->gmonparam = _gmon_alloc();
> +
>  	rc = __tfork_thread(&param, sizeof(param), _rthread_start, thread);
>  	if (rc != -1) {
>  		/* success */
> diff --git a/sys/lib/libkern/mcount.c b/sys/lib/libkern/mcount.c
> index e020dede07a..59b815a02d5 100644
> --- a/sys/lib/libkern/mcount.c
> +++ b/sys/lib/libkern/mcount.c
> @@ -33,6 +33,11 @@
>  #include <sys/param.h>
>  #include <sys/gmon.h>
>  
> +#ifndef _KERNEL
> +#include <thread_private.h>
> +#include <tib.h>
> +#endif
> +
>  /*
>   * mcount is called on entry to each function compiled with the profiling
>   * switch set.  _mcount(), which is declared in a machine-dependent way
> @@ -67,13 +72,14 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
>  	if ((p = curcpu()->ci_gmon) == NULL)
>  		return;
>  #else
> -	p = &_gmonparam;
> +	p = (__isthreaded) ?
> +		((pthread_t)TIB_GET()->tib_thread)->gmonparam : &_gmonparam;
>  #endif
>  	/*
>  	 * check that we are profiling
>  	 * and that we aren't recursively invoked.
>  	 */
> -	if (p->state != GMON_PROF_ON)
> +	if (p == NULL || p->state != GMON_PROF_ON)
>  		return;
>  #ifdef _KERNEL
>  	MCOUNT_ENTER;
> diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h
> index baaf7743be5..a29bfa849d9 100644
> --- a/sys/sys/gmon.h
> +++ b/sys/sys/gmon.h
> @@ -35,6 +35,7 @@
>  #ifndef _SYS_GMON_H_
>  #define _SYS_GMON_H_
>  
> +#include <sys/queue.h>
>  #include <machine/profile.h>
>  
>  /*
> @@ -141,6 +142,7 @@ struct gmonparam {
>  	size_t		outbuflen;
>  	void		*rawarcs;
>  	int		dirfd;
> +	SLIST_ENTRY(gmonparam)	list;
>  };
>  
>  /*
> @@ -174,6 +176,7 @@ void	_monstartup(u_long, u_long);
>  void	moncontrol(int);
>  /* XXX remove end of may 2025 */
>  void	monstartup(u_long, u_long);
> +struct gmonparam *_gmon_alloc(void);
>  __END_DECLS
>  
>  #endif /* !_KERNEL */
> 
> -- 
> Yuichiro NAITO (naito.yuichiro@gmail.com)
> 
>