/*
 * pfmon_system.c : handles per-cpu measurements
 *
 * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
 * Contributed by Stephane Eranian <eranian@hpl.hp.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA
 */
#include "pfmon.h"

#include <sys/wait.h>
#include <time.h>
#include <sys/time.h>
#include <fcntl.h>
#include <sys/ptrace.h>
#include <sys/mman.h>

/*
 * Argument passed to each worker thread.
 * Pointer arguments are ALL read-only because they are shared
 * between all threads. To modify what they point to, make a copy first.
 */
/*
 * Progress of one per-CPU worker thread, stored in its descriptor.
 */
typedef enum {
	THREAD_STARTED = 0,	/* set by the master just before creating the thread */
	THREAD_RUN     = 1,	/* set by the worker once its setup succeeded */
	THREAD_DONE    = 2,	/* set by the worker after a normal completion */
	THREAD_ERROR   = 3	/* set when the worker aborts */
} thread_state_t;

/*
 * Per-worker descriptor. One entry is filled in by the master for every
 * monitored CPU and handed to the worker thread as its start argument.
 */
typedef struct {
	unsigned int 	   id;		/* logical thread identification (index of the worker) */
	unsigned int 	   cpu;		/* which CPU to pin it on */

	pfmon_ctx_t	   *ctx;	/* generic context description, shared by all workers */

	pthread_t	   thread_id;	/* pthread id, filled in by pthread_create() */
	thread_state_t	   thread_state; /* progress of the worker, checked by the master */
} pfmon_thread_desc_t;


/*
 * Global state of the system-wide session. Written by the master thread
 * and polled by the workers.
 */
typedef enum {
	SESSION_INIT    = 0,	/* workers are being created and set up */
	SESSION_RUN     = 1,	/* all workers are ready, measurement may proceed */
	SESSION_STOP    = 2,	/* normal end of session requested */
	SESSION_ABORTED = 3	/* session is being torn down after a failure */
} session_state_t;

/*
 * Why the master thread was interrupted. Recorded by the master's signal
 * handlers; only the first reason is kept.
 */
typedef enum {
	REASON_NONE    = 0,	/* nothing recorded yet */
	REASON_CHILD   = 1,	/* SIGCHLD: the command exited or was killed */
	REASON_TIMEOUT = 2,	/* SIGALRM */
	REASON_ABORT   = 3	/* SIGINT */
} syswide_sigreason_t;

/*
 * Reusable rendezvous point built from a mutex and a condition variable.
 * See barrier_init() and barrier_wait().
 */
typedef struct _barrier {
	pthread_mutex_t mutex;	/* protects counter and generation */
	pthread_cond_t	cond;	/* broadcast when the last participant arrives */
	unsigned long	counter;	/* participants still missing in the current round */
	unsigned long	max;	/* number of participants per round */
	unsigned long   generation; /* avoid race condition on wake-up */
} barrier_t;

/* rendezvous between the master thread and all worker threads */
static barrier_t 		barrier;
/* session state: written by the master, polled by the workers */
static session_state_t		session_state;
/* first reason recorded by the master's signal handlers */
static syswide_sigreason_t	syswide_sigreason;

/* serializes updates of sdesc_sys_aggr and sample processing in aggregation mode */
static pthread_mutex_t 		pfmon_sys_aggr_lock = PTHREAD_MUTEX_INITIALIZER;
/* initialized in measure_system_wide(); no other use in the code visible here */
static pthread_cond_t  		results_cond;
/* thread-specific key holding each worker's pfmon_thread_desc_t pointer */
static pthread_key_t   		param_key;

/* worker descriptors and overflow semaphores, both indexed by logical thread id */
static pfmon_thread_desc_t	thread_desc[PFMON_MAX_CPUS];
static sem_t 			ovfl_sem[PFMON_MAX_CPUS];

/* number of SIGIO notifications received, indexed by CPU number */
static uint64_t			ovfl_cnts[PFMON_MAX_CPUS];

/* session descriptor accumulating the results of all CPUs in aggregation mode */
static pfmon_sdesc_t		sdesc_sys_aggr;

/* identity of the master thread: pthread id and gettid() value */
static pthread_t		master_thread_id;
static pid_t			master_tid;

/*
 * barrier_init: prepare barrier b for rounds of count participants.
 *
 * b     : barrier to initialize
 * count : number of threads that must call barrier_wait() before the
 *         barrier opens
 *
 * returns 0 on success, -1 if the mutex or the condition variable could
 * not be initialized.
 */
static int
barrier_init(barrier_t *b, unsigned long count)
{
	/*
	 * pthread_mutex_init() and pthread_cond_init() report failures by
	 * returning a non-zero error number, they never return -1.
	 */
	if (pthread_mutex_init(&b->mutex, NULL) != 0)
		return -1;

	if (pthread_cond_init(&b->cond, NULL) != 0) {
		/* do not leave a half-initialized barrier behind */
		pthread_mutex_destroy(&b->mutex);
		return -1;
	}

	b->max = b->counter = count;

	b->generation = 0;

	return 0;
}

/*
 * cleanup_barrier: cancellation cleanup handler used by barrier_wait().
 * Releases the mutex of the barrier passed in arg.
 */
static void
cleanup_barrier(void *arg)
{
	barrier_t *bar = arg;
	int rc = pthread_mutex_unlock(&bar->mutex);

	DPRINT(("free barrier mutex r=%d\n", rc));
}


/*
 * barrier_wait: block until b->max threads have entered the barrier.
 *
 * The last thread to arrive resets the barrier for the next round and
 * wakes up all the others. Waiting threads can be cancelled; the cleanup
 * handler releases the barrier mutex in that case.
 *
 * Always returns 0.
 */
static int
barrier_wait(barrier_t *b)
{
	unsigned long generation;
	int oldstate;

	/*
	 * the handler is installed before taking the lock. The points where a
	 * cancellation request can be acted upon below (pthread_testcancel(),
	 * pthread_cond_wait()) are all reached with the mutex held.
	 */
	pthread_cleanup_push(cleanup_barrier, b);

	pthread_mutex_lock(&b->mutex);

	/* act on a pending cancellation request before joining the round */
	pthread_testcancel();

	if (--b->counter == 0) {
		DPRINT(("last thread entered\n"));

		/* reset barrier */
		b->counter = b->max;
		/*
		 * bump generation number, this avoids thread getting stuck in the
		 * wake up loop below in case a thread just out of the barrier goes
		 * back in right away before all the thread from the previous "round"
		 * have "escaped".
		 */
		b->generation++;

		pthread_cond_broadcast(&b->cond);
	} else {

		/* remember which round we belong to */
		generation = b->generation;

		/* make sure the thread can be cancelled while it is waiting */
		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);

		/* loop to cope with spurious wake-ups: leave only once our round is over */
		while (b->counter != b->max && generation == b->generation) {
			pthread_cond_wait(&b->cond, &b->mutex);
		}

		/* back to the cancel state the caller had */
		pthread_setcancelstate(oldstate, NULL);
	}
	pthread_mutex_unlock(&b->mutex);

	/* normal exit: remove the handler without running it */
	pthread_cleanup_pop(0);

	return 0;
}

/*
 * syswide_aggregate_results: add the counts of one CPU session into the
 * global aggregated session (sdesc_sys_aggr).
 *
 * The two lists of event sets are walked in lock-step; the walk is driven
 * by the aggregated list.
 */
static void
syswide_aggregate_results(pfmon_sdesc_t *sdesc)
{
	pfmon_event_set_t *dst = sdesc_sys_aggr.sets;
	pfmon_event_set_t *src = sdesc->sets;
	unsigned int k, nevents;

	while (dst) {
		nevents = dst->event_count;

		for (k = 0; k < nevents; k++)
			dst->master_pd[k].reg_value += src->master_pd[k].reg_value;

		dst = dst->next;
		src = src->next;
	}
}

/*
 * setup_fasync: request asynchronous (SIGIO) overflow notifications for
 * the context file descriptor id and direct them to one specific thread.
 *
 * mycpu : CPU number, used in error messages only
 * id    : context file descriptor
 * tid   : value returned by gettid() for the thread that must receive
 *         the signal
 *
 * returns 0 on success, -1 if any of the fcntl() calls failed (a warning
 * is printed).
 */
static int
setup_fasync(unsigned int mycpu, pfmon_ctxid_t id, pid_t tid)
{
	int flags;
	int r;

	/*
	 * setup asynchronous overflow notifications
	 *
	 * The current flags are checked first: if F_GETFL failed, OR-ing
	 * O_ASYNC into -1 would hand a bogus all-ones flag word to F_SETFL.
	 */
	flags = fcntl(id, F_GETFL, 0);
	if (flags == -1) {
		warning("CPU%u cannot get file flags: %s\n", mycpu, strerror(errno));
		return -1;
	}

	r = fcntl(id, F_SETFL, flags | O_ASYNC);
	if (r == -1) {
		warning("CPU%u cannot set ASYNC: %s\n", mycpu, strerror(errno));
		return -1;
	}

	/*
	 * That's the tricky part here, especially because of threads.
	 *
	 * By default, SIGIO (asynchronous signal) is delivered to any one thread in 
	 * the process, most likely the first one that does not have it blocked.
	 *
	 * For our purpose, we want the signal to be delivered to the thread that handles
	 * the session for each CPU. For instance, the SIGIO for the session on 
	 * CPU0 must be delivered to the thread running on CPU0, otherwise it is still 
	 * doable using some mapping table, but more complicated and less efficient.
	 *
	 * NPTL : gettid() = kernel task pid
	 *        getpid() = kernel tgid. Master thread tgid=pid
	 *
	 * LinuxThread:
	 * 	  gettid() = does not really exist at the user level
	 * 	  getpid() = kernel pid
	 */

	/*
	 * There is a way to override the default SIGIO behavior by using a Linux 
	 * specific extension (available via _GNU_SOURCE): the F_SETSIG command of
	 * fcntl(). Using this command, you can specify the signal to be delivered
	 * when there is an event for a file descriptor which is set in asynchronous
	 * mode (O_ASYNC). If F_SETSIG is specified, then the signal handler
	 * also receives the file descriptor number in si_fd. For what we do here 
	 * we can live with SIGIO as the signal and we do not need the file descriptor. 
	 * We use the call because of its nice side-effect on which thread gets the signal.
	 * If a signal is explicitly requested, via F_SETSIG, then ONLY the task that 
	 * is declared as the owner of the resource, via F_SETOWN, will receive the signal.
	 * Therefore, we use this command to ensure that the right thread gets the signal 
	 * independently of which thread library is used: LinuxThreads or NPTL.
	 */

	r = fcntl(id, F_SETSIG, SIGIO);
	if (r == -1) {
		warning("CPU%u cannot setsig: %s\n", mycpu, strerror(errno));
		return -1;
	}

	/*
	 * NPTL: 
	 * 	If we specify getpid() here, the first worker thread on CPU0, 
	 * 	receives the signal because the master thread has SIGIO blocked.
	 * 	This is not quite what we want. That forces us to use the actual 
	 * 	gettid() (kernel system call) value to identify precisely the 
	 * 	thread we want. 
	 *
	 * LinuxThread:
	 * 	getpid() identifies each thread uniquely, i.e., different in every 
	 * 	thread,	therefore it works. The tid passed as argument is really
	 * 	equivalent to getpid().
	 */
	r = fcntl(id, F_SETOWN, tid);
	if (r == -1) {
		warning("CPU%u cannot setown: %s\n", mycpu, strerror(errno));
		return -1;
	}
	return 0;
}

/*
 * pfmon_sys_setup_context: build the monitoring context of one CPU.
 *
 * sdesc : session descriptor of the calling worker
 * cpu   : CPU the context is loaded on
 * ctx   : generic context description used to create the context
 *
 * Steps, in order: clone the event sets, create the context, open the
 * results output, set up the sampling output (when sampling), install the
 * event sets and load the context on the CPU.
 *
 * returns 0 on success, -1 as soon as one step fails. The context file
 * descriptor, if already created, is left in sdesc->ctxid.
 */
static int 
pfmon_sys_setup_context(pfmon_sdesc_t *sdesc, unsigned int cpu, pfmon_ctx_t *ctx)
{
	pfmon_smpl_desc_t *smpl = &sdesc->csmpl;

	/*
	 * XXX: cache lines sharing for master list
	 */
	pfmon_clone_sets(options.sets, sdesc);

	memset(smpl, 0, sizeof(*smpl));
	smpl->cpu = cpu;

	if (pfmon_create_context(ctx, &smpl->smpl_hdr, &sdesc->ctxid) == -1) {
		if (errno != EBUSY)
			warning("can't create perfmon context: %s\n", strerror(errno));
		else
			warning("concurrent conflicting monitoring session is present in your system\n");
		return -1;
	}

	DPRINT(("CPU%u handles fd=%d\n", cpu, sdesc->ctxid));

	/*
	 * XXX: FD_CLOEXEC should be set on the context file descriptor. However
	 * there seems to be a bug, either in libc or in the kernel, whereby the
	 * SIGIO notification is lost when an external command is fork/exec'ed
	 * with FD_CLOEXEC set. So, for now, the context file descriptors are
	 * leaked to the command being run.
	 *
	 * Setting FD_CLOEXEC in the child process right before execvp() fixes
	 * the problem, but the context file descriptors are not easily
	 * accessible from there.
	 */

	if (open_results(sdesc) == -1)
		return -1;

	if (options.opt_use_smpl) {
		if (pfmon_setup_sampling_output(sdesc, &sdesc_sys_aggr.csmpl) == -1)
			return -1;
		DPRINT(("-->sampling buffer at %p aggr_count=%p data=%p\n", smpl->smpl_hdr, smpl->aggr_count, smpl->data));
	}

	install_event_sets(sdesc);

	return pfmon_load_context(sdesc->ctxid, cpu) == -1 ? -1 : 0;
}

/*
 * setup_worker_signals: establish the signal mask of a worker thread.
 *
 * SIGINT, SIGCHLD and SIGALRM are blocked so that their handlers never
 * run in a worker. SIGIO is unblocked because the mask inherited from the
 * master thread has it blocked.
 */
static void
setup_worker_signals(void)
{
	static const int master_signals[] = { SIGINT, SIGCHLD, SIGALRM };
	sigset_t to_block, to_unblock;
	unsigned int k;

	/*
	 * signals handled by the master only
	 */
	sigemptyset(&to_block);
	for (k = 0; k < sizeof(master_signals)/sizeof(master_signals[0]); k++)
		sigaddset(&to_block, master_signals[k]);

	/*
	 * signal handled by the workers only
	 */
	sigemptyset(&to_unblock);
	sigaddset(&to_unblock, SIGIO);

	pthread_sigmask(SIG_BLOCK, &to_block, NULL);
	pthread_sigmask(SIG_UNBLOCK, &to_unblock, NULL);
}

/*
 * do_measure_one_cpu: body of the worker thread in charge of one CPU.
 *
 * data points to the pfmon_thread_desc_t of this worker.
 *
 * The worker sets up its signal mask, pins itself on its CPU (when more
 * than one CPU is online), creates its context, and then meets the master
 * at the barrier twice: a first time to report that it is ready (or that it
 * failed), a second time to wait for the start of the session. It then
 * monitors until the master changes session_state, and finally reads and
 * prints or aggregates its results.
 *
 * The function never returns: it terminates the thread with pthread_exit(),
 * with value 0 on success and ~0UL on error.
 */
static int
do_measure_one_cpu(void *data)
{
	pfmon_thread_desc_t *arg = (pfmon_thread_desc_t *)data;
	pfmon_sdesc_t sdesc_var; /* local pfmon task descriptor */
	pfmon_sdesc_t *sdesc = &sdesc_var; 
	sem_t *my_ovfl_sem;
	pfmon_ctxid_t ctxid = -1;
	pid_t mytid = gettid();
	unsigned int mycpu;
	int aggr, needs_order;
	int reached_start = 0;	/* set once the first rendezvous with the master is done */
	int r;

	/*
	 * POSIX threads: 
	 * The signal state of the new thread is initialised as follows:
	 *    - the signal mask is inherited from the creating thread.
	 *    - the set of signals pending for the new thread is empty.
	 *
	 * we want to let the master handle the global signals, therefore
	 * we mask them here.
	 */
	setup_worker_signals();

	mycpu       = arg->cpu;
	my_ovfl_sem = ovfl_sem+arg->id;
	aggr        = options.opt_aggr;

	/*
	 * we initialize our "simplified" sdesc
	 *
	 * This must be done before the first "goto error" because the error
	 * path reads sdesc.
	 */
	memset(sdesc, 0, sizeof(*sdesc));
	/*
	 * just to make sure we have these fields initialized
	 */
	sdesc->type =  PFMON_SDESC_ATTACH;
	sdesc->tid  = mytid;
	sdesc->pid  = getpid();
	sdesc->cpu  = mycpu;
	sdesc->id   = arg->id; /* logical id */
	/*
	 * no context yet: 0 is a valid file descriptor (stdin) and must not
	 * be closed by the error path.
	 */
	sdesc->ctxid = -1;

	/*
	 * some NPTL sanity checks
	 */
	if (mytid == master_tid) {
		warning("pfmon is not compiled/linked with the correct pthread library,"
			"the program is linked with NPTL when it should not."
			"Check Makefile."
			"[pid=%d:tid=%d]\n", getpid(), mytid);
		goto error;
	}

	DPRINT(("CPU%u: pid=%d tid=%d\n", mycpu, sdesc->pid, sdesc->tid));

	pthread_setspecific(param_key, arg);

	if (options.online_cpus > 1) {

		r = pfmon_pin_self(mycpu);
		if (r == -1) {
			warning("[%d] cannot set affinity to CPU%u: %s\n", mytid, mycpu, strerror(errno));
			goto error;
		}
	}

	r = pfmon_sys_setup_context(sdesc, arg->cpu, arg->ctx);
	if (r) goto error;

	ctxid       = sdesc->ctxid;
	needs_order = aggr || sdesc->out_fp == stdout;

	DPRINT(("sdesc->id=%u needs_order=%d\n", sdesc->id, needs_order));

	if (options.opt_use_smpl) {
		r = setup_fasync(mycpu, ctxid, mytid);
		if (r) goto error;
	}

	/*
	 * indicate we have reach the starting point
	 */
	arg->thread_state = THREAD_RUN;
	barrier_wait(&barrier);
	reached_start = 1;

	/*
	 * wait for the start signal
	 */
	barrier_wait(&barrier);

	DPRINT(("CPU%u after barrier state=%d\n", mycpu, session_state));

	if (session_state == SESSION_ABORTED) goto error;

	if (options.opt_dont_start == 0) {
		if (pfmon_start(ctxid) == -1) goto error;
		vbprintf("CPU%u started monitoring\n", mycpu);
	} else {
		vbprintf("CPU%u pfmon does not start session\n", mycpu);
	}

	/*
	 * print_interval is not possible when sampling
	 */
	if (options.opt_print_interval) {

		vbprintf("nsets=%u interval=%lu\n", sdesc->nsets, options.interval);

		for(;session_state == SESSION_RUN;) {

			sleep(options.interval);

			pfmon_stop(ctxid);

			pthread_testcancel();

			read_incremental_results(sdesc);

			show_results(sdesc, needs_order, PFMON_RESULTS_INTER);

			pfmon_start(ctxid);

		}

		pthread_testcancel();
	} else {
		/*
		 * we are sampling or we do not print at regular interval and wait
		 * for the end of the session
		 */
		for(;;) {
			/* posted by the SIGIO handler or by the master at the end of the session */
			sem_wait(my_ovfl_sem);

			if (session_state != SESSION_RUN) break;

			pthread_testcancel();

			if (aggr) pthread_mutex_lock(&pfmon_sys_aggr_lock);

			pfmon_process_smpl_buf(sdesc, 0);

			if (aggr) pthread_mutex_unlock(&pfmon_sys_aggr_lock);

			pthread_testcancel();
		}
	}

	if (pfmon_stop(ctxid) == -1)
		warning("CPU%u could not stop monitoring, CPU may be offline, check results\n", mycpu);

	vbprintf("CPU%-3u stopped monitoring\n", mycpu);

	/*
	 * read the final counts
	 */
	if (options.opt_use_smpl == 0 || options.opt_smpl_print_counts) {
		if (read_results(sdesc) == -1) {
			warning("CPU%u read_results error\n", mycpu);
			goto error;
		}
	}
	DPRINT(("CPU%u has read PMDS\n", mycpu));

	/* 
	 * dump results 
	 */
	if (options.opt_aggr) {
		pthread_mutex_lock(&pfmon_sys_aggr_lock);

		syswide_aggregate_results(sdesc);

		if (options.opt_use_smpl) pfmon_process_smpl_buf(sdesc, 1);

		pthread_mutex_unlock(&pfmon_sys_aggr_lock);

	} else {
		if (options.opt_use_smpl) pfmon_process_smpl_buf(sdesc, 1);

		show_results(sdesc, needs_order, PFMON_RESULTS_FINAL);

		close_results(sdesc);
	}

	if (options.opt_use_smpl && options.opt_aggr == 0) {
		pfmon_close_sampling_output(sdesc, &sdesc->csmpl, -1, mycpu);
	}

	close(sdesc->ctxid);

	arg->thread_state = THREAD_DONE;

	DPRINT(("CPU%u is done\n", mycpu));

	pthread_exit((void *)(0));
	/* NO RETURN */
error:
	if (sdesc->ctxid > -1)
		close(sdesc->ctxid);

	vbprintf("CPU%-3u session aborted\n", mycpu);

	if (options.opt_use_smpl && options.opt_aggr == 0) {
		pfmon_close_sampling_output(sdesc, &sdesc->csmpl, -1, mycpu);
	}

	arg->thread_state = THREAD_ERROR;

	/*
	 * indicate we have reach the starting point BUT 
	 * with failures.
	 *
	 * This only applies to failures detected before the first rendezvous:
	 * the master is then waiting for us at the barrier. Once the session
	 * has started, the master never enters the barrier again, so waiting
	 * here would block this thread forever and hang the master in
	 * pthread_join().
	 */
	if (reached_start == 0)
		barrier_wait(&barrier);

	pthread_exit((void *)(~0UL));
	/* NO RETURN */
}

/*
 * SIGALRM handler, meant to run in the master thread only.
 *
 * Records REASON_TIMEOUT unless another reason was recorded before.
 */
static void
syswide_sigalarm_handler(int n, siginfo_t *info, void *sc)
{
	if (!pthread_equal(pthread_self(), master_thread_id)) {
		warning("SIGALRM not handled by master thread master\n");
		return;
	}

	if (syswide_sigreason != REASON_NONE)
		return;

	syswide_sigreason = REASON_TIMEOUT;
}

/*
 * SIGINT handler, meant to run in the master thread only.
 *
 * Records REASON_ABORT unless another reason was recorded before.
 */
static void
syswide_sigint_handler(int n, siginfo_t *info, void *sc)
{
	DPRINT(("sigint handler by %d master=%d\n", gettid(), master_tid));

	if (!pthread_equal(pthread_self(), master_thread_id)) {
		warning("SIGINT not handled by master thread master\n");
		return;
	}

	if (syswide_sigreason != REASON_NONE)
		return;

	syswide_sigreason = REASON_ABORT;
}

/*
 * SIGCHLD handler, meant to run in the master thread only.
 *
 * Only notifications reporting that the child exited or was killed are
 * taken into account. In that case, any pending alarm is cancelled and
 * REASON_CHILD is recorded unless another reason was recorded before.
 */
static void
syswide_sigchild_handler(int n, siginfo_t *info, void *sc)
{
	int child_is_dead;

	if (!pthread_equal(pthread_self(), master_thread_id)) {
		warning("SIGCHLD not handled by master thread master\n");
		return;
	}

	/*
	 * ignore the notifications which do not tell us that the
	 * process is dead
	 */
	child_is_dead = info->si_code == CLD_EXITED || info->si_code == CLD_KILLED;
	if (!child_is_dead)
		return;

	/*
	 * with a session timeout (or a trigger delay) and a child, sleep
	 * is not in use, therefore the alarm can safely be cleared.
	 */
	if (options.session_timeout || options.trigger_delay)
		alarm(0);

	if (syswide_sigreason != REASON_NONE)
		return;

	syswide_sigreason = REASON_CHILD;
}

/*
 * SIGIO handler: must run in the worker thread of each CPU and never in
 * the master thread.
 *
 * The signal is assumed to be delivered to the thread which owns the
 * context that overflowed. The worker descriptor is retrieved from
 * thread-specific data.
 */
static void
syswide_sigio_handler(int n, siginfo_t *info, void *sc)
{
	pfmon_thread_desc_t *me;

	if (pthread_equal(pthread_self(), master_thread_id)) {
		warning("error: SIGIO received by master thread\n");
		return;
	}

	me = (pfmon_thread_desc_t *)pthread_getspecific(param_key);

	/* statistics: one more overflow notification on this CPU */
	ovfl_cnts[me->cpu] += 1;

	/*
	 * wake up the worker main loop so that the sampling buffer is
	 * processed once the handler has returned
	 */
	sem_post(&ovfl_sem[me->id]);
}


static void
setup_global_signals(void)
{
	struct sigaction act;
	sigset_t my_set;

	/*
	 * SIGALRM, SIGINT, SIGCHILD are all asynchronous signals
	 * sent to the process (not a specific thread). POSIX states
	 * that one and only one thread will execute the handler. This
	 * could be any thread that does not have the signal blocked.
	 *
	 * For us, SIGALARM, SIGINT, and SIGCHILD are only handled by 
	 * the master thread. Therefore all the per-CPU worker thread
	 * MUST have those signals blocked.
	 *
	 * Conversly, SIGIO must be delivered to the worker threads.
	 * We cannot control which of the worker thread will get the
	 * signal. 
	 */

	/*
	 * install SIGALRM handler
	 */
	memset(&act,0,sizeof(act));

	sigemptyset(&my_set);
	sigaddset(&my_set, SIGCHLD);
	sigaddset(&my_set, SIGINT);

	act.sa_mask    = my_set;
	act.sa_flags   = SA_SIGINFO;
	act.sa_handler = (sig_t)syswide_sigalarm_handler;

	sigaction (SIGALRM, &act, 0);
	
	/* 
	 * install SIGCHLD handler
	 */
	memset(&act,0,sizeof(act));

	sigemptyset(&my_set);
	sigaddset(&my_set, SIGALRM);
	sigaddset(&my_set, SIGINT);

	act.sa_mask    = my_set;
	act.sa_flags   = SA_SIGINFO;
	act.sa_handler = (__sighandler_t)syswide_sigchild_handler;

	sigaction (SIGCHLD, &act, 0);

	/*
	 * install SIGINT handler
	 */
	memset(&act,0,sizeof(act));

	sigemptyset(&my_set);
	sigaddset(&my_set, SIGCHLD);
	sigaddset(&my_set, SIGALRM);

	act.sa_mask    = my_set;
	act.sa_flags   = SA_SIGINFO;
	act.sa_handler = (__sighandler_t)syswide_sigint_handler;

	sigaction (SIGINT, &act, 0);

	/*
	 * install global SIGIO handler
	 * used by worker thread only,
	 * no need to have other signals 
	 * mask during handler execution because
	 * they are completely masked for the thread.
	 */
	memset(&act,0,sizeof(act));

	act.sa_handler = (__sighandler_t)syswide_sigio_handler;
	act.sa_flags   = SA_SIGINFO;

	sigaction (SIGIO, &act, 0);

	/*
	 * master thread does not handle SIGIO
	 * (inherited in sub threads)
	 */
        sigemptyset(&my_set);
        sigaddset(&my_set, SIGIO);
	pthread_sigmask(SIG_BLOCK, &my_set, NULL);
}

/*
 * exit_system_wide: exit function registered for the duration of the
 * system-wide session. Marks the calling worker as failed and terminates
 * the calling thread, using i as the thread exit value.
 *
 * NOTE(review): the descriptor comes from thread-specific data which is
 * only set by the worker threads; presumably this is never invoked from
 * the master thread, where the pointer would be NULL -- confirm.
 */
static void
exit_system_wide(int i)
{
	pfmon_thread_desc_t *me = (pfmon_thread_desc_t *)pthread_getspecific(param_key);
	void *exit_value = (void *)((unsigned long)i);

	DPRINT(("thread on CPU%-3u aborting\n", me->cpu));

	me->thread_state = THREAD_ERROR;

	pthread_exit(exit_value);
}

/*
 * delay_start: sleep for options.trigger_delay seconds before the session
 * is activated.
 *
 * returns 0 if the full delay elapsed, -1 if the sleep was cut short.
 */
static int
delay_start(void)
{
	unsigned int remaining;

	vbprintf("delaying start for %u seconds\n", options.trigger_delay);

	/*
	 * sleep() reports the number of seconds left when it is interrupted
	 * by a signal (SIGINT or SIGCHLD here), 0 otherwise
	 */
	remaining = sleep(options.trigger_delay);

	DPRINT(("delay_start: left_over=%u\n", remaining));

	if (remaining != 0)
		return -1;

	return 0;
}

/*
 * delimit_session: start the measurement in all workers and block until
 * the session is over. Called by the master thread.
 *
 * argv : command to run during the session, or NULL (or an empty vector)
 *        when there is no command.
 *
 * Without a command, the session lasts for options.session_timeout
 * seconds or, when no timeout is set, until a line is read from stdin.
 * With a command, the command is forked and the session lasts until it
 * terminates.
 *
 * returns -1 if the session could not be started (delay interrupted or
 * fork failure) or if waiting for the command failed for a reason other
 * than EINTR. Any other value means the results collected can be printed.
 */
static int
delimit_session(char **argv)
{
	struct timeval time_start, time_end;
	time_t the_time;
	struct rusage ru;
	unsigned left_over;
	pid_t pid;
	int status;
	int ret = 0;

	/*
	 * take care of the easy case first: no command to start
	 */
	if (argv == NULL || *argv == NULL) {

		if (options.trigger_delay && delay_start() == -1)
			return -1;

		/*
		 * this will start the session in each "worker" thread
		 */
		barrier_wait(&barrier);

		time(&the_time);
		vbprintf("measurements started on %s\n", asctime(localtime(&the_time)));

		/* stays 0 if the session is interrupted */
		the_time = 0;

		if (options.session_timeout) {
			printf("<session to end in %u seconds>\n", options.session_timeout);

			left_over = sleep(options.session_timeout);
			if (left_over)
				printf("session interrupted by user\n");
			else
				time(&the_time);
		} else {
			printf("<press ENTER to stop session>\n");

			ret = getchar();
			if (ret == EOF) 
				printf("session interrupted by user\n");
			else
				time(&the_time);
		}
		if (the_time) vbprintf("measurements completed at %s\n", asctime(localtime(&the_time)));

		return 0;
	}
	gettimeofday(&time_start, NULL);

	/*
	 * we fork+exec the command to run during our system wide monitoring
	 * session. When the command ends, we stop the session and print
	 * the results.
	 */
	if ((pid=fork()) == -1) {
		warning("Cannot fork new process\n");
		return -1;
	}

	if (pid == 0) {		 
		pid = getpid();

		if (options.opt_verbose) {
			char **p = argv;
			vbprintf("starting process [%d]: ", pid);
			while (*p) vbprintf("%s ", *p++);
			vbprintf("\n");
		}
		if (options.opt_pin_cmd) {
			vbprintf("applied cpu-list for %s\n", *argv);
			if (pfmon_set_affinity(pid,  options.virt_cpu_mask)) {
				warning("could not pin %s to cpu-list\n", *argv);
			}
		}
		/*
		 * The use of ptrace() allows us to actually start monitoring after the exec()
		 * is done, i.e., when the new program is ready to go back to user mode for the
		 * "first time". With this technique, we can actually activate the workers
		 * only when the process is ready to execute. Hence, we can capture even
		 * the short lived workloads without measuring the overhead caused by fork/exec.
		 * We will capture the overhead of the PTRACE_DETACH, though.
		 */
		if (options.trigger_delay == 0) {
			if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1) {
				warning("cannot ptrace me: %s\n", strerror(errno));
				exit(1);
			}
		}
		if (options.opt_cmd_no_verbose) {
			/*
			 * silence the command: a single descriptor on /dev/null
			 * is duplicated onto stdout and stderr, then released so
			 * that it is not leaked to the command.
			 */
			int null_fd = open("/dev/null", O_WRONLY);

			if (null_fd == -1) {
				warning("cannot open /dev/null: %s\n", strerror(errno));
			} else {
				dup2(null_fd, 1);
				dup2(null_fd, 2);
				if (null_fd > 2) close(null_fd);
			}
		}	

		execvp(argv[0], argv);

		warning("child: cannot exec %s: %s\n", argv[0], strerror(errno));
		exit(-1);
	} 

	if (options.trigger_delay) {
		if (delay_start() == -1) {
			warning("process %d terminated before session was activated, nothing measured\n", pid);
			return -1;
		}
	} else {
		vbprintf("waiting for [%d] to exec\n", pid);
		/* 
	 	 * wait for the child to exec 
	 	 */
		waitpid(pid, &status, WUNTRACED);
	}

	/*
	 * this will start the session in each "worker" thread
	 *
	 */
	barrier_wait(&barrier);

	/*
	 * let the task run free now
	 */
	if (options.trigger_delay == 0)	ptrace(PTRACE_DETACH, pid, NULL, NULL);

	ret = wait4(pid, &status, 0, &ru);

	gettimeofday(&time_end, NULL);

	if (ret == -1) {
		if (errno == EINTR) { 
			printf("session interrupted by user\n"); 
			ret = 0;  /* will cause the session to print results so far */
		} else {
			vbprintf("unexpected wait() error for [%d]: %s\n", pid, strerror(errno));
		}
	} else {
		if (WEXITSTATUS(status) != 0) {
			warning("process %d exited with non zero value (%d): results may be incorrect\n", 
				pid, WEXITSTATUS(status));
		}
		if (options.opt_show_rusage) show_task_rusage(&time_start, &time_end, &ru);
	}
	return ret;
}

/*
 * measure_system_wide: run a system-wide (per-CPU) monitoring session.
 *
 * ctx  : generic context description, shared by all workers
 * argv : command to run during the session, or NULL
 *
 * One worker thread is created for each CPU set in options.virt_cpu_mask
 * (options.selected_cpus of them). The calling thread becomes the master:
 * it waits for all workers to be ready, delimits the session, then stops
 * and joins the workers and prints the aggregated results if requested.
 *
 * returns 0 on success, -1 if the session could not be set up or had to
 * be aborted.
 */
int
measure_system_wide(pfmon_ctx_t *ctx, char **argv)
{
	void *retval;
	unsigned long i, j, num_cpus;
	int ret;

	master_tid       = gettid();
	master_thread_id = pthread_self();

	setup_global_signals();

	if (options.opt_aggr) {
		/*
		 * used by syswide_aggregate_results()
		 */
		pfmon_clone_sets(options.sets, &sdesc_sys_aggr);

		if (pfmon_setup_aggr_sampling_output(&sdesc_sys_aggr, &sdesc_sys_aggr.csmpl) == -1) 
			return -1;
	}

	session_state = SESSION_INIT;

	num_cpus = options.selected_cpus;

	vbprintf("system wide session on %lu processor(s)\n", num_cpus);

	/*
	 * all the workers plus the master meet at the barrier
	 */
	if (barrier_init(&barrier, num_cpus+1) != 0) {
		warning("cannot initialize barrier\n");
		return -1;
	}

	ret = pthread_key_create(&param_key, NULL);
	if (ret != 0) {
		warning("cannot create thread key: %s\n", strerror(ret));
		return -1;
	}

	pthread_cond_init(&results_cond, NULL);

	register_exit_function(exit_system_wide);

	/*
	 * i walks the CPU numbers, j counts the workers created so far
	 */
	for(i=0, j=0; num_cpus; i++) {
		
		if (PFMON_CPUMASK_ISSET(options.virt_cpu_mask, i) == 0) continue;

		thread_desc[j].id    = j;
		thread_desc[j].cpu   = i;

		thread_desc[j].thread_state = THREAD_STARTED;
		thread_desc[j].ctx   = ctx;

		sem_init(ovfl_sem+j, 0, 0);

		ret = pthread_create(&thread_desc[j].thread_id, 
				     NULL, 
				     (void *(*)(void *))do_measure_one_cpu, 
				     thread_desc+j);

		if (ret != 0) {
			warning("cannot create thread for CPU%lu: %s\n", i, strerror(ret));
			goto abort;
		}

		DPRINT(("created thread[%lu], %lu\n", j, (unsigned long)thread_desc[j].thread_id));

		num_cpus--;
		j++;
	}

	/* reload number of cpus */
	num_cpus = options.selected_cpus;

	/*
	 * wait for all worker thread to have reach the starting point
	 */
	barrier_wait(&barrier);

	/*
	 * check if some threads got problems
	 */
	for(i=0; i < num_cpus ; i++) {
		if (thread_desc[i].thread_state == THREAD_ERROR) {
			DPRINT(("CPU%-4u thread aborted\n", thread_desc[i].cpu));
			goto abort;
		}
	}

	session_state = SESSION_RUN;

	if (delimit_session(argv) == -1) goto abort;

	/*
	 * set end of session and unblock all threads
	 */
	session_state = SESSION_STOP;

	/*
	 * get worker thread out of their mainloop
	 */
	for (i=0; i < num_cpus; i++) sem_post(ovfl_sem+i);

	DPRINT(("main thread after session stop\n"));

	for(i=0; i< num_cpus; i++) {
		ret = pthread_join(thread_desc[i].thread_id, &retval);
		if (ret !=0) warning("cannot join thread %lu\n", i);
		DPRINT(("CPU%-4u thread exited with value %ld\n", thread_desc[i].cpu, (long)retval));
	}

	if (options.opt_aggr) {
		print_results(&sdesc_sys_aggr); /* mask taken from options.virt_cpu_mask */
		if (options.opt_use_smpl) {
			pfmon_close_aggr_sampling_output(&sdesc_sys_aggr, &sdesc_sys_aggr.csmpl);
		}
	}

	pthread_key_delete(param_key);

	register_exit_function(NULL);

	if (options.opt_verbose && options.opt_use_smpl) {
		num_cpus = options.selected_cpus;
		for(i=0; num_cpus; i++) { 
			if (PFMON_CPUMASK_ISSET(options.virt_cpu_mask, i)) {
				vbprintf("CPU%-4lu %"PRIu64" sampling buffer overflows\n", i, ovfl_cnts[i]);
				num_cpus--;
			}
		}
	}

	return 0;
abort:
	/*
	 * j is the number of workers actually created: it is smaller than
	 * options.selected_cpus when pthread_create() failed. Only those
	 * threads may be woken up, cancelled and joined.
	 */
	num_cpus = j;

	if (session_state == SESSION_RUN) 
		for (i=0; i < num_cpus; i++) sem_post(ovfl_sem+i);

	session_state = SESSION_ABORTED;

	vbprintf("aborting %lu threads\n", num_cpus);

	for(i=0; i < num_cpus; i++) {
		DPRINT(("cancelling on CPU%lu\n", i));
		pthread_cancel(thread_desc[i].thread_id);
	}
	for(i=0; i < num_cpus; i++) {
		ret = pthread_join(thread_desc[i].thread_id, &retval);
		if (ret != 0) warning("cannot join thread %lu\n", i);
		DPRINT(("CPU%-3u thread exited with value %ld\n", thread_desc[i].cpu, (long)retval));
	}

	pthread_key_delete(param_key);

	register_exit_function(NULL);

	return -1;
}
