      1 #include "builtin.h"
      2 #include "perf.h"
      3 
      4 #include "util/util.h"
      5 #include "util/evlist.h"
      6 #include "util/cache.h"
      7 #include "util/evsel.h"
      8 #include "util/symbol.h"
      9 #include "util/thread.h"
     10 #include "util/header.h"
     11 #include "util/session.h"
     12 #include "util/tool.h"
     13 
     14 #include "util/parse-options.h"
     15 #include "util/trace-event.h"
     16 
     17 #include "util/debug.h"
     18 
     19 #include <sys/prctl.h>
     20 #include <sys/resource.h>
     21 
     22 #include <semaphore.h>
     23 #include <pthread.h>
     24 #include <math.h>
     25 
     26 #define PR_SET_NAME		15               /* Set process name */
     27 #define MAX_CPUS		4096
     28 #define COMM_LEN		20
     29 #define SYM_LEN			129
     30 #define MAX_PID			65536
     31 
     32 struct sched_atom;
     33 
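        /*
         * Replay-side description of one recorded task: its pid, comm and the
         * ordered array of sched atoms that a worker thread replays, plus the
         * semaphores used to synchronize that thread with the parent.
         */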
     34 struct task_desc {
     35 	unsigned long		nr;
     36 	unsigned long		pid;
     37 	char			comm[COMM_LEN];
     38 
     39 	unsigned long		nr_events;
     40 	unsigned long		curr_event;
     41 	struct sched_atom	**atoms;
     42 
     43 	pthread_t		thread;
     44 	sem_t			sleep_sem;
     45 
     46 	sem_t			ready_for_work;
     47 	sem_t			work_done_sem;
     48 
     49 	u64			cpu_usage;
     50 };
     51 
     52 enum sched_event_type {
     53 	SCHED_EVENT_RUN,
     54 	SCHED_EVENT_SLEEP,
     55 	SCHED_EVENT_WAKEUP,
     56 	SCHED_EVENT_MIGRATION,
     57 };
     58 
     59 struct sched_atom {
     60 	enum sched_event_type	type;
     61 	int			specific_wait;
     62 	u64			timestamp;
     63 	u64			duration;
     64 	unsigned long		nr;
     65 	sem_t			*wait_sem;
     66 	struct task_desc	*wakee;
     67 };
     68 
     69 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
     70 
     71 enum thread_state {
     72 	THREAD_SLEEPING = 0,
     73 	THREAD_WAIT_CPU,
     74 	THREAD_SCHED_IN,
     75 	THREAD_IGNORE
     76 };
     77 
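        /*
         * One scheduling interval of a thread for the latency analysis:
         * sched-out, wake-up and sched-in timestamps plus accumulated runtime.
         */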
     78 struct work_atom {
     79 	struct list_head	list;
     80 	enum thread_state	state;
     81 	u64			sched_out_time;
     82 	u64			wake_up_time;
     83 	u64			sched_in_time;
     84 	u64			runtime;
     85 };
     86 
     87 struct work_atoms {
     88 	struct list_head	work_list;
     89 	struct thread		*thread;
     90 	struct rb_node		node;
     91 	u64			max_lat;
     92 	u64			max_lat_at;
     93 	u64			total_lat;
     94 	u64			nb_atoms;
     95 	u64			total_runtime;
     96 };
     97 
     98 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
     99 
    100 struct perf_sched;
    101 
    102 struct trace_sched_handler {
    103 	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
    104 			    struct perf_sample *sample, struct machine *machine);
    105 
    106 	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
    107 			     struct perf_sample *sample, struct machine *machine);
    108 
    109 	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
    110 			    struct perf_sample *sample, struct machine *machine);
    111 
    112 	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
    113 	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
    114 			  struct machine *machine);
    115 
    116 	int (*migrate_task_event)(struct perf_sched *sched,
    117 				  struct perf_evsel *evsel,
    118 				  struct perf_sample *sample,
    119 				  struct machine *machine);
    120 };
    121 
    122 struct perf_sched {
    123 	struct perf_tool tool;
    124 	const char	 *sort_order;
    125 	unsigned long	 nr_tasks;
    126 	struct task_desc *pid_to_task[MAX_PID];
    127 	struct task_desc **tasks;
    128 	const struct trace_sched_handler *tp_handler;
    129 	pthread_mutex_t	 start_work_mutex;
    130 	pthread_mutex_t	 work_done_wait_mutex;
    131 	int		 profile_cpu;
    132 /*
    133  * Track the current task - that way we can know whether there are any
    134  * weird events, such as a task being switched away that is not current.
    135  */
    136 	int		 max_cpu;
    137 	u32		 curr_pid[MAX_CPUS];
    138 	struct thread	 *curr_thread[MAX_CPUS];
    139 	char		 next_shortname1;
    140 	char		 next_shortname2;
    141 	unsigned int	 replay_repeat;
    142 	unsigned long	 nr_run_events;
    143 	unsigned long	 nr_sleep_events;
    144 	unsigned long	 nr_wakeup_events;
    145 	unsigned long	 nr_sleep_corrections;
    146 	unsigned long	 nr_run_events_optimized;
    147 	unsigned long	 targetless_wakeups;
    148 	unsigned long	 multitarget_wakeups;
    149 	unsigned long	 nr_runs;
    150 	unsigned long	 nr_timestamps;
    151 	unsigned long	 nr_unordered_timestamps;
    152 	unsigned long	 nr_state_machine_bugs;
    153 	unsigned long	 nr_context_switch_bugs;
    154 	unsigned long	 nr_events;
    155 	unsigned long	 nr_lost_chunks;
    156 	unsigned long	 nr_lost_events;
    157 	u64		 run_measurement_overhead;
    158 	u64		 sleep_measurement_overhead;
    159 	u64		 start_time;
    160 	u64		 cpu_usage;
    161 	u64		 runavg_cpu_usage;
    162 	u64		 parent_cpu_usage;
    163 	u64		 runavg_parent_cpu_usage;
    164 	u64		 sum_runtime;
    165 	u64		 sum_fluct;
    166 	u64		 run_avg;
    167 	u64		 all_runtime;
    168 	u64		 all_count;
    169 	u64		 cpu_last_switched[MAX_CPUS];
    170 	struct rb_root	 atom_root, sorted_atom_root;
    171 	struct list_head sort_list, cmp_pid;
    172 };
    173 
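        /*
         * Timing helpers for replay: get_nsecs() reads CLOCK_MONOTONIC,
         * burn_nsecs() spins for roughly the given duration (compensating for
         * the measured loop overhead), sleep_nsecs() blocks in nanosleep().
         */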
    174 static u64 get_nsecs(void)
    175 {
    176 	struct timespec ts;
    177 
    178 	clock_gettime(CLOCK_MONOTONIC, &ts);
    179 
    180 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    181 }
    182 
    183 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
    184 {
    185 	u64 T0 = get_nsecs(), T1;
    186 
    187 	do {
    188 		T1 = get_nsecs();
    189 	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
    190 }
    191 
    192 static void sleep_nsecs(u64 nsecs)
    193 {
    194 	struct timespec ts;
    195 
    196 	ts.tv_nsec = nsecs % 1000000000ULL;
    197 	ts.tv_sec = nsecs / 1000000000ULL;
    198 
    199 	nanosleep(&ts, NULL);
    200 }
    201 
    202 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
    203 {
    204 	u64 T0, T1, delta, min_delta = 1000000000ULL;
    205 	int i;
    206 
    207 	for (i = 0; i < 10; i++) {
    208 		T0 = get_nsecs();
    209 		burn_nsecs(sched, 0);
    210 		T1 = get_nsecs();
    211 		delta = T1-T0;
    212 		min_delta = min(min_delta, delta);
    213 	}
    214 	sched->run_measurement_overhead = min_delta;
    215 
    216 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
    217 }
    218 
    219 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
    220 {
    221 	u64 T0, T1, delta, min_delta = 1000000000ULL;
    222 	int i;
    223 
    224 	for (i = 0; i < 10; i++) {
    225 		T0 = get_nsecs();
    226 		sleep_nsecs(10000);
    227 		T1 = get_nsecs();
    228 		delta = T1-T0;
    229 		min_delta = min(min_delta, delta);
    230 	}
    231 	min_delta -= 10000;
    232 	sched->sleep_measurement_overhead = min_delta;
    233 
    234 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
    235 }
    236 
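        /*
         * Append a new, zero-initialized sched atom to the task's atom array,
         * growing the array with realloc().
         */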
    237 static struct sched_atom *
    238 get_new_event(struct task_desc *task, u64 timestamp)
    239 {
    240 	struct sched_atom *event = zalloc(sizeof(*event));
    241 	unsigned long idx = task->nr_events;
    242 	size_t size;
    243 
    244 	event->timestamp = timestamp;
    245 	event->nr = idx;
    246 
    247 	task->nr_events++;
    248 	size = sizeof(struct sched_atom *) * task->nr_events;
    249 	task->atoms = realloc(task->atoms, size);
    250 	BUG_ON(!task->atoms);
    251 
    252 	task->atoms[idx] = event;
    253 
    254 	return event;
    255 }
    256 
    257 static struct sched_atom *last_event(struct task_desc *task)
    258 {
    259 	if (!task->nr_events)
    260 		return NULL;
    261 
    262 	return task->atoms[task->nr_events - 1];
    263 }
    264 
    265 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
    266 				u64 timestamp, u64 duration)
    267 {
    268 	struct sched_atom *event, *curr_event = last_event(task);
    269 
    270 	/*
    271 	 * optimize an existing RUN event by merging this one
    272 	 * into it:
    273 	 */
    274 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
    275 		sched->nr_run_events_optimized++;
    276 		curr_event->duration += duration;
    277 		return;
    278 	}
    279 
    280 	event = get_new_event(task, timestamp);
    281 
    282 	event->type = SCHED_EVENT_RUN;
    283 	event->duration = duration;
    284 
    285 	sched->nr_run_events++;
    286 }
    287 
    288 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
    289 				   u64 timestamp, struct task_desc *wakee)
    290 {
    291 	struct sched_atom *event, *wakee_event;
    292 
    293 	event = get_new_event(task, timestamp);
    294 	event->type = SCHED_EVENT_WAKEUP;
    295 	event->wakee = wakee;
    296 
    297 	wakee_event = last_event(wakee);
    298 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
    299 		sched->targetless_wakeups++;
    300 		return;
    301 	}
    302 	if (wakee_event->wait_sem) {
    303 		sched->multitarget_wakeups++;
    304 		return;
    305 	}
    306 
    307 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
    308 	sem_init(wakee_event->wait_sem, 0, 0);
    309 	wakee_event->specific_wait = 1;
    310 	event->wait_sem = wakee_event->wait_sem;
    311 
    312 	sched->nr_wakeup_events++;
    313 }
    314 
    315 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
    316 				  u64 timestamp, u64 task_state __maybe_unused)
    317 {
    318 	struct sched_atom *event = get_new_event(task, timestamp);
    319 
    320 	event->type = SCHED_EVENT_SLEEP;
    321 
    322 	sched->nr_sleep_events++;
    323 }
    324 
    325 static struct task_desc *register_pid(struct perf_sched *sched,
    326 				      unsigned long pid, const char *comm)
    327 {
    328 	struct task_desc *task;
    329 
    330 	BUG_ON(pid >= MAX_PID);
    331 
    332 	task = sched->pid_to_task[pid];
    333 
    334 	if (task)
    335 		return task;
    336 
    337 	task = zalloc(sizeof(*task));
    338 	task->pid = pid;
    339 	task->nr = sched->nr_tasks;
    340 	strcpy(task->comm, comm);
    341 	/*
    342 	 * every task starts in sleeping state - this gets ignored
    343 	 * if there's no wakeup pointing to this sleep state:
    344 	 */
    345 	add_sched_event_sleep(sched, task, 0, 0);
    346 
    347 	sched->pid_to_task[pid] = task;
    348 	sched->nr_tasks++;
    349 	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
    350 	BUG_ON(!sched->tasks);
    351 	sched->tasks[task->nr] = task;
    352 
    353 	if (verbose)
    354 		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
    355 
    356 	return task;
    357 }
    358 
    359 
    360 static void print_task_traces(struct perf_sched *sched)
    361 {
    362 	struct task_desc *task;
    363 	unsigned long i;
    364 
    365 	for (i = 0; i < sched->nr_tasks; i++) {
    366 		task = sched->tasks[i];
    367 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
    368 			task->nr, task->comm, task->pid, task->nr_events);
    369 	}
    370 }
    371 
    372 static void add_cross_task_wakeups(struct perf_sched *sched)
    373 {
    374 	struct task_desc *task1, *task2;
    375 	unsigned long i, j;
    376 
    377 	for (i = 0; i < sched->nr_tasks; i++) {
    378 		task1 = sched->tasks[i];
    379 		j = i + 1;
    380 		if (j == sched->nr_tasks)
    381 			j = 0;
    382 		task2 = sched->tasks[j];
    383 		add_sched_event_wakeup(sched, task1, 0, task2);
    384 	}
    385 }
    386 
    387 static void perf_sched__process_event(struct perf_sched *sched,
    388 				      struct sched_atom *atom)
    389 {
    390 	int ret = 0;
    391 
    392 	switch (atom->type) {
    393 		case SCHED_EVENT_RUN:
    394 			burn_nsecs(sched, atom->duration);
    395 			break;
    396 		case SCHED_EVENT_SLEEP:
    397 			if (atom->wait_sem)
    398 				ret = sem_wait(atom->wait_sem);
    399 			BUG_ON(ret);
    400 			break;
    401 		case SCHED_EVENT_WAKEUP:
    402 			if (atom->wait_sem)
    403 				ret = sem_post(atom->wait_sem);
    404 			BUG_ON(ret);
    405 			break;
    406 		case SCHED_EVENT_MIGRATION:
    407 			break;
    408 		default:
    409 			BUG_ON(1);
    410 	}
    411 }
    412 
    413 static u64 get_cpu_usage_nsec_parent(void)
    414 {
    415 	struct rusage ru;
    416 	u64 sum;
    417 	int err;
    418 
    419 	err = getrusage(RUSAGE_SELF, &ru);
    420 	BUG_ON(err);
    421 
    422 	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
    423 	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
    424 
    425 	return sum;
    426 }
    427 
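        /*
         * Open a per-thread task-clock software counter so each replay thread
         * can read its own CPU time in get_cpu_usage_nsec_self().
         */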
    428 static int self_open_counters(void)
    429 {
    430 	struct perf_event_attr attr;
    431 	int fd;
    432 
    433 	memset(&attr, 0, sizeof(attr));
    434 
    435 	attr.type = PERF_TYPE_SOFTWARE;
    436 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
    437 
    438 	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
    439 
    440 	if (fd < 0)
    441 		pr_err("Error: sys_perf_event_open() syscall returned "
    442 		       "with %d (%s)\n", fd, strerror(errno));
    443 	return fd;
    444 }
    445 
    446 static u64 get_cpu_usage_nsec_self(int fd)
    447 {
    448 	u64 runtime;
    449 	int ret;
    450 
    451 	ret = read(fd, &runtime, sizeof(runtime));
    452 	BUG_ON(ret != sizeof(runtime));
    453 
    454 	return runtime;
    455 }
    456 
    457 struct sched_thread_parms {
    458 	struct task_desc  *task;
    459 	struct perf_sched *sched;
    460 };
    461 
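        /*
         * Body of every replay worker thread: signal readiness, wait for the
         * parent to release start_work_mutex, replay this task's atoms while
         * measuring CPU time via the task-clock counter, report completion on
         * work_done_sem and loop for the next iteration.
         */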
    462 static void *thread_func(void *ctx)
    463 {
    464 	struct sched_thread_parms *parms = ctx;
    465 	struct task_desc *this_task = parms->task;
    466 	struct perf_sched *sched = parms->sched;
    467 	u64 cpu_usage_0, cpu_usage_1;
    468 	unsigned long i, ret;
    469 	char comm2[22];
    470 	int fd;
    471 
    472 	free(parms);
    473 
    474 	sprintf(comm2, ":%s", this_task->comm);
    475 	prctl(PR_SET_NAME, comm2);
    476 	fd = self_open_counters();
    477 	if (fd < 0)
    478 		return NULL;
    479 again:
    480 	ret = sem_post(&this_task->ready_for_work);
    481 	BUG_ON(ret);
    482 	ret = pthread_mutex_lock(&sched->start_work_mutex);
    483 	BUG_ON(ret);
    484 	ret = pthread_mutex_unlock(&sched->start_work_mutex);
    485 	BUG_ON(ret);
    486 
    487 	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
    488 
    489 	for (i = 0; i < this_task->nr_events; i++) {
    490 		this_task->curr_event = i;
    491 		perf_sched__process_event(sched, this_task->atoms[i]);
    492 	}
    493 
    494 	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
    495 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
    496 	ret = sem_post(&this_task->work_done_sem);
    497 	BUG_ON(ret);
    498 
    499 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
    500 	BUG_ON(ret);
    501 	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
    502 	BUG_ON(ret);
    503 
    504 	goto again;
    505 }
    506 
    507 static void create_tasks(struct perf_sched *sched)
    508 {
    509 	struct task_desc *task;
    510 	pthread_attr_t attr;
    511 	unsigned long i;
    512 	int err;
    513 
    514 	err = pthread_attr_init(&attr);
    515 	BUG_ON(err);
    516 	err = pthread_attr_setstacksize(&attr,
    517 			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
    518 	BUG_ON(err);
    519 	err = pthread_mutex_lock(&sched->start_work_mutex);
    520 	BUG_ON(err);
    521 	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
    522 	BUG_ON(err);
    523 	for (i = 0; i < sched->nr_tasks; i++) {
    524 		struct sched_thread_parms *parms = malloc(sizeof(*parms));
    525 		BUG_ON(parms == NULL);
    526 		parms->task = task = sched->tasks[i];
    527 		parms->sched = sched;
    528 		sem_init(&task->sleep_sem, 0, 0);
    529 		sem_init(&task->ready_for_work, 0, 0);
    530 		sem_init(&task->work_done_sem, 0, 0);
    531 		task->curr_event = 0;
    532 		err = pthread_create(&task->thread, &attr, thread_func, parms);
    533 		BUG_ON(err);
    534 	}
    535 }
    536 
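        /*
         * Parent side of one replay iteration: release the worker threads,
         * wait for all of them to finish their atoms, and maintain running
         * averages of the children's and the parent's CPU usage.
         */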
    537 static void wait_for_tasks(struct perf_sched *sched)
    538 {
    539 	u64 cpu_usage_0, cpu_usage_1;
    540 	struct task_desc *task;
    541 	unsigned long i, ret;
    542 
    543 	sched->start_time = get_nsecs();
    544 	sched->cpu_usage = 0;
    545 	pthread_mutex_unlock(&sched->work_done_wait_mutex);
    546 
    547 	for (i = 0; i < sched->nr_tasks; i++) {
    548 		task = sched->tasks[i];
    549 		ret = sem_wait(&task->ready_for_work);
    550 		BUG_ON(ret);
    551 		sem_init(&task->ready_for_work, 0, 0);
    552 	}
    553 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
    554 	BUG_ON(ret);
    555 
    556 	cpu_usage_0 = get_cpu_usage_nsec_parent();
    557 
    558 	pthread_mutex_unlock(&sched->start_work_mutex);
    559 
    560 	for (i = 0; i < sched->nr_tasks; i++) {
    561 		task = sched->tasks[i];
    562 		ret = sem_wait(&task->work_done_sem);
    563 		BUG_ON(ret);
    564 		sem_init(&task->work_done_sem, 0, 0);
    565 		sched->cpu_usage += task->cpu_usage;
    566 		task->cpu_usage = 0;
    567 	}
    568 
    569 	cpu_usage_1 = get_cpu_usage_nsec_parent();
    570 	if (!sched->runavg_cpu_usage)
    571 		sched->runavg_cpu_usage = sched->cpu_usage;
    572 	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
    573 
    574 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
    575 	if (!sched->runavg_parent_cpu_usage)
    576 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
    577 	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
    578 					 sched->parent_cpu_usage)/10;
    579 
    580 	ret = pthread_mutex_lock(&sched->start_work_mutex);
    581 	BUG_ON(ret);
    582 
    583 	for (i = 0; i < sched->nr_tasks; i++) {
    584 		task = sched->tasks[i];
    585 		sem_init(&task->sleep_sem, 0, 0);
    586 		task->curr_event = 0;
    587 	}
    588 }
    589 
    590 static void run_one_test(struct perf_sched *sched)
    591 {
    592 	u64 T0, T1, delta, avg_delta, fluct;
    593 
    594 	T0 = get_nsecs();
    595 	wait_for_tasks(sched);
    596 	T1 = get_nsecs();
    597 
    598 	delta = T1 - T0;
    599 	sched->sum_runtime += delta;
    600 	sched->nr_runs++;
    601 
    602 	avg_delta = sched->sum_runtime / sched->nr_runs;
    603 	if (delta < avg_delta)
    604 		fluct = avg_delta - delta;
    605 	else
    606 		fluct = delta - avg_delta;
    607 	sched->sum_fluct += fluct;
    608 	if (!sched->run_avg)
    609 		sched->run_avg = delta;
    610 	sched->run_avg = (sched->run_avg * 9 + delta) / 10;
    611 
    612 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
    613 
    614 	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
    615 
    616 	printf("cpu: %0.2f / %0.2f",
    617 		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
    618 
    619 #if 0
    620 	/*
    621 	 * rusage statistics are done by the parent; these are less
    622 	 * accurate than the sched->sum_exec_runtime based statistics:
    623 	 */
    624 	printf(" [%0.2f / %0.2f]",
    625 		(double)sched->parent_cpu_usage/1e6,
    626 		(double)sched->runavg_parent_cpu_usage/1e6);
    627 #endif
    628 
    629 	printf("\n");
    630 
    631 	if (sched->nr_sleep_corrections)
    632 		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
    633 	sched->nr_sleep_corrections = 0;
    634 }
    635 
    636 static void test_calibrations(struct perf_sched *sched)
    637 {
    638 	u64 T0, T1;
    639 
    640 	T0 = get_nsecs();
    641 	burn_nsecs(sched, 1e6);
    642 	T1 = get_nsecs();
    643 
    644 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
    645 
    646 	T0 = get_nsecs();
    647 	sleep_nsecs(1e6);
    648 	T1 = get_nsecs();
    649 
    650 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
    651 }
    652 
    653 static int
    654 replay_wakeup_event(struct perf_sched *sched,
    655 		    struct perf_evsel *evsel, struct perf_sample *sample,
    656 		    struct machine *machine __maybe_unused)
    657 {
    658 	const char *comm = perf_evsel__strval(evsel, sample, "comm");
    659 	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
    660 	struct task_desc *waker, *wakee;
    661 
    662 	if (verbose) {
    663 		printf("sched_wakeup event %p\n", evsel);
    664 
    665 		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
    666 	}
    667 
    668 	waker = register_pid(sched, sample->tid, "<unknown>");
    669 	wakee = register_pid(sched, pid, comm);
    670 
    671 	add_sched_event_wakeup(sched, waker, sample->time, wakee);
    672 	return 0;
    673 }
    674 
    675 static int replay_switch_event(struct perf_sched *sched,
    676 			       struct perf_evsel *evsel,
    677 			       struct perf_sample *sample,
    678 			       struct machine *machine __maybe_unused)
    679 {
    680 	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
    681 		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
    682 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
    683 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
    684 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
    685 	struct task_desc *prev, __maybe_unused *next;
    686 	u64 timestamp0, timestamp = sample->time;
    687 	int cpu = sample->cpu;
    688 	s64 delta;
    689 
    690 	if (verbose)
    691 		printf("sched_switch event %p\n", evsel);
    692 
    693 	if (cpu >= MAX_CPUS || cpu < 0)
    694 		return 0;
    695 
    696 	timestamp0 = sched->cpu_last_switched[cpu];
    697 	if (timestamp0)
    698 		delta = timestamp - timestamp0;
    699 	else
    700 		delta = 0;
    701 
    702 	if (delta < 0) {
    703 		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
    704 		return -1;
    705 	}
    706 
    707 	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
    708 		 prev_comm, prev_pid, next_comm, next_pid, delta);
    709 
    710 	prev = register_pid(sched, prev_pid, prev_comm);
    711 	next = register_pid(sched, next_pid, next_comm);
    712 
    713 	sched->cpu_last_switched[cpu] = timestamp;
    714 
    715 	add_sched_event_run(sched, prev, timestamp, delta);
    716 	add_sched_event_sleep(sched, prev, timestamp, prev_state);
    717 
    718 	return 0;
    719 }
    720 
    721 static int replay_fork_event(struct perf_sched *sched,
    722 			     union perf_event *event,
    723 			     struct machine *machine)
    724 {
    725 	struct thread *child, *parent;
    726 
    727 	child = machine__findnew_thread(machine, event->fork.pid,
    728 					event->fork.tid);
    729 	parent = machine__findnew_thread(machine, event->fork.ppid,
    730 					 event->fork.ptid);
    731 
    732 	if (child == NULL || parent == NULL) {
    733 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
    734 				 child, parent);
    735 		return 0;
    736 	}
    737 
    738 	if (verbose) {
    739 		printf("fork event\n");
    740 		printf("... parent: %s/%d\n", parent->comm, parent->tid);
    741 		printf("...  child: %s/%d\n", child->comm, child->tid);
    742 	}
    743 
    744 	register_pid(sched, parent->tid, parent->comm);
    745 	register_pid(sched, child->tid, child->comm);
    746 	return 0;
    747 }
    748 
    749 struct sort_dimension {
    750 	const char		*name;
    751 	sort_fn_t		cmp;
    752 	struct list_head	list;
    753 };
    754 
    755 static int
    756 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
    757 {
    758 	struct sort_dimension *sort;
    759 	int ret = 0;
    760 
    761 	BUG_ON(list_empty(list));
    762 
    763 	list_for_each_entry(sort, list, list) {
    764 		ret = sort->cmp(l, r);
    765 		if (ret)
    766 			return ret;
    767 	}
    768 
    769 	return ret;
    770 }
    771 
    772 static struct work_atoms *
    773 thread_atoms_search(struct rb_root *root, struct thread *thread,
    774 			 struct list_head *sort_list)
    775 {
    776 	struct rb_node *node = root->rb_node;
    777 	struct work_atoms key = { .thread = thread };
    778 
    779 	while (node) {
    780 		struct work_atoms *atoms;
    781 		int cmp;
    782 
    783 		atoms = container_of(node, struct work_atoms, node);
    784 
    785 		cmp = thread_lat_cmp(sort_list, &key, atoms);
    786 		if (cmp > 0)
    787 			node = node->rb_left;
    788 		else if (cmp < 0)
    789 			node = node->rb_right;
    790 		else {
    791 			BUG_ON(thread != atoms->thread);
    792 			return atoms;
    793 		}
    794 	}
    795 	return NULL;
    796 }
    797 
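        /*
         * Insert a work_atoms node into the rb-tree, ordered by the sort keys
         * currently on sort_list.
         */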
    798 static void
    799 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
    800 			 struct list_head *sort_list)
    801 {
    802 	struct rb_node **new = &(root->rb_node), *parent = NULL;
    803 
    804 	while (*new) {
    805 		struct work_atoms *this;
    806 		int cmp;
    807 
    808 		this = container_of(*new, struct work_atoms, node);
    809 		parent = *new;
    810 
    811 		cmp = thread_lat_cmp(sort_list, data, this);
    812 
    813 		if (cmp > 0)
    814 			new = &((*new)->rb_left);
    815 		else
    816 			new = &((*new)->rb_right);
    817 	}
    818 
    819 	rb_link_node(&data->node, parent, new);
    820 	rb_insert_color(&data->node, root);
    821 }
    822 
    823 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
    824 {
    825 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
    826 	if (!atoms) {
    827 		pr_err("No memory at %s\n", __func__);
    828 		return -1;
    829 	}
    830 
    831 	atoms->thread = thread;
    832 	INIT_LIST_HEAD(&atoms->work_list);
    833 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
    834 	return 0;
    835 }
    836 
    837 static char sched_out_state(u64 prev_state)
    838 {
    839 	const char *str = TASK_STATE_TO_CHAR_STR;
    840 
    841 	return str[prev_state];
    842 }
    843 
    844 static int
    845 add_sched_out_event(struct work_atoms *atoms,
    846 		    char run_state,
    847 		    u64 timestamp)
    848 {
    849 	struct work_atom *atom = zalloc(sizeof(*atom));
    850 	if (!atom) {
    851 		pr_err("No memory at %s\n", __func__);
    852 		return -1;
    853 	}
    854 
    855 	atom->sched_out_time = timestamp;
    856 
    857 	if (run_state == 'R') {
    858 		atom->state = THREAD_WAIT_CPU;
    859 		atom->wake_up_time = atom->sched_out_time;
    860 	}
    861 
    862 	list_add_tail(&atom->list, &atoms->work_list);
    863 	return 0;
    864 }
    865 
    866 static void
    867 add_runtime_event(struct work_atoms *atoms, u64 delta,
    868 		  u64 timestamp __maybe_unused)
    869 {
    870 	struct work_atom *atom;
    871 
    872 	BUG_ON(list_empty(&atoms->work_list));
    873 
    874 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
    875 
    876 	atom->runtime += delta;
    877 	atoms->total_runtime += delta;
    878 }
    879 
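        /*
         * Complete the newest atom when its thread is scheduled in: the delta
         * between wake-up and sched-in is the scheduling latency accumulated
         * in total_lat and tracked as max_lat/max_lat_at.
         */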
    880 static void
    881 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
    882 {
    883 	struct work_atom *atom;
    884 	u64 delta;
    885 
    886 	if (list_empty(&atoms->work_list))
    887 		return;
    888 
    889 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
    890 
    891 	if (atom->state != THREAD_WAIT_CPU)
    892 		return;
    893 
    894 	if (timestamp < atom->wake_up_time) {
    895 		atom->state = THREAD_IGNORE;
    896 		return;
    897 	}
    898 
    899 	atom->state = THREAD_SCHED_IN;
    900 	atom->sched_in_time = timestamp;
    901 
    902 	delta = atom->sched_in_time - atom->wake_up_time;
    903 	atoms->total_lat += delta;
    904 	if (delta > atoms->max_lat) {
    905 		atoms->max_lat = delta;
    906 		atoms->max_lat_at = timestamp;
    907 	}
    908 	atoms->nb_atoms++;
    909 }
    910 
    911 static int latency_switch_event(struct perf_sched *sched,
    912 				struct perf_evsel *evsel,
    913 				struct perf_sample *sample,
    914 				struct machine *machine)
    915 {
    916 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
    917 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
    918 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
    919 	struct work_atoms *out_events, *in_events;
    920 	struct thread *sched_out, *sched_in;
    921 	u64 timestamp0, timestamp = sample->time;
    922 	int cpu = sample->cpu;
    923 	s64 delta;
    924 
    925 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
    926 
    927 	timestamp0 = sched->cpu_last_switched[cpu];
    928 	sched->cpu_last_switched[cpu] = timestamp;
    929 	if (timestamp0)
    930 		delta = timestamp - timestamp0;
    931 	else
    932 		delta = 0;
    933 
    934 	if (delta < 0) {
    935 		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
    936 		return -1;
    937 	}
    938 
    939 	sched_out = machine__findnew_thread(machine, 0, prev_pid);
    940 	sched_in = machine__findnew_thread(machine, 0, next_pid);
    941 
    942 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
    943 	if (!out_events) {
    944 		if (thread_atoms_insert(sched, sched_out))
    945 			return -1;
    946 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
    947 		if (!out_events) {
    948 			pr_err("out-event: Internal tree error");
    949 			return -1;
    950 		}
    951 	}
    952 	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
    953 		return -1;
    954 
    955 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
    956 	if (!in_events) {
    957 		if (thread_atoms_insert(sched, sched_in))
    958 			return -1;
    959 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
    960 		if (!in_events) {
    961 			pr_err("in-event: Internal tree error");
    962 			return -1;
    963 		}
    964 		/*
    965 		 * Task came in that we have not heard about yet;
    966 		 * add an initial atom in runnable state:
    967 		 */
    968 		if (add_sched_out_event(in_events, 'R', timestamp))
    969 			return -1;
    970 	}
    971 	add_sched_in_event(in_events, timestamp);
    972 
    973 	return 0;
    974 }
    975 
    976 static int latency_runtime_event(struct perf_sched *sched,
    977 				 struct perf_evsel *evsel,
    978 				 struct perf_sample *sample,
    979 				 struct machine *machine)
    980 {
    981 	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
    982 	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
    983 	struct thread *thread = machine__findnew_thread(machine, 0, pid);
    984 	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
    985 	u64 timestamp = sample->time;
    986 	int cpu = sample->cpu;
    987 
    988 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
    989 	if (!atoms) {
    990 		if (thread_atoms_insert(sched, thread))
    991 			return -1;
    992 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
    993 		if (!atoms) {
    994 			pr_err("in-event: Internal tree error");
    995 			return -1;
    996 		}
    997 		if (add_sched_out_event(atoms, 'R', timestamp))
    998 			return -1;
    999 	}
   1000 
   1001 	add_runtime_event(atoms, runtime, timestamp);
   1002 	return 0;
   1003 }
   1004 
   1005 static int latency_wakeup_event(struct perf_sched *sched,
   1006 				struct perf_evsel *evsel,
   1007 				struct perf_sample *sample,
   1008 				struct machine *machine)
   1009 {
   1010 	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid"),
   1011 		  success = perf_evsel__intval(evsel, sample, "success");
   1012 	struct work_atoms *atoms;
   1013 	struct work_atom *atom;
   1014 	struct thread *wakee;
   1015 	u64 timestamp = sample->time;
   1016 
   1017 	/* Note for later, it may be interesting to observe the failing cases */
   1018 	if (!success)
   1019 		return 0;
   1020 
   1021 	wakee = machine__findnew_thread(machine, 0, pid);
   1022 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
   1023 	if (!atoms) {
   1024 		if (thread_atoms_insert(sched, wakee))
   1025 			return -1;
   1026 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
   1027 		if (!atoms) {
   1028 			pr_err("wakeup-event: Internal tree error");
   1029 			return -1;
   1030 		}
   1031 		if (add_sched_out_event(atoms, 'S', timestamp))
   1032 			return -1;
   1033 	}
   1034 
   1035 	BUG_ON(list_empty(&atoms->work_list));
   1036 
   1037 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1038 
   1039 	/*
   1040 	 * You WILL be missing events if you've recorded only
   1041 	 * one CPU, or are looking at only one, so don't
   1042 	 * make useless noise.
   1043 	 */
   1044 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
   1045 		sched->nr_state_machine_bugs++;
   1046 
   1047 	sched->nr_timestamps++;
   1048 	if (atom->sched_out_time > timestamp) {
   1049 		sched->nr_unordered_timestamps++;
   1050 		return 0;
   1051 	}
   1052 
   1053 	atom->state = THREAD_WAIT_CPU;
   1054 	atom->wake_up_time = timestamp;
   1055 	return 0;
   1056 }
   1057 
   1058 static int latency_migrate_task_event(struct perf_sched *sched,
   1059 				      struct perf_evsel *evsel,
   1060 				      struct perf_sample *sample,
   1061 				      struct machine *machine)
   1062 {
   1063 	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
   1064 	u64 timestamp = sample->time;
   1065 	struct work_atoms *atoms;
   1066 	struct work_atom *atom;
   1067 	struct thread *migrant;
   1068 
   1069 	/*
   1070 	 * Only need to worry about migration when profiling one CPU.
   1071 	 */
   1072 	if (sched->profile_cpu == -1)
   1073 		return 0;
   1074 
   1075 	migrant = machine__findnew_thread(machine, 0, pid);
   1076 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
   1077 	if (!atoms) {
   1078 		if (thread_atoms_insert(sched, migrant))
   1079 			return -1;
   1080 		register_pid(sched, migrant->tid, migrant->comm);
   1081 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
   1082 		if (!atoms) {
   1083 			pr_err("migration-event: Internal tree error");
   1084 			return -1;
   1085 		}
   1086 		if (add_sched_out_event(atoms, 'R', timestamp))
   1087 			return -1;
   1088 	}
   1089 
   1090 	BUG_ON(list_empty(&atoms->work_list));
   1091 
   1092 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1093 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
   1094 
   1095 	sched->nr_timestamps++;
   1096 
   1097 	if (atom->sched_out_time > timestamp)
   1098 		sched->nr_unordered_timestamps++;
   1099 
   1100 	return 0;
   1101 }
   1102 
   1103 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
   1104 {
   1105 	int i;
   1106 	int ret;
   1107 	u64 avg;
   1108 
   1109 	if (!work_list->nb_atoms)
   1110 		return;
   1111 	/*
   1112 	 * Ignore idle threads:
   1113 	 */
   1114 	if (!strcmp(work_list->thread->comm, "swapper"))
   1115 		return;
   1116 
   1117 	sched->all_runtime += work_list->total_runtime;
   1118 	sched->all_count   += work_list->nb_atoms;
   1119 
   1120 	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->tid);
   1121 
   1122 	for (i = 0; i < 24 - ret; i++)
   1123 		printf(" ");
   1124 
   1125 	avg = work_list->total_lat / work_list->nb_atoms;
   1126 
   1127 	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
   1128 	      (double)work_list->total_runtime / 1e6,
   1129 		 work_list->nb_atoms, (double)avg / 1e6,
   1130 		 (double)work_list->max_lat / 1e6,
   1131 		 (double)work_list->max_lat_at / 1e9);
   1132 }
   1133 
   1134 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
   1135 {
   1136 	if (l->thread->tid < r->thread->tid)
   1137 		return -1;
   1138 	if (l->thread->tid > r->thread->tid)
   1139 		return 1;
   1140 
   1141 	return 0;
   1142 }
   1143 
   1144 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
   1145 {
   1146 	u64 avgl, avgr;
   1147 
   1148 	if (!l->nb_atoms)
   1149 		return -1;
   1150 
   1151 	if (!r->nb_atoms)
   1152 		return 1;
   1153 
   1154 	avgl = l->total_lat / l->nb_atoms;
   1155 	avgr = r->total_lat / r->nb_atoms;
   1156 
   1157 	if (avgl < avgr)
   1158 		return -1;
   1159 	if (avgl > avgr)
   1160 		return 1;
   1161 
   1162 	return 0;
   1163 }
   1164 
   1165 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
   1166 {
   1167 	if (l->max_lat < r->max_lat)
   1168 		return -1;
   1169 	if (l->max_lat > r->max_lat)
   1170 		return 1;
   1171 
   1172 	return 0;
   1173 }
   1174 
   1175 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
   1176 {
   1177 	if (l->nb_atoms < r->nb_atoms)
   1178 		return -1;
   1179 	if (l->nb_atoms > r->nb_atoms)
   1180 		return 1;
   1181 
   1182 	return 0;
   1183 }
   1184 
   1185 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
   1186 {
   1187 	if (l->total_runtime < r->total_runtime)
   1188 		return -1;
   1189 	if (l->total_runtime > r->total_runtime)
   1190 		return 1;
   1191 
   1192 	return 0;
   1193 }
   1194 
   1195 static int sort_dimension__add(const char *tok, struct list_head *list)
   1196 {
   1197 	size_t i;
   1198 	static struct sort_dimension avg_sort_dimension = {
   1199 		.name = "avg",
   1200 		.cmp  = avg_cmp,
   1201 	};
   1202 	static struct sort_dimension max_sort_dimension = {
   1203 		.name = "max",
   1204 		.cmp  = max_cmp,
   1205 	};
   1206 	static struct sort_dimension pid_sort_dimension = {
   1207 		.name = "pid",
   1208 		.cmp  = pid_cmp,
   1209 	};
   1210 	static struct sort_dimension runtime_sort_dimension = {
   1211 		.name = "runtime",
   1212 		.cmp  = runtime_cmp,
   1213 	};
   1214 	static struct sort_dimension switch_sort_dimension = {
   1215 		.name = "switch",
   1216 		.cmp  = switch_cmp,
   1217 	};
   1218 	struct sort_dimension *available_sorts[] = {
   1219 		&pid_sort_dimension,
   1220 		&avg_sort_dimension,
   1221 		&max_sort_dimension,
   1222 		&switch_sort_dimension,
   1223 		&runtime_sort_dimension,
   1224 	};
   1225 
   1226 	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
   1227 		if (!strcmp(available_sorts[i]->name, tok)) {
   1228 			list_add_tail(&available_sorts[i]->list, list);
   1229 
   1230 			return 0;
   1231 		}
   1232 	}
   1233 
   1234 	return -1;
   1235 }
   1236 
   1237 static void perf_sched__sort_lat(struct perf_sched *sched)
   1238 {
   1239 	struct rb_node *node;
   1240 
   1241 	for (;;) {
   1242 		struct work_atoms *data;
   1243 		node = rb_first(&sched->atom_root);
   1244 		if (!node)
   1245 			break;
   1246 
   1247 		rb_erase(node, &sched->atom_root);
   1248 		data = rb_entry(node, struct work_atoms, node);
   1249 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
   1250 	}
   1251 }
   1252 
   1253 static int process_sched_wakeup_event(struct perf_tool *tool,
   1254 				      struct perf_evsel *evsel,
   1255 				      struct perf_sample *sample,
   1256 				      struct machine *machine)
   1257 {
   1258 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1259 
   1260 	if (sched->tp_handler->wakeup_event)
   1261 		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
   1262 
   1263 	return 0;
   1264 }
   1265 
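        /*
         * 'perf sched map': assign each newly seen thread a two-character
         * shortname and print one row per context switch showing which task
         * is running on every CPU.
         */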
   1266 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
   1267 			    struct perf_sample *sample, struct machine *machine)
   1268 {
   1269 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
   1270 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
   1271 	struct thread *sched_out __maybe_unused, *sched_in;
   1272 	int new_shortname;
   1273 	u64 timestamp0, timestamp = sample->time;
   1274 	s64 delta;
   1275 	int cpu, this_cpu = sample->cpu;
   1276 
   1277 	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
   1278 
   1279 	if (this_cpu > sched->max_cpu)
   1280 		sched->max_cpu = this_cpu;
   1281 
   1282 	timestamp0 = sched->cpu_last_switched[this_cpu];
   1283 	sched->cpu_last_switched[this_cpu] = timestamp;
   1284 	if (timestamp0)
   1285 		delta = timestamp - timestamp0;
   1286 	else
   1287 		delta = 0;
   1288 
   1289 	if (delta < 0) {
   1290 		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
   1291 		return -1;
   1292 	}
   1293 
   1294 	sched_out = machine__findnew_thread(machine, 0, prev_pid);
   1295 	sched_in = machine__findnew_thread(machine, 0, next_pid);
   1296 
   1297 	sched->curr_thread[this_cpu] = sched_in;
   1298 
   1299 	printf("  ");
   1300 
   1301 	new_shortname = 0;
   1302 	if (!sched_in->shortname[0]) {
   1303 		sched_in->shortname[0] = sched->next_shortname1;
   1304 		sched_in->shortname[1] = sched->next_shortname2;
   1305 
   1306 		if (sched->next_shortname1 < 'Z') {
   1307 			sched->next_shortname1++;
   1308 		} else {
   1309 			sched->next_shortname1='A';
   1310 			if (sched->next_shortname2 < '9') {
   1311 				sched->next_shortname2++;
   1312 			} else {
   1313 				sched->next_shortname2='0';
   1314 			}
   1315 		}
   1316 		new_shortname = 1;
   1317 	}
   1318 
   1319 	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
   1320 		if (cpu != this_cpu)
   1321 			printf(" ");
   1322 		else
   1323 			printf("*");
   1324 
   1325 		if (sched->curr_thread[cpu]) {
   1326 			if (sched->curr_thread[cpu]->tid)
   1327 				printf("%2s ", sched->curr_thread[cpu]->shortname);
   1328 			else
   1329 				printf(".  ");
   1330 		} else
   1331 			printf("   ");
   1332 	}
   1333 
   1334 	printf("  %12.6f secs ", (double)timestamp/1e9);
   1335 	if (new_shortname) {
   1336 		printf("%s => %s:%d\n",
   1337 			sched_in->shortname, sched_in->comm, sched_in->tid);
   1338 	} else {
   1339 		printf("\n");
   1340 	}
   1341 
   1342 	return 0;
   1343 }
   1344 
   1345 static int process_sched_switch_event(struct perf_tool *tool,
   1346 				      struct perf_evsel *evsel,
   1347 				      struct perf_sample *sample,
   1348 				      struct machine *machine)
   1349 {
   1350 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1351 	int this_cpu = sample->cpu, err = 0;
   1352 	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
   1353 	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
   1354 
   1355 	if (sched->curr_pid[this_cpu] != (u32)-1) {
   1356 		/*
   1357 		 * Are we trying to switch away a PID that is
   1358 		 * not current?
   1359 		 */
   1360 		if (sched->curr_pid[this_cpu] != prev_pid)
   1361 			sched->nr_context_switch_bugs++;
   1362 	}
   1363 
   1364 	if (sched->tp_handler->switch_event)
   1365 		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
   1366 
   1367 	sched->curr_pid[this_cpu] = next_pid;
   1368 	return err;
   1369 }
   1370 
   1371 static int process_sched_runtime_event(struct perf_tool *tool,
   1372 				       struct perf_evsel *evsel,
   1373 				       struct perf_sample *sample,
   1374 				       struct machine *machine)
   1375 {
   1376 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1377 
   1378 	if (sched->tp_handler->runtime_event)
   1379 		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
   1380 
   1381 	return 0;
   1382 }
   1383 
   1384 static int perf_sched__process_fork_event(struct perf_tool *tool,
   1385 					  union perf_event *event,
   1386 					  struct perf_sample *sample,
   1387 					  struct machine *machine)
   1388 {
   1389 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1390 
   1391 	/* run the fork event through the perf machinery */
   1392 	perf_event__process_fork(tool, event, sample, machine);
   1393 
   1394 	/* and then run additional processing needed for this command */
   1395 	if (sched->tp_handler->fork_event)
   1396 		return sched->tp_handler->fork_event(sched, event, machine);
   1397 
   1398 	return 0;
   1399 }
   1400 
   1401 static int process_sched_migrate_task_event(struct perf_tool *tool,
   1402 					    struct perf_evsel *evsel,
   1403 					    struct perf_sample *sample,
   1404 					    struct machine *machine)
   1405 {
   1406 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1407 
   1408 	if (sched->tp_handler->migrate_task_event)
   1409 		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
   1410 
   1411 	return 0;
   1412 }
   1413 
   1414 typedef int (*tracepoint_handler)(struct perf_tool *tool,
   1415 				  struct perf_evsel *evsel,
   1416 				  struct perf_sample *sample,
   1417 				  struct machine *machine);
   1418 
   1419 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
   1420 						 union perf_event *event __maybe_unused,
   1421 						 struct perf_sample *sample,
   1422 						 struct perf_evsel *evsel,
   1423 						 struct machine *machine)
   1424 {
   1425 	int err = 0;
   1426 
   1427 	evsel->hists.stats.total_period += sample->period;
   1428 	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
   1429 
   1430 	if (evsel->handler.func != NULL) {
   1431 		tracepoint_handler f = evsel->handler.func;
   1432 		err = f(tool, evsel, sample, machine);
   1433 	}
   1434 
   1435 	return err;
   1436 }
   1437 
   1438 static int perf_sched__read_events(struct perf_sched *sched,
   1439 				   struct perf_session **psession)
   1440 {
   1441 	const struct perf_evsel_str_handler handlers[] = {
   1442 		{ "sched:sched_switch",	      process_sched_switch_event, },
   1443 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
   1444 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
   1445 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
   1446 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
   1447 	};
   1448 	struct perf_session *session;
   1449 
   1450 	session = perf_session__new(input_name, O_RDONLY, 0, false, &sched->tool);
   1451 	if (session == NULL) {
   1452 		pr_debug("No memory for session\n");
   1453 		return -1;
   1454 	}
   1455 
   1456 	if (perf_session__set_tracepoints_handlers(session, handlers))
   1457 		goto out_delete;
   1458 
   1459 	if (perf_session__has_traces(session, "record -R")) {
   1460 		int err = perf_session__process_events(session, &sched->tool);
   1461 		if (err) {
   1462 			pr_err("Failed to process events, error %d", err);
   1463 			goto out_delete;
   1464 		}
   1465 
   1466 		sched->nr_events      = session->stats.nr_events[0];
   1467 		sched->nr_lost_events = session->stats.total_lost;
   1468 		sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
   1469 	}
   1470 
   1471 	if (psession)
   1472 		*psession = session;
   1473 	else
   1474 		perf_session__delete(session);
   1475 
   1476 	return 0;
   1477 
   1478 out_delete:
   1479 	perf_session__delete(session);
   1480 	return -1;
   1481 }
   1482 
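        /*
         * Summarize anomalies noticed while processing the trace: unordered
         * timestamps, lost events, and state machine / context switch bugs.
         */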
   1483 static void print_bad_events(struct perf_sched *sched)
   1484 {
   1485 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
   1486 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
   1487 			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
   1488 			sched->nr_unordered_timestamps, sched->nr_timestamps);
   1489 	}
   1490 	if (sched->nr_lost_events && sched->nr_events) {
   1491 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
   1492 			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
   1493 			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
   1494 	}
   1495 	if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
   1496 		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
   1497 			(double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
   1498 			sched->nr_state_machine_bugs, sched->nr_timestamps);
   1499 		if (sched->nr_lost_events)
   1500 			printf(" (due to lost events?)");
   1501 		printf("\n");
   1502 	}
   1503 	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
   1504 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
   1505 			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
   1506 			sched->nr_context_switch_bugs, sched->nr_timestamps);
   1507 		if (sched->nr_lost_events)
   1508 			printf(" (due to lost events?)");
   1509 		printf("\n");
   1510 	}
   1511 }
   1512 
   1513 static int perf_sched__lat(struct perf_sched *sched)
   1514 {
   1515 	struct rb_node *next;
   1516 	struct perf_session *session;
   1517 
   1518 	setup_pager();
   1519 
   1520 	/* save session -- references to threads are held in work_list */
   1521 	if (perf_sched__read_events(sched, &session))
   1522 		return -1;
   1523 
   1524 	perf_sched__sort_lat(sched);
   1525 
   1526 	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
   1527 	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
   1528 	printf(" ---------------------------------------------------------------------------------------------------------------\n");
   1529 
   1530 	next = rb_first(&sched->sorted_atom_root);
   1531 
   1532 	while (next) {
   1533 		struct work_atoms *work_list;
   1534 
   1535 		work_list = rb_entry(next, struct work_atoms, node);
   1536 		output_lat_thread(sched, work_list);
   1537 		next = rb_next(next);
   1538 	}
   1539 
   1540 	printf(" -----------------------------------------------------------------------------------------\n");
   1541 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
   1542 		(double)sched->all_runtime / 1e6, sched->all_count);
   1543 
   1544 	printf(" ---------------------------------------------------\n");
   1545 
   1546 	print_bad_events(sched);
   1547 	printf("\n");
   1548 
   1549 	perf_session__delete(session);
   1550 	return 0;
   1551 }
   1552 
   1553 static int perf_sched__map(struct perf_sched *sched)
   1554 {
   1555 	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
   1556 
   1557 	setup_pager();
   1558 	if (perf_sched__read_events(sched, NULL))
   1559 		return -1;
   1560 	print_bad_events(sched);
   1561 	return 0;
   1562 }
   1563 
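        /*
         * 'perf sched replay': calibrate measurement overheads, rebuild the
         * recorded tasks and their atoms from the trace, then spawn one thread
         * per task and re-run the workload replay_repeat times.
         */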
   1564 static int perf_sched__replay(struct perf_sched *sched)
   1565 {
   1566 	unsigned long i;
   1567 
   1568 	calibrate_run_measurement_overhead(sched);
   1569 	calibrate_sleep_measurement_overhead(sched);
   1570 
   1571 	test_calibrations(sched);
   1572 
   1573 	if (perf_sched__read_events(sched, NULL))
   1574 		return -1;
   1575 
   1576 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
   1577 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
   1578 	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
   1579 
   1580 	if (sched->targetless_wakeups)
   1581 		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
   1582 	if (sched->multitarget_wakeups)
   1583 		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
   1584 	if (sched->nr_run_events_optimized)
   1585 		printf("run atoms optimized: %ld\n",
   1586 			sched->nr_run_events_optimized);
   1587 
   1588 	print_task_traces(sched);
   1589 	add_cross_task_wakeups(sched);
   1590 
   1591 	create_tasks(sched);
   1592 	printf("------------------------------------------------------------\n");
   1593 	for (i = 0; i < sched->replay_repeat; i++)
   1594 		run_one_test(sched);
   1595 
   1596 	return 0;
   1597 }
   1598 
   1599 static void setup_sorting(struct perf_sched *sched, const struct option *options,
   1600 			  const char * const usage_msg[])
   1601 {
   1602 	char *tmp, *tok, *str = strdup(sched->sort_order);
   1603 
   1604 	for (tok = strtok_r(str, ", ", &tmp);
   1605 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
   1606 		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
   1607 			error("Unknown --sort key: `%s'", tok);
   1608 			usage_with_options(usage_msg, options);
   1609 		}
   1610 	}
   1611 
   1612 	free(str);
   1613 
   1614 	sort_dimension__add("pid", &sched->cmp_pid);
   1615 }
   1616 
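        /*
         * 'perf sched record': forward to 'perf record' with the scheduler
         * tracepoints and any extra user arguments appended.
         */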
   1617 static int __cmd_record(int argc, const char **argv)
   1618 {
   1619 	unsigned int rec_argc, i, j;
   1620 	const char **rec_argv;
   1621 	const char * const record_args[] = {
   1622 		"record",
   1623 		"-a",
   1624 		"-R",
   1625 		"-m", "1024",
   1626 		"-c", "1",
   1627 		"-e", "sched:sched_switch",
   1628 		"-e", "sched:sched_stat_wait",
   1629 		"-e", "sched:sched_stat_sleep",
   1630 		"-e", "sched:sched_stat_iowait",
   1631 		"-e", "sched:sched_stat_runtime",
   1632 		"-e", "sched:sched_process_fork",
   1633 		"-e", "sched:sched_wakeup",
   1634 		"-e", "sched:sched_migrate_task",
   1635 	};
   1636 
   1637 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
   1638 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
   1639 
   1640 	if (rec_argv == NULL)
   1641 		return -ENOMEM;
   1642 
   1643 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
   1644 		rec_argv[i] = strdup(record_args[i]);
   1645 
   1646 	for (j = 1; j < (unsigned int)argc; j++, i++)
   1647 		rec_argv[i] = argv[j];
   1648 
   1649 	BUG_ON(i != rec_argc);
   1650 
   1651 	return cmd_record(i, rec_argv, NULL);
   1652 }
   1653 
   1654 static const char default_sort_order[] = "avg, max, switch, runtime";
   1655 static struct perf_sched sched = {
   1656 	.tool = {
   1657 		.sample		 = perf_sched__process_tracepoint_sample,
   1658 		.comm		 = perf_event__process_comm,
   1659 		.lost		 = perf_event__process_lost,
   1660 		.fork		 = perf_sched__process_fork_event,
   1661 		.ordered_samples = true,
   1662 	},
   1663 	.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
   1664 	.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
   1665 	.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
   1666 	.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
   1667 	.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },
   1668 	.sort_order	      = default_sort_order,
   1669 	.replay_repeat	      = 10,
   1670 	.profile_cpu	      = -1,
   1671 	.next_shortname1      = 'A',
   1672 	.next_shortname2      = '0',
   1673 };
   1674 
   1675 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
   1676 {
   1677 	const struct option latency_options[] = {
   1678 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
   1679 		   "sort by key(s): runtime, switch, avg, max"),
   1680 	OPT_INCR('v', "verbose", &verbose,
   1681 		    "be more verbose (show symbol address, etc)"),
   1682 	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
   1683 		    "CPU to profile on"),
   1684 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   1685 		    "dump raw trace in ASCII"),
   1686 	OPT_END()
   1687 	};
   1688 	const struct option replay_options[] = {
   1689 	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
   1690 		     "repeat the workload replay N times (-1: infinite)"),
   1691 	OPT_INCR('v', "verbose", &verbose,
   1692 		    "be more verbose (show symbol address, etc)"),
   1693 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   1694 		    "dump raw trace in ASCII"),
   1695 	OPT_END()
   1696 	};
   1697 	const struct option sched_options[] = {
   1698 	OPT_STRING('i', "input", &input_name, "file",
   1699 		    "input file name"),
   1700 	OPT_INCR('v', "verbose", &verbose,
   1701 		    "be more verbose (show symbol address, etc)"),
   1702 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   1703 		    "dump raw trace in ASCII"),
   1704 	OPT_END()
   1705 	};
   1706 	const char * const latency_usage[] = {
   1707 		"perf sched latency [<options>]",
   1708 		NULL
   1709 	};
   1710 	const char * const replay_usage[] = {
   1711 		"perf sched replay [<options>]",
   1712 		NULL
   1713 	};
   1714 	const char * const sched_usage[] = {
   1715 		"perf sched [<options>] {record|latency|map|replay|script}",
   1716 		NULL
   1717 	};
   1718 	struct trace_sched_handler lat_ops  = {
   1719 		.wakeup_event	    = latency_wakeup_event,
   1720 		.switch_event	    = latency_switch_event,
   1721 		.runtime_event	    = latency_runtime_event,
   1722 		.migrate_task_event = latency_migrate_task_event,
   1723 	};
   1724 	struct trace_sched_handler map_ops  = {
   1725 		.switch_event	    = map_switch_event,
   1726 	};
   1727 	struct trace_sched_handler replay_ops  = {
   1728 		.wakeup_event	    = replay_wakeup_event,
   1729 		.switch_event	    = replay_switch_event,
   1730 		.fork_event	    = replay_fork_event,
   1731 	};
   1732 
   1733 	argc = parse_options(argc, argv, sched_options, sched_usage,
   1734 			     PARSE_OPT_STOP_AT_NON_OPTION);
   1735 	if (!argc)
   1736 		usage_with_options(sched_usage, sched_options);
   1737 
   1738 	/*
   1739 	 * Aliased to 'perf script' for now:
   1740 	 */
   1741 	if (!strcmp(argv[0], "script"))
   1742 		return cmd_script(argc, argv, prefix);
   1743 
   1744 	symbol__init();
   1745 	if (!strncmp(argv[0], "rec", 3)) {
   1746 		return __cmd_record(argc, argv);
   1747 	} else if (!strncmp(argv[0], "lat", 3)) {
   1748 		sched.tp_handler = &lat_ops;
   1749 		if (argc > 1) {
   1750 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
   1751 			if (argc)
   1752 				usage_with_options(latency_usage, latency_options);
   1753 		}
   1754 		setup_sorting(&sched, latency_options, latency_usage);
   1755 		return perf_sched__lat(&sched);
   1756 	} else if (!strcmp(argv[0], "map")) {
   1757 		sched.tp_handler = &map_ops;
   1758 		setup_sorting(&sched, latency_options, latency_usage);
   1759 		return perf_sched__map(&sched);
   1760 	} else if (!strncmp(argv[0], "rep", 3)) {
   1761 		sched.tp_handler = &replay_ops;
   1762 		if (argc) {
   1763 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
   1764 			if (argc)
   1765 				usage_with_options(replay_usage, replay_options);
   1766 		}
   1767 		return perf_sched__replay(&sched);
   1768 	} else {
   1769 		usage_with_options(sched_usage, sched_options);
   1770 	}
   1771 
   1772 	return 0;
   1773 }
   1774