      1 #include "builtin.h"
      2 #include "perf.h"
      3 
      4 #include "util/util.h"
      5 #include "util/evlist.h"
      6 #include "util/cache.h"
      7 #include "util/evsel.h"
      8 #include "util/symbol.h"
      9 #include "util/thread.h"
     10 #include "util/header.h"
     11 #include "util/session.h"
     12 #include "util/tool.h"
     13 
     14 #include "util/parse-options.h"
     15 #include "util/trace-event.h"
     16 
     17 #include "util/debug.h"
     18 
     19 #include <sys/prctl.h>
     20 #include <sys/resource.h>
     21 
     22 #include <semaphore.h>
     23 #include <pthread.h>
     24 #include <math.h>
     25 
     26 #define PR_SET_NAME		15               /* Set process name */
     27 #define MAX_CPUS		4096
     28 #define COMM_LEN		20
     29 #define SYM_LEN			129
     30 #define MAX_PID			65536
     31 
     32 struct sched_atom;
     33 
     34 struct task_desc {
     35 	unsigned long		nr;
     36 	unsigned long		pid;
     37 	char			comm[COMM_LEN];
     38 
     39 	unsigned long		nr_events;
     40 	unsigned long		curr_event;
     41 	struct sched_atom	**atoms;
     42 
     43 	pthread_t		thread;
     44 	sem_t			sleep_sem;
     45 
     46 	sem_t			ready_for_work;
     47 	sem_t			work_done_sem;
     48 
     49 	u64			cpu_usage;
     50 };
     51 
     52 enum sched_event_type {
     53 	SCHED_EVENT_RUN,
     54 	SCHED_EVENT_SLEEP,
     55 	SCHED_EVENT_WAKEUP,
     56 	SCHED_EVENT_MIGRATION,
     57 };
     58 
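         /*
          * One recorded scheduling event (run, sleep, wakeup or migration),
          * timestamped and optionally tied to a wakee and its wait semaphore.
          */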
     59 struct sched_atom {
     60 	enum sched_event_type	type;
     61 	int			specific_wait;
     62 	u64			timestamp;
     63 	u64			duration;
     64 	unsigned long		nr;
     65 	sem_t			*wait_sem;
     66 	struct task_desc	*wakee;
     67 };
     68 
     69 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
     70 
     71 enum thread_state {
     72 	THREAD_SLEEPING = 0,
     73 	THREAD_WAIT_CPU,
     74 	THREAD_SCHED_IN,
     75 	THREAD_IGNORE
     76 };
     77 
     78 struct work_atom {
     79 	struct list_head	list;
     80 	enum thread_state	state;
     81 	u64			sched_out_time;
     82 	u64			wake_up_time;
     83 	u64			sched_in_time;
     84 	u64			runtime;
     85 };
     86 
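         /*
          * Per-thread latency accounting: a list of work atoms plus aggregate
          * runtime and latency stats, kept in an rb-tree ordered by the sort keys.
          */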
     87 struct work_atoms {
     88 	struct list_head	work_list;
     89 	struct thread		*thread;
     90 	struct rb_node		node;
     91 	u64			max_lat;
     92 	u64			max_lat_at;
     93 	u64			total_lat;
     94 	u64			nb_atoms;
     95 	u64			total_runtime;
     96 };
     97 
     98 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
     99 
    100 struct perf_sched;
    101 
    102 struct trace_sched_handler {
    103 	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
    104 			    struct perf_sample *sample, struct machine *machine);
    105 
    106 	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
    107 			     struct perf_sample *sample, struct machine *machine);
    108 
    109 	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
    110 			    struct perf_sample *sample, struct machine *machine);
    111 
    112 	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
    113 	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
    114 			  struct machine *machine);
    115 
    116 	int (*migrate_task_event)(struct perf_sched *sched,
    117 				  struct perf_evsel *evsel,
    118 				  struct perf_sample *sample,
    119 				  struct machine *machine);
    120 };
    121 
    122 struct perf_sched {
    123 	struct perf_tool tool;
    124 	const char	 *sort_order;
    125 	unsigned long	 nr_tasks;
    126 	struct task_desc *pid_to_task[MAX_PID];
    127 	struct task_desc **tasks;
    128 	const struct trace_sched_handler *tp_handler;
    129 	pthread_mutex_t	 start_work_mutex;
    130 	pthread_mutex_t	 work_done_wait_mutex;
    131 	int		 profile_cpu;
    132 /*
     133  * Track the current task - that way we can know whether there are any
    134  * weird events, such as a task being switched away that is not current.
    135  */
    136 	int		 max_cpu;
    137 	u32		 curr_pid[MAX_CPUS];
    138 	struct thread	 *curr_thread[MAX_CPUS];
    139 	char		 next_shortname1;
    140 	char		 next_shortname2;
    141 	unsigned int	 replay_repeat;
    142 	unsigned long	 nr_run_events;
    143 	unsigned long	 nr_sleep_events;
    144 	unsigned long	 nr_wakeup_events;
    145 	unsigned long	 nr_sleep_corrections;
    146 	unsigned long	 nr_run_events_optimized;
    147 	unsigned long	 targetless_wakeups;
    148 	unsigned long	 multitarget_wakeups;
    149 	unsigned long	 nr_runs;
    150 	unsigned long	 nr_timestamps;
    151 	unsigned long	 nr_unordered_timestamps;
    152 	unsigned long	 nr_state_machine_bugs;
    153 	unsigned long	 nr_context_switch_bugs;
    154 	unsigned long	 nr_events;
    155 	unsigned long	 nr_lost_chunks;
    156 	unsigned long	 nr_lost_events;
    157 	u64		 run_measurement_overhead;
    158 	u64		 sleep_measurement_overhead;
    159 	u64		 start_time;
    160 	u64		 cpu_usage;
    161 	u64		 runavg_cpu_usage;
    162 	u64		 parent_cpu_usage;
    163 	u64		 runavg_parent_cpu_usage;
    164 	u64		 sum_runtime;
    165 	u64		 sum_fluct;
    166 	u64		 run_avg;
    167 	u64		 all_runtime;
    168 	u64		 all_count;
    169 	u64		 cpu_last_switched[MAX_CPUS];
    170 	struct rb_root	 atom_root, sorted_atom_root;
    171 	struct list_head sort_list, cmp_pid;
    172 };
    173 
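         /* Monotonic timestamp in nanoseconds (stubbed out in this macOS port). */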
    174 static u64 get_nsecs(void)
    175 {
    176 #ifndef __APPLE__
    177 	struct timespec ts;
    178 
    179 	clock_gettime(CLOCK_MONOTONIC, &ts);
    180 
    181 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    182 #else
    183 	perror("get_nsecs not supported on MacOS");
    184 	return 0;
    185 #endif
    186 }
    187 
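         /*
          * Busy-loop for roughly @nsecs, compensating for the calibrated cost
          * of the get_nsecs() calls themselves.
          */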
    188 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
    189 {
    190 	u64 T0 = get_nsecs(), T1;
    191 
    192 	do {
    193 		T1 = get_nsecs();
    194 	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
    195 }
    196 
    197 static void sleep_nsecs(u64 nsecs)
    198 {
    199 	struct timespec ts;
    200 
     201 	ts.tv_nsec = nsecs % 1000000000;
     202 	ts.tv_sec = nsecs / 1000000000;
    203 
    204 	nanosleep(&ts, NULL);
    205 }
    206 
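         /*
          * Estimate the fixed cost of a zero-length burn_nsecs() call by
          * taking the minimum of ten samples.
          */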
    207 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
    208 {
    209 	u64 T0, T1, delta, min_delta = 1000000000ULL;
    210 	int i;
    211 
    212 	for (i = 0; i < 10; i++) {
    213 		T0 = get_nsecs();
    214 		burn_nsecs(sched, 0);
    215 		T1 = get_nsecs();
    216 		delta = T1-T0;
    217 		min_delta = min(min_delta, delta);
    218 	}
    219 	sched->run_measurement_overhead = min_delta;
    220 
    221 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
    222 }
    223 
    224 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
    225 {
    226 	u64 T0, T1, delta, min_delta = 1000000000ULL;
    227 	int i;
    228 
    229 	for (i = 0; i < 10; i++) {
    230 		T0 = get_nsecs();
    231 		sleep_nsecs(10000);
    232 		T1 = get_nsecs();
    233 		delta = T1-T0;
    234 		min_delta = min(min_delta, delta);
    235 	}
    236 	min_delta -= 10000;
    237 	sched->sleep_measurement_overhead = min_delta;
    238 
    239 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
    240 }
    241 
    242 static struct sched_atom *
    243 get_new_event(struct task_desc *task, u64 timestamp)
    244 {
    245 	struct sched_atom *event = zalloc(sizeof(*event));
    246 	unsigned long idx = task->nr_events;
    247 	size_t size;
    248 
    249 	event->timestamp = timestamp;
    250 	event->nr = idx;
    251 
    252 	task->nr_events++;
    253 	size = sizeof(struct sched_atom *) * task->nr_events;
    254 	task->atoms = realloc(task->atoms, size);
    255 	BUG_ON(!task->atoms);
    256 
    257 	task->atoms[idx] = event;
    258 
    259 	return event;
    260 }
    261 
    262 static struct sched_atom *last_event(struct task_desc *task)
    263 {
    264 	if (!task->nr_events)
    265 		return NULL;
    266 
    267 	return task->atoms[task->nr_events - 1];
    268 }
    269 
    270 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
    271 				u64 timestamp, u64 duration)
    272 {
    273 	struct sched_atom *event, *curr_event = last_event(task);
    274 
    275 	/*
    276 	 * optimize an existing RUN event by merging this one
    277 	 * to it:
    278 	 */
    279 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
    280 		sched->nr_run_events_optimized++;
    281 		curr_event->duration += duration;
    282 		return;
    283 	}
    284 
    285 	event = get_new_event(task, timestamp);
    286 
    287 	event->type = SCHED_EVENT_RUN;
    288 	event->duration = duration;
    289 
    290 	sched->nr_run_events++;
    291 }
    292 
    293 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
    294 				   u64 timestamp, struct task_desc *wakee)
    295 {
    296 	struct sched_atom *event, *wakee_event;
    297 
    298 	event = get_new_event(task, timestamp);
    299 	event->type = SCHED_EVENT_WAKEUP;
    300 	event->wakee = wakee;
    301 
    302 	wakee_event = last_event(wakee);
    303 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
    304 		sched->targetless_wakeups++;
    305 		return;
    306 	}
    307 	if (wakee_event->wait_sem) {
    308 		sched->multitarget_wakeups++;
    309 		return;
    310 	}
    311 
    312 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
    313 	sem_init(wakee_event->wait_sem, 0, 0);
    314 	wakee_event->specific_wait = 1;
    315 	event->wait_sem = wakee_event->wait_sem;
    316 
    317 	sched->nr_wakeup_events++;
    318 }
    319 
    320 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
    321 				  u64 timestamp, u64 task_state __maybe_unused)
    322 {
    323 	struct sched_atom *event = get_new_event(task, timestamp);
    324 
    325 	event->type = SCHED_EVENT_SLEEP;
    326 
    327 	sched->nr_sleep_events++;
    328 }
    329 
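         /*
          * Look up (or create) the replay task descriptor for @pid; a new task
          * starts with an implicit sleep event so later wakeups have a target.
          */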
    330 static struct task_desc *register_pid(struct perf_sched *sched,
    331 				      unsigned long pid, const char *comm)
    332 {
    333 	struct task_desc *task;
    334 
    335 	BUG_ON(pid >= MAX_PID);
    336 
    337 	task = sched->pid_to_task[pid];
    338 
    339 	if (task)
    340 		return task;
    341 
    342 	task = zalloc(sizeof(*task));
    343 	task->pid = pid;
    344 	task->nr = sched->nr_tasks;
    345 	strcpy(task->comm, comm);
    346 	/*
    347 	 * every task starts in sleeping state - this gets ignored
    348 	 * if there's no wakeup pointing to this sleep state:
    349 	 */
    350 	add_sched_event_sleep(sched, task, 0, 0);
    351 
    352 	sched->pid_to_task[pid] = task;
    353 	sched->nr_tasks++;
     354 	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
    355 	BUG_ON(!sched->tasks);
    356 	sched->tasks[task->nr] = task;
    357 
    358 	if (verbose)
    359 		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
    360 
    361 	return task;
    362 }
    363 
    364 
    365 static void print_task_traces(struct perf_sched *sched)
    366 {
    367 	struct task_desc *task;
    368 	unsigned long i;
    369 
    370 	for (i = 0; i < sched->nr_tasks; i++) {
    371 		task = sched->tasks[i];
    372 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
    373 			task->nr, task->comm, task->pid, task->nr_events);
    374 	}
    375 }
    376 
    377 static void add_cross_task_wakeups(struct perf_sched *sched)
    378 {
    379 	struct task_desc *task1, *task2;
    380 	unsigned long i, j;
    381 
    382 	for (i = 0; i < sched->nr_tasks; i++) {
    383 		task1 = sched->tasks[i];
    384 		j = i + 1;
    385 		if (j == sched->nr_tasks)
    386 			j = 0;
    387 		task2 = sched->tasks[j];
    388 		add_sched_event_wakeup(sched, task1, 0, task2);
    389 	}
    390 }
    391 
    392 static void perf_sched__process_event(struct perf_sched *sched,
    393 				      struct sched_atom *atom)
    394 {
    395 	int ret = 0;
    396 
    397 	switch (atom->type) {
    398 		case SCHED_EVENT_RUN:
    399 			burn_nsecs(sched, atom->duration);
    400 			break;
    401 		case SCHED_EVENT_SLEEP:
    402 			if (atom->wait_sem)
    403 				ret = sem_wait(atom->wait_sem);
    404 			BUG_ON(ret);
    405 			break;
    406 		case SCHED_EVENT_WAKEUP:
    407 			if (atom->wait_sem)
    408 				ret = sem_post(atom->wait_sem);
    409 			BUG_ON(ret);
    410 			break;
    411 		case SCHED_EVENT_MIGRATION:
    412 			break;
    413 		default:
    414 			BUG_ON(1);
    415 	}
    416 }
    417 
    418 static u64 get_cpu_usage_nsec_parent(void)
    419 {
    420 	struct rusage ru;
    421 	u64 sum;
    422 	int err;
    423 
    424 	err = getrusage(RUSAGE_SELF, &ru);
    425 	BUG_ON(err);
    426 
    427 	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
    428 	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
    429 
    430 	return sum;
    431 }
    432 
    433 static int self_open_counters(void)
    434 {
    435 	struct perf_event_attr attr;
    436 	int fd;
    437 
    438 	memset(&attr, 0, sizeof(attr));
    439 
    440 	attr.type = PERF_TYPE_SOFTWARE;
    441 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
    442 
    443 	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
    444 
    445 	if (fd < 0)
    446 		pr_err("Error: sys_perf_event_open() syscall returned "
    447 		       "with %d (%s)\n", fd, strerror(errno));
    448 	return fd;
    449 }
    450 
    451 static u64 get_cpu_usage_nsec_self(int fd)
    452 {
    453 	u64 runtime;
    454 	int ret;
    455 
    456 	ret = read(fd, &runtime, sizeof(runtime));
    457 	BUG_ON(ret != sizeof(runtime));
    458 
    459 	return runtime;
    460 }
    461 
    462 struct sched_thread_parms {
    463 	struct task_desc  *task;
    464 	struct perf_sched *sched;
    465 };
    466 
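         /*
          * Worker thread body for 'perf sched replay': wait for the start mutex,
          * replay this task's recorded atoms, report CPU usage via the
          * task-clock counter, then loop for the next repetition.
          */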
    467 static void *thread_func(void *ctx)
    468 {
    469 #ifndef __APPLE__
    470 	struct sched_thread_parms *parms = ctx;
    471 	struct task_desc *this_task = parms->task;
    472 	struct perf_sched *sched = parms->sched;
    473 	u64 cpu_usage_0, cpu_usage_1;
    474 	unsigned long i, ret;
    475 	char comm2[22];
    476 	int fd;
    477 
    478 	free(parms);
    479 
    480 	sprintf(comm2, ":%s", this_task->comm);
    481 	prctl(PR_SET_NAME, comm2);
    482 	fd = self_open_counters();
    483 	if (fd < 0)
    484 		return NULL;
    485 again:
    486 	ret = sem_post(&this_task->ready_for_work);
    487 	BUG_ON(ret);
    488 	ret = pthread_mutex_lock(&sched->start_work_mutex);
    489 	BUG_ON(ret);
    490 	ret = pthread_mutex_unlock(&sched->start_work_mutex);
    491 	BUG_ON(ret);
    492 
    493 	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
    494 
    495 	for (i = 0; i < this_task->nr_events; i++) {
    496 		this_task->curr_event = i;
    497 		perf_sched__process_event(sched, this_task->atoms[i]);
    498 	}
    499 
    500 	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
    501 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
    502 	ret = sem_post(&this_task->work_done_sem);
    503 	BUG_ON(ret);
    504 
    505 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
    506 	BUG_ON(ret);
    507 	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
    508 	BUG_ON(ret);
    509 
    510 	goto again;
    511 #else
    512 	perror("thread_func not supported on MacOS");
    513 	return NULL;
    514 #endif
    515 }
    516 
    517 static void create_tasks(struct perf_sched *sched)
    518 {
    519 	struct task_desc *task;
    520 	pthread_attr_t attr;
    521 	unsigned long i;
    522 	int err;
    523 
    524 	err = pthread_attr_init(&attr);
    525 	BUG_ON(err);
    526 	err = pthread_attr_setstacksize(&attr,
    527 			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
    528 	BUG_ON(err);
    529 	err = pthread_mutex_lock(&sched->start_work_mutex);
    530 	BUG_ON(err);
    531 	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
    532 	BUG_ON(err);
    533 	for (i = 0; i < sched->nr_tasks; i++) {
    534 		struct sched_thread_parms *parms = malloc(sizeof(*parms));
    535 		BUG_ON(parms == NULL);
    536 		parms->task = task = sched->tasks[i];
    537 		parms->sched = sched;
    538 		sem_init(&task->sleep_sem, 0, 0);
    539 		sem_init(&task->ready_for_work, 0, 0);
    540 		sem_init(&task->work_done_sem, 0, 0);
    541 		task->curr_event = 0;
    542 		err = pthread_create(&task->thread, &attr, thread_func, parms);
    543 		BUG_ON(err);
    544 	}
    545 }
    546 
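         /*
          * Run one replay round: release the workers, wait for each to finish,
          * and accumulate per-task and parent CPU usage as running averages.
          */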
    547 static void wait_for_tasks(struct perf_sched *sched)
    548 {
    549 	u64 cpu_usage_0, cpu_usage_1;
    550 	struct task_desc *task;
    551 	unsigned long i, ret;
    552 
    553 	sched->start_time = get_nsecs();
    554 	sched->cpu_usage = 0;
    555 	pthread_mutex_unlock(&sched->work_done_wait_mutex);
    556 
    557 	for (i = 0; i < sched->nr_tasks; i++) {
    558 		task = sched->tasks[i];
    559 		ret = sem_wait(&task->ready_for_work);
    560 		BUG_ON(ret);
    561 		sem_init(&task->ready_for_work, 0, 0);
    562 	}
    563 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
    564 	BUG_ON(ret);
    565 
    566 	cpu_usage_0 = get_cpu_usage_nsec_parent();
    567 
    568 	pthread_mutex_unlock(&sched->start_work_mutex);
    569 
    570 	for (i = 0; i < sched->nr_tasks; i++) {
    571 		task = sched->tasks[i];
    572 		ret = sem_wait(&task->work_done_sem);
    573 		BUG_ON(ret);
    574 		sem_init(&task->work_done_sem, 0, 0);
    575 		sched->cpu_usage += task->cpu_usage;
    576 		task->cpu_usage = 0;
    577 	}
    578 
    579 	cpu_usage_1 = get_cpu_usage_nsec_parent();
    580 	if (!sched->runavg_cpu_usage)
    581 		sched->runavg_cpu_usage = sched->cpu_usage;
    582 	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
    583 
    584 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
    585 	if (!sched->runavg_parent_cpu_usage)
    586 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
    587 	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
    588 					 sched->parent_cpu_usage)/10;
    589 
    590 	ret = pthread_mutex_lock(&sched->start_work_mutex);
    591 	BUG_ON(ret);
    592 
    593 	for (i = 0; i < sched->nr_tasks; i++) {
    594 		task = sched->tasks[i];
    595 		sem_init(&task->sleep_sem, 0, 0);
    596 		task->curr_event = 0;
    597 	}
    598 }
    599 
    600 static void run_one_test(struct perf_sched *sched)
    601 {
    602 	u64 T0, T1, delta, avg_delta, fluct;
    603 
    604 	T0 = get_nsecs();
    605 	wait_for_tasks(sched);
    606 	T1 = get_nsecs();
    607 
    608 	delta = T1 - T0;
    609 	sched->sum_runtime += delta;
    610 	sched->nr_runs++;
    611 
    612 	avg_delta = sched->sum_runtime / sched->nr_runs;
    613 	if (delta < avg_delta)
    614 		fluct = avg_delta - delta;
    615 	else
    616 		fluct = delta - avg_delta;
    617 	sched->sum_fluct += fluct;
    618 	if (!sched->run_avg)
    619 		sched->run_avg = delta;
    620 	sched->run_avg = (sched->run_avg * 9 + delta) / 10;
    621 
    622 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
    623 
    624 	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
    625 
    626 	printf("cpu: %0.2f / %0.2f",
    627 		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
    628 
    629 #if 0
    630 	/*
    631 	 * rusage statistics done by the parent, these are less
    632 	 * accurate than the sched->sum_exec_runtime based statistics:
    633 	 */
    634 	printf(" [%0.2f / %0.2f]",
    635 		(double)sched->parent_cpu_usage/1e6,
    636 		(double)sched->runavg_parent_cpu_usage/1e6);
    637 #endif
    638 
    639 	printf("\n");
    640 
    641 	if (sched->nr_sleep_corrections)
    642 		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
    643 	sched->nr_sleep_corrections = 0;
    644 }
    645 
    646 static void test_calibrations(struct perf_sched *sched)
    647 {
    648 	u64 T0, T1;
    649 
    650 	T0 = get_nsecs();
    651 	burn_nsecs(sched, 1e6);
    652 	T1 = get_nsecs();
    653 
    654 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
    655 
    656 	T0 = get_nsecs();
    657 	sleep_nsecs(1e6);
    658 	T1 = get_nsecs();
    659 
    660 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
    661 }
    662 
    663 static int
    664 replay_wakeup_event(struct perf_sched *sched,
    665 		    struct perf_evsel *evsel, struct perf_sample *sample,
    666 		    struct machine *machine __maybe_unused)
    667 {
    668 	const char *comm = perf_evsel__strval(evsel, sample, "comm");
    669 	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
    670 	struct task_desc *waker, *wakee;
    671 
    672 	if (verbose) {
    673 		printf("sched_wakeup event %p\n", evsel);
    674 
    675 		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
    676 	}
    677 
    678 	waker = register_pid(sched, sample->tid, "<unknown>");
    679 	wakee = register_pid(sched, pid, comm);
    680 
    681 	add_sched_event_wakeup(sched, waker, sample->time, wakee);
    682 	return 0;
    683 }
    684 
    685 static int replay_switch_event(struct perf_sched *sched,
    686 			       struct perf_evsel *evsel,
    687 			       struct perf_sample *sample,
    688 			       struct machine *machine __maybe_unused)
    689 {
    690 	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
    691 		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
    692 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
    693 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
    694 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
    695 	struct task_desc *prev, __maybe_unused *next;
    696 	u64 timestamp0, timestamp = sample->time;
    697 	int cpu = sample->cpu;
    698 	s64 delta;
    699 
    700 	if (verbose)
    701 		printf("sched_switch event %p\n", evsel);
    702 
    703 	if (cpu >= MAX_CPUS || cpu < 0)
    704 		return 0;
    705 
    706 	timestamp0 = sched->cpu_last_switched[cpu];
    707 	if (timestamp0)
    708 		delta = timestamp - timestamp0;
    709 	else
    710 		delta = 0;
    711 
    712 	if (delta < 0) {
     713 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
    714 		return -1;
    715 	}
    716 
    717 	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
    718 		 prev_comm, prev_pid, next_comm, next_pid, delta);
    719 
    720 	prev = register_pid(sched, prev_pid, prev_comm);
    721 	next = register_pid(sched, next_pid, next_comm);
    722 
    723 	sched->cpu_last_switched[cpu] = timestamp;
    724 
    725 	add_sched_event_run(sched, prev, timestamp, delta);
    726 	add_sched_event_sleep(sched, prev, timestamp, prev_state);
    727 
    728 	return 0;
    729 }
    730 
    731 static int replay_fork_event(struct perf_sched *sched,
    732 			     union perf_event *event,
    733 			     struct machine *machine)
    734 {
    735 	struct thread *child, *parent;
    736 
    737 	child = machine__findnew_thread(machine, event->fork.pid,
    738 					event->fork.tid);
    739 	parent = machine__findnew_thread(machine, event->fork.ppid,
    740 					 event->fork.ptid);
    741 
    742 	if (child == NULL || parent == NULL) {
    743 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
    744 				 child, parent);
    745 		return 0;
    746 	}
    747 
    748 	if (verbose) {
    749 		printf("fork event\n");
    750 		printf("... parent: %s/%d\n", parent->comm, parent->tid);
    751 		printf("...  child: %s/%d\n", child->comm, child->tid);
    752 	}
    753 
    754 	register_pid(sched, parent->tid, parent->comm);
    755 	register_pid(sched, child->tid, child->comm);
    756 	return 0;
    757 }
    758 
    759 struct sort_dimension {
    760 	const char		*name;
    761 	sort_fn_t		cmp;
    762 	struct list_head	list;
    763 };
    764 
    765 static int
    766 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
    767 {
    768 	struct sort_dimension *sort;
    769 	int ret = 0;
    770 
    771 	BUG_ON(list_empty(list));
    772 
    773 	list_for_each_entry(sort, list, list) {
    774 		ret = sort->cmp(l, r);
    775 		if (ret)
    776 			return ret;
    777 	}
    778 
    779 	return ret;
    780 }
    781 
    782 static struct work_atoms *
    783 thread_atoms_search(struct rb_root *root, struct thread *thread,
    784 			 struct list_head *sort_list)
    785 {
    786 	struct rb_node *node = root->rb_node;
    787 	struct work_atoms key = { .thread = thread };
    788 
    789 	while (node) {
    790 		struct work_atoms *atoms;
    791 		int cmp;
    792 
    793 		atoms = container_of(node, struct work_atoms, node);
    794 
    795 		cmp = thread_lat_cmp(sort_list, &key, atoms);
    796 		if (cmp > 0)
    797 			node = node->rb_left;
    798 		else if (cmp < 0)
    799 			node = node->rb_right;
    800 		else {
    801 			BUG_ON(thread != atoms->thread);
    802 			return atoms;
    803 		}
    804 	}
    805 	return NULL;
    806 }
    807 
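         /* Insert @data into the rb-tree, ordered by the configured sort keys. */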
    808 static void
    809 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
    810 			 struct list_head *sort_list)
    811 {
    812 	struct rb_node **new = &(root->rb_node), *parent = NULL;
    813 
    814 	while (*new) {
    815 		struct work_atoms *this;
    816 		int cmp;
    817 
    818 		this = container_of(*new, struct work_atoms, node);
    819 		parent = *new;
    820 
    821 		cmp = thread_lat_cmp(sort_list, data, this);
    822 
    823 		if (cmp > 0)
    824 			new = &((*new)->rb_left);
    825 		else
    826 			new = &((*new)->rb_right);
    827 	}
    828 
    829 	rb_link_node(&data->node, parent, new);
    830 	rb_insert_color(&data->node, root);
    831 }
    832 
    833 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
    834 {
    835 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
    836 	if (!atoms) {
    837 		pr_err("No memory at %s\n", __func__);
    838 		return -1;
    839 	}
    840 
    841 	atoms->thread = thread;
    842 	INIT_LIST_HEAD(&atoms->work_list);
    843 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
    844 	return 0;
    845 }
    846 
    847 static char sched_out_state(u64 prev_state)
    848 {
    849 	const char *str = TASK_STATE_TO_CHAR_STR;
    850 
    851 	return str[prev_state];
    852 }
    853 
    854 static int
    855 add_sched_out_event(struct work_atoms *atoms,
    856 		    char run_state,
    857 		    u64 timestamp)
    858 {
    859 	struct work_atom *atom = zalloc(sizeof(*atom));
    860 	if (!atom) {
     861 		pr_err("No memory at %s\n", __func__);
    862 		return -1;
    863 	}
    864 
    865 	atom->sched_out_time = timestamp;
    866 
    867 	if (run_state == 'R') {
    868 		atom->state = THREAD_WAIT_CPU;
    869 		atom->wake_up_time = atom->sched_out_time;
    870 	}
    871 
    872 	list_add_tail(&atom->list, &atoms->work_list);
    873 	return 0;
    874 }
    875 
    876 static void
    877 add_runtime_event(struct work_atoms *atoms, u64 delta,
    878 		  u64 timestamp __maybe_unused)
    879 {
    880 	struct work_atom *atom;
    881 
    882 	BUG_ON(list_empty(&atoms->work_list));
    883 
    884 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
    885 
    886 	atom->runtime += delta;
    887 	atoms->total_runtime += delta;
    888 }
    889 
    890 static void
    891 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
    892 {
    893 	struct work_atom *atom;
    894 	u64 delta;
    895 
    896 	if (list_empty(&atoms->work_list))
    897 		return;
    898 
    899 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
    900 
    901 	if (atom->state != THREAD_WAIT_CPU)
    902 		return;
    903 
    904 	if (timestamp < atom->wake_up_time) {
    905 		atom->state = THREAD_IGNORE;
    906 		return;
    907 	}
    908 
    909 	atom->state = THREAD_SCHED_IN;
    910 	atom->sched_in_time = timestamp;
    911 
    912 	delta = atom->sched_in_time - atom->wake_up_time;
    913 	atoms->total_lat += delta;
    914 	if (delta > atoms->max_lat) {
    915 		atoms->max_lat = delta;
    916 		atoms->max_lat_at = timestamp;
    917 	}
    918 	atoms->nb_atoms++;
    919 }
    920 
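         /*
          * sched_switch handler for 'perf sched latency': close out an atom for
          * the task scheduled out and record the sched-in time for the task
          * scheduled in, from which the wakeup latency is computed.
          */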
    921 static int latency_switch_event(struct perf_sched *sched,
    922 				struct perf_evsel *evsel,
    923 				struct perf_sample *sample,
    924 				struct machine *machine)
    925 {
    926 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
    927 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
    928 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
    929 	struct work_atoms *out_events, *in_events;
    930 	struct thread *sched_out, *sched_in;
    931 	u64 timestamp0, timestamp = sample->time;
    932 	int cpu = sample->cpu;
    933 	s64 delta;
    934 
    935 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
    936 
    937 	timestamp0 = sched->cpu_last_switched[cpu];
    938 	sched->cpu_last_switched[cpu] = timestamp;
    939 	if (timestamp0)
    940 		delta = timestamp - timestamp0;
    941 	else
    942 		delta = 0;
    943 
    944 	if (delta < 0) {
     945 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
    946 		return -1;
    947 	}
    948 
    949 	sched_out = machine__findnew_thread(machine, 0, prev_pid);
    950 	sched_in = machine__findnew_thread(machine, 0, next_pid);
    951 
    952 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
    953 	if (!out_events) {
    954 		if (thread_atoms_insert(sched, sched_out))
    955 			return -1;
    956 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
    957 		if (!out_events) {
    958 			pr_err("out-event: Internal tree error");
    959 			return -1;
    960 		}
    961 	}
    962 	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
    963 		return -1;
    964 
    965 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
    966 	if (!in_events) {
    967 		if (thread_atoms_insert(sched, sched_in))
    968 			return -1;
    969 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
    970 		if (!in_events) {
    971 			pr_err("in-event: Internal tree error");
    972 			return -1;
    973 		}
    974 		/*
     975 		 * Task came in that we have not heard about yet,
    976 		 * add in an initial atom in runnable state:
    977 		 */
    978 		if (add_sched_out_event(in_events, 'R', timestamp))
    979 			return -1;
    980 	}
    981 	add_sched_in_event(in_events, timestamp);
    982 
    983 	return 0;
    984 }
    985 
    986 static int latency_runtime_event(struct perf_sched *sched,
    987 				 struct perf_evsel *evsel,
    988 				 struct perf_sample *sample,
    989 				 struct machine *machine)
    990 {
    991 	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
    992 	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
    993 	struct thread *thread = machine__findnew_thread(machine, 0, pid);
    994 	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
    995 	u64 timestamp = sample->time;
    996 	int cpu = sample->cpu;
    997 
    998 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
    999 	if (!atoms) {
   1000 		if (thread_atoms_insert(sched, thread))
   1001 			return -1;
   1002 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
   1003 		if (!atoms) {
   1004 			pr_err("in-event: Internal tree error");
   1005 			return -1;
   1006 		}
   1007 		if (add_sched_out_event(atoms, 'R', timestamp))
   1008 			return -1;
   1009 	}
   1010 
   1011 	add_runtime_event(atoms, runtime, timestamp);
   1012 	return 0;
   1013 }
   1014 
   1015 static int latency_wakeup_event(struct perf_sched *sched,
   1016 				struct perf_evsel *evsel,
   1017 				struct perf_sample *sample,
   1018 				struct machine *machine)
   1019 {
   1020 	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid"),
   1021 		  success = perf_evsel__intval(evsel, sample, "success");
   1022 	struct work_atoms *atoms;
   1023 	struct work_atom *atom;
   1024 	struct thread *wakee;
   1025 	u64 timestamp = sample->time;
   1026 
   1027 	/* Note for later, it may be interesting to observe the failing cases */
   1028 	if (!success)
   1029 		return 0;
   1030 
   1031 	wakee = machine__findnew_thread(machine, 0, pid);
   1032 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
   1033 	if (!atoms) {
   1034 		if (thread_atoms_insert(sched, wakee))
   1035 			return -1;
   1036 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
   1037 		if (!atoms) {
   1038 			pr_err("wakeup-event: Internal tree error");
   1039 			return -1;
   1040 		}
   1041 		if (add_sched_out_event(atoms, 'S', timestamp))
   1042 			return -1;
   1043 	}
   1044 
   1045 	BUG_ON(list_empty(&atoms->work_list));
   1046 
   1047 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1048 
   1049 	/*
   1050 	 * You WILL be missing events if you've recorded only
    1051 	 * one CPU, or are only looking at one, so don't
   1052 	 * make useless noise.
   1053 	 */
   1054 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
   1055 		sched->nr_state_machine_bugs++;
   1056 
   1057 	sched->nr_timestamps++;
   1058 	if (atom->sched_out_time > timestamp) {
   1059 		sched->nr_unordered_timestamps++;
   1060 		return 0;
   1061 	}
   1062 
   1063 	atom->state = THREAD_WAIT_CPU;
   1064 	atom->wake_up_time = timestamp;
   1065 	return 0;
   1066 }
   1067 
   1068 static int latency_migrate_task_event(struct perf_sched *sched,
   1069 				      struct perf_evsel *evsel,
   1070 				      struct perf_sample *sample,
   1071 				      struct machine *machine)
   1072 {
   1073 	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
   1074 	u64 timestamp = sample->time;
   1075 	struct work_atoms *atoms;
   1076 	struct work_atom *atom;
   1077 	struct thread *migrant;
   1078 
   1079 	/*
   1080 	 * Only need to worry about migration when profiling one CPU.
   1081 	 */
   1082 	if (sched->profile_cpu == -1)
   1083 		return 0;
   1084 
   1085 	migrant = machine__findnew_thread(machine, 0, pid);
   1086 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
   1087 	if (!atoms) {
   1088 		if (thread_atoms_insert(sched, migrant))
   1089 			return -1;
   1090 		register_pid(sched, migrant->tid, migrant->comm);
   1091 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
   1092 		if (!atoms) {
   1093 			pr_err("migration-event: Internal tree error");
   1094 			return -1;
   1095 		}
   1096 		if (add_sched_out_event(atoms, 'R', timestamp))
   1097 			return -1;
   1098 	}
   1099 
   1100 	BUG_ON(list_empty(&atoms->work_list));
   1101 
   1102 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1103 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
   1104 
   1105 	sched->nr_timestamps++;
   1106 
   1107 	if (atom->sched_out_time > timestamp)
   1108 		sched->nr_unordered_timestamps++;
   1109 
   1110 	return 0;
   1111 }
   1112 
   1113 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
   1114 {
   1115 	int i;
   1116 	int ret;
   1117 	u64 avg;
   1118 
   1119 	if (!work_list->nb_atoms)
   1120 		return;
   1121 	/*
   1122 	 * Ignore idle threads:
   1123 	 */
   1124 	if (!strcmp(work_list->thread->comm, "swapper"))
   1125 		return;
   1126 
   1127 	sched->all_runtime += work_list->total_runtime;
   1128 	sched->all_count   += work_list->nb_atoms;
   1129 
   1130 	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->tid);
   1131 
   1132 	for (i = 0; i < 24 - ret; i++)
   1133 		printf(" ");
   1134 
   1135 	avg = work_list->total_lat / work_list->nb_atoms;
   1136 
   1137 	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
   1138 	      (double)work_list->total_runtime / 1e6,
   1139 		 work_list->nb_atoms, (double)avg / 1e6,
   1140 		 (double)work_list->max_lat / 1e6,
   1141 		 (double)work_list->max_lat_at / 1e9);
   1142 }
   1143 
   1144 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
   1145 {
   1146 	if (l->thread->tid < r->thread->tid)
   1147 		return -1;
   1148 	if (l->thread->tid > r->thread->tid)
   1149 		return 1;
   1150 
   1151 	return 0;
   1152 }
   1153 
   1154 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
   1155 {
   1156 	u64 avgl, avgr;
   1157 
   1158 	if (!l->nb_atoms)
   1159 		return -1;
   1160 
   1161 	if (!r->nb_atoms)
   1162 		return 1;
   1163 
   1164 	avgl = l->total_lat / l->nb_atoms;
   1165 	avgr = r->total_lat / r->nb_atoms;
   1166 
   1167 	if (avgl < avgr)
   1168 		return -1;
   1169 	if (avgl > avgr)
   1170 		return 1;
   1171 
   1172 	return 0;
   1173 }
   1174 
   1175 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
   1176 {
   1177 	if (l->max_lat < r->max_lat)
   1178 		return -1;
   1179 	if (l->max_lat > r->max_lat)
   1180 		return 1;
   1181 
   1182 	return 0;
   1183 }
   1184 
   1185 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
   1186 {
   1187 	if (l->nb_atoms < r->nb_atoms)
   1188 		return -1;
   1189 	if (l->nb_atoms > r->nb_atoms)
   1190 		return 1;
   1191 
   1192 	return 0;
   1193 }
   1194 
   1195 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
   1196 {
   1197 	if (l->total_runtime < r->total_runtime)
   1198 		return -1;
   1199 	if (l->total_runtime > r->total_runtime)
   1200 		return 1;
   1201 
   1202 	return 0;
   1203 }
   1204 
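         /*
          * Map a --sort token (pid, avg, max, switch, runtime) onto its
          * comparison function and append it to @list.
          */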
   1205 static int sort_dimension__add(const char *tok, struct list_head *list)
   1206 {
   1207 	size_t i;
   1208 	static struct sort_dimension avg_sort_dimension = {
   1209 		.name = "avg",
   1210 		.cmp  = avg_cmp,
   1211 	};
   1212 	static struct sort_dimension max_sort_dimension = {
   1213 		.name = "max",
   1214 		.cmp  = max_cmp,
   1215 	};
   1216 	static struct sort_dimension pid_sort_dimension = {
   1217 		.name = "pid",
   1218 		.cmp  = pid_cmp,
   1219 	};
   1220 	static struct sort_dimension runtime_sort_dimension = {
   1221 		.name = "runtime",
   1222 		.cmp  = runtime_cmp,
   1223 	};
   1224 	static struct sort_dimension switch_sort_dimension = {
   1225 		.name = "switch",
   1226 		.cmp  = switch_cmp,
   1227 	};
   1228 	struct sort_dimension *available_sorts[] = {
   1229 		&pid_sort_dimension,
   1230 		&avg_sort_dimension,
   1231 		&max_sort_dimension,
   1232 		&switch_sort_dimension,
   1233 		&runtime_sort_dimension,
   1234 	};
   1235 
   1236 	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
   1237 		if (!strcmp(available_sorts[i]->name, tok)) {
   1238 			list_add_tail(&available_sorts[i]->list, list);
   1239 
   1240 			return 0;
   1241 		}
   1242 	}
   1243 
   1244 	return -1;
   1245 }
   1246 
   1247 static void perf_sched__sort_lat(struct perf_sched *sched)
   1248 {
   1249 	struct rb_node *node;
   1250 
   1251 	for (;;) {
   1252 		struct work_atoms *data;
   1253 		node = rb_first(&sched->atom_root);
   1254 		if (!node)
   1255 			break;
   1256 
   1257 		rb_erase(node, &sched->atom_root);
   1258 		data = rb_entry(node, struct work_atoms, node);
   1259 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
   1260 	}
   1261 }
   1262 
   1263 static int process_sched_wakeup_event(struct perf_tool *tool,
   1264 				      struct perf_evsel *evsel,
   1265 				      struct perf_sample *sample,
   1266 				      struct machine *machine)
   1267 {
   1268 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1269 
   1270 	if (sched->tp_handler->wakeup_event)
   1271 		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
   1272 
   1273 	return 0;
   1274 }
   1275 
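         /*
          * sched_switch handler for 'perf sched map': give each new thread a
          * two-character shortname and print one column per CPU showing which
          * thread is currently running there.
          */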
   1276 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
   1277 			    struct perf_sample *sample, struct machine *machine)
   1278 {
   1279 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
   1280 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
   1281 	struct thread *sched_out __maybe_unused, *sched_in;
   1282 	int new_shortname;
   1283 	u64 timestamp0, timestamp = sample->time;
   1284 	s64 delta;
   1285 	int cpu, this_cpu = sample->cpu;
   1286 
   1287 	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
   1288 
   1289 	if (this_cpu > sched->max_cpu)
   1290 		sched->max_cpu = this_cpu;
   1291 
   1292 	timestamp0 = sched->cpu_last_switched[this_cpu];
   1293 	sched->cpu_last_switched[this_cpu] = timestamp;
   1294 	if (timestamp0)
   1295 		delta = timestamp - timestamp0;
   1296 	else
   1297 		delta = 0;
   1298 
   1299 	if (delta < 0) {
    1300 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
   1301 		return -1;
   1302 	}
   1303 
   1304 	sched_out = machine__findnew_thread(machine, 0, prev_pid);
   1305 	sched_in = machine__findnew_thread(machine, 0, next_pid);
   1306 
   1307 	sched->curr_thread[this_cpu] = sched_in;
   1308 
   1309 	printf("  ");
   1310 
   1311 	new_shortname = 0;
   1312 	if (!sched_in->shortname[0]) {
   1313 		sched_in->shortname[0] = sched->next_shortname1;
   1314 		sched_in->shortname[1] = sched->next_shortname2;
   1315 
   1316 		if (sched->next_shortname1 < 'Z') {
   1317 			sched->next_shortname1++;
   1318 		} else {
   1319 			sched->next_shortname1='A';
   1320 			if (sched->next_shortname2 < '9') {
   1321 				sched->next_shortname2++;
   1322 			} else {
   1323 				sched->next_shortname2='0';
   1324 			}
   1325 		}
   1326 		new_shortname = 1;
   1327 	}
   1328 
   1329 	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
   1330 		if (cpu != this_cpu)
   1331 			printf(" ");
   1332 		else
   1333 			printf("*");
   1334 
   1335 		if (sched->curr_thread[cpu]) {
   1336 			if (sched->curr_thread[cpu]->tid)
   1337 				printf("%2s ", sched->curr_thread[cpu]->shortname);
   1338 			else
   1339 				printf(".  ");
   1340 		} else
   1341 			printf("   ");
   1342 	}
   1343 
   1344 	printf("  %12.6f secs ", (double)timestamp/1e9);
   1345 	if (new_shortname) {
   1346 		printf("%s => %s:%d\n",
   1347 			sched_in->shortname, sched_in->comm, sched_in->tid);
   1348 	} else {
   1349 		printf("\n");
   1350 	}
   1351 
   1352 	return 0;
   1353 }
   1354 
   1355 static int process_sched_switch_event(struct perf_tool *tool,
   1356 				      struct perf_evsel *evsel,
   1357 				      struct perf_sample *sample,
   1358 				      struct machine *machine)
   1359 {
   1360 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1361 	int this_cpu = sample->cpu, err = 0;
   1362 	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
   1363 	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
   1364 
   1365 	if (sched->curr_pid[this_cpu] != (u32)-1) {
   1366 		/*
   1367 		 * Are we trying to switch away a PID that is
   1368 		 * not current?
   1369 		 */
   1370 		if (sched->curr_pid[this_cpu] != prev_pid)
   1371 			sched->nr_context_switch_bugs++;
   1372 	}
   1373 
   1374 	if (sched->tp_handler->switch_event)
   1375 		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
   1376 
   1377 	sched->curr_pid[this_cpu] = next_pid;
   1378 	return err;
   1379 }
   1380 
   1381 static int process_sched_runtime_event(struct perf_tool *tool,
   1382 				       struct perf_evsel *evsel,
   1383 				       struct perf_sample *sample,
   1384 				       struct machine *machine)
   1385 {
   1386 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1387 
   1388 	if (sched->tp_handler->runtime_event)
   1389 		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
   1390 
   1391 	return 0;
   1392 }
   1393 
   1394 static int perf_sched__process_fork_event(struct perf_tool *tool,
   1395 					  union perf_event *event,
   1396 					  struct perf_sample *sample,
   1397 					  struct machine *machine)
   1398 {
   1399 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1400 
    1401 	/* run the fork event through the perf machinery */
   1402 	perf_event__process_fork(tool, event, sample, machine);
   1403 
   1404 	/* and then run additional processing needed for this command */
   1405 	if (sched->tp_handler->fork_event)
   1406 		return sched->tp_handler->fork_event(sched, event, machine);
   1407 
   1408 	return 0;
   1409 }
   1410 
   1411 static int process_sched_migrate_task_event(struct perf_tool *tool,
   1412 					    struct perf_evsel *evsel,
   1413 					    struct perf_sample *sample,
   1414 					    struct machine *machine)
   1415 {
   1416 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1417 
   1418 	if (sched->tp_handler->migrate_task_event)
   1419 		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
   1420 
   1421 	return 0;
   1422 }
   1423 
   1424 typedef int (*tracepoint_handler)(struct perf_tool *tool,
   1425 				  struct perf_evsel *evsel,
   1426 				  struct perf_sample *sample,
   1427 				  struct machine *machine);
   1428 
   1429 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
   1430 						 union perf_event *event __maybe_unused,
   1431 						 struct perf_sample *sample,
   1432 						 struct perf_evsel *evsel,
   1433 						 struct machine *machine)
   1434 {
   1435 	int err = 0;
   1436 
   1437 	evsel->hists.stats.total_period += sample->period;
   1438 	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
   1439 
   1440 	if (evsel->handler.func != NULL) {
   1441 		tracepoint_handler f = evsel->handler.func;
   1442 		err = f(tool, evsel, sample, machine);
   1443 	}
   1444 
   1445 	return err;
   1446 }
   1447 
   1448 static int perf_sched__read_events(struct perf_sched *sched,
   1449 				   struct perf_session **psession)
   1450 {
   1451 	const struct perf_evsel_str_handler handlers[] = {
   1452 		{ "sched:sched_switch",	      process_sched_switch_event, },
   1453 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
   1454 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
   1455 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
   1456 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
   1457 	};
   1458 	struct perf_session *session;
   1459 
   1460 	session = perf_session__new(input_name, O_RDONLY, 0, false, &sched->tool);
   1461 	if (session == NULL) {
    1462 		pr_debug("No memory for session\n");
   1463 		return -1;
   1464 	}
   1465 
   1466 	if (perf_session__set_tracepoints_handlers(session, handlers))
   1467 		goto out_delete;
   1468 
   1469 	if (perf_session__has_traces(session, "record -R")) {
   1470 		int err = perf_session__process_events(session, &sched->tool);
   1471 		if (err) {
   1472 			pr_err("Failed to process events, error %d", err);
   1473 			goto out_delete;
   1474 		}
   1475 
   1476 		sched->nr_events      = session->stats.nr_events[0];
   1477 		sched->nr_lost_events = session->stats.total_lost;
   1478 		sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
   1479 	}
   1480 
   1481 	if (psession)
   1482 		*psession = session;
   1483 	else
   1484 		perf_session__delete(session);
   1485 
   1486 	return 0;
   1487 
   1488 out_delete:
   1489 	perf_session__delete(session);
   1490 	return -1;
   1491 }
   1492 
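         /*
          * Summarize anomalies seen while processing: unordered timestamps,
          * lost events, and state-machine/context-switch inconsistencies.
          */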
   1493 static void print_bad_events(struct perf_sched *sched)
   1494 {
   1495 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
   1496 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
   1497 			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
   1498 			sched->nr_unordered_timestamps, sched->nr_timestamps);
   1499 	}
   1500 	if (sched->nr_lost_events && sched->nr_events) {
   1501 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
   1502 			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
   1503 			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
   1504 	}
   1505 	if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
   1506 		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
   1507 			(double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
   1508 			sched->nr_state_machine_bugs, sched->nr_timestamps);
   1509 		if (sched->nr_lost_events)
   1510 			printf(" (due to lost events?)");
   1511 		printf("\n");
   1512 	}
   1513 	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
   1514 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
   1515 			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
   1516 			sched->nr_context_switch_bugs, sched->nr_timestamps);
   1517 		if (sched->nr_lost_events)
   1518 			printf(" (due to lost events?)");
   1519 		printf("\n");
   1520 	}
   1521 }
   1522 
   1523 static int perf_sched__lat(struct perf_sched *sched)
   1524 {
   1525 	struct rb_node *next;
   1526 	struct perf_session *session;
   1527 
   1528 	setup_pager();
   1529 
   1530 	/* save session -- references to threads are held in work_list */
   1531 	if (perf_sched__read_events(sched, &session))
   1532 		return -1;
   1533 
   1534 	perf_sched__sort_lat(sched);
   1535 
   1536 	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
   1537 	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
   1538 	printf(" ---------------------------------------------------------------------------------------------------------------\n");
   1539 
   1540 	next = rb_first(&sched->sorted_atom_root);
   1541 
   1542 	while (next) {
   1543 		struct work_atoms *work_list;
   1544 
   1545 		work_list = rb_entry(next, struct work_atoms, node);
   1546 		output_lat_thread(sched, work_list);
   1547 		next = rb_next(next);
   1548 	}
   1549 
   1550 	printf(" -----------------------------------------------------------------------------------------\n");
   1551 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
   1552 		(double)sched->all_runtime / 1e6, sched->all_count);
   1553 
   1554 	printf(" ---------------------------------------------------\n");
   1555 
   1556 	print_bad_events(sched);
   1557 	printf("\n");
   1558 
   1559 	perf_session__delete(session);
   1560 	return 0;
   1561 }
   1562 
   1563 static int perf_sched__map(struct perf_sched *sched)
   1564 {
   1565 	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
   1566 
   1567 	setup_pager();
   1568 	if (perf_sched__read_events(sched, NULL))
   1569 		return -1;
   1570 	print_bad_events(sched);
   1571 	return 0;
   1572 }
   1573 
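         /*
          * 'perf sched replay': calibrate measurement overhead, rebuild the
          * per-task event lists from the trace, spawn one thread per task and
          * re-run the recorded workload replay_repeat times.
          */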
   1574 static int perf_sched__replay(struct perf_sched *sched)
   1575 {
   1576 	unsigned long i;
   1577 
   1578 	calibrate_run_measurement_overhead(sched);
   1579 	calibrate_sleep_measurement_overhead(sched);
   1580 
   1581 	test_calibrations(sched);
   1582 
   1583 	if (perf_sched__read_events(sched, NULL))
   1584 		return -1;
   1585 
   1586 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
   1587 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
   1588 	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
   1589 
   1590 	if (sched->targetless_wakeups)
   1591 		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
   1592 	if (sched->multitarget_wakeups)
   1593 		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
   1594 	if (sched->nr_run_events_optimized)
   1595 		printf("run atoms optimized: %ld\n",
   1596 			sched->nr_run_events_optimized);
   1597 
   1598 	print_task_traces(sched);
   1599 	add_cross_task_wakeups(sched);
   1600 
   1601 	create_tasks(sched);
   1602 	printf("------------------------------------------------------------\n");
   1603 	for (i = 0; i < sched->replay_repeat; i++)
   1604 		run_one_test(sched);
   1605 
   1606 	return 0;
   1607 }
   1608 
   1609 static void setup_sorting(struct perf_sched *sched, const struct option *options,
   1610 			  const char * const usage_msg[])
   1611 {
   1612 	char *tmp, *tok, *str = strdup(sched->sort_order);
   1613 
   1614 	for (tok = strtok_r(str, ", ", &tmp);
   1615 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
   1616 		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
   1617 			error("Unknown --sort key: `%s'", tok);
   1618 			usage_with_options(usage_msg, options);
   1619 		}
   1620 	}
   1621 
   1622 	free(str);
   1623 
   1624 	sort_dimension__add("pid", &sched->cmp_pid);
   1625 }
   1626 
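         /*
          * Build an argv for 'perf record' with the scheduler tracepoints
          * needed by latency/map/replay and hand it to cmd_record().
          */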
   1627 static int __cmd_record(int argc, const char **argv)
   1628 {
   1629 	unsigned int rec_argc, i, j;
   1630 	const char **rec_argv;
   1631 	const char * const record_args[] = {
   1632 		"record",
   1633 		"-a",
   1634 		"-R",
   1635 		"-m", "1024",
   1636 		"-c", "1",
   1637 		"-e", "sched:sched_switch",
   1638 		"-e", "sched:sched_stat_wait",
   1639 		"-e", "sched:sched_stat_sleep",
   1640 		"-e", "sched:sched_stat_iowait",
   1641 		"-e", "sched:sched_stat_runtime",
   1642 		"-e", "sched:sched_process_fork",
   1643 		"-e", "sched:sched_wakeup",
   1644 		"-e", "sched:sched_migrate_task",
   1645 	};
   1646 
   1647 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
   1648 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
   1649 
   1650 	if (rec_argv == NULL)
   1651 		return -ENOMEM;
   1652 
   1653 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
   1654 		rec_argv[i] = strdup(record_args[i]);
   1655 
   1656 	for (j = 1; j < (unsigned int)argc; j++, i++)
   1657 		rec_argv[i] = argv[j];
   1658 
   1659 	BUG_ON(i != rec_argc);
   1660 
   1661 	return cmd_record(i, rec_argv, NULL);
   1662 }
   1663 
   1664 static const char default_sort_order[] = "avg, max, switch, runtime";
   1665 static struct perf_sched sched = {
   1666 	.tool = {
   1667 		.sample		 = perf_sched__process_tracepoint_sample,
   1668 		.comm		 = perf_event__process_comm,
   1669 		.lost		 = perf_event__process_lost,
   1670 		.fork		 = perf_sched__process_fork_event,
   1671 		.ordered_samples = true,
   1672 	},
   1673 	.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
   1674 	.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
   1675 	.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
   1676 	.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
   1677 	.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },
   1678 	.sort_order	      = default_sort_order,
   1679 	.replay_repeat	      = 10,
   1680 	.profile_cpu	      = -1,
   1681 	.next_shortname1      = 'A',
   1682 	.next_shortname2      = '0',
   1683 };
   1684 
   1685 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
   1686 {
   1687 	const struct option latency_options[] = {
   1688 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
   1689 		   "sort by key(s): runtime, switch, avg, max"),
   1690 	OPT_INCR('v', "verbose", &verbose,
   1691 		    "be more verbose (show symbol address, etc)"),
   1692 	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
   1693 		    "CPU to profile on"),
   1694 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   1695 		    "dump raw trace in ASCII"),
   1696 	OPT_END()
   1697 	};
   1698 	const struct option replay_options[] = {
   1699 	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
   1700 		     "repeat the workload replay N times (-1: infinite)"),
   1701 	OPT_INCR('v', "verbose", &verbose,
   1702 		    "be more verbose (show symbol address, etc)"),
   1703 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   1704 		    "dump raw trace in ASCII"),
   1705 	OPT_END()
   1706 	};
   1707 	const struct option sched_options[] = {
   1708 	OPT_STRING('i', "input", &input_name, "file",
   1709 		    "input file name"),
   1710 	OPT_INCR('v', "verbose", &verbose,
   1711 		    "be more verbose (show symbol address, etc)"),
   1712 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   1713 		    "dump raw trace in ASCII"),
   1714 	OPT_END()
   1715 	};
   1716 	const char * const latency_usage[] = {
   1717 		"perf sched latency [<options>]",
   1718 		NULL
   1719 	};
   1720 	const char * const replay_usage[] = {
   1721 		"perf sched replay [<options>]",
   1722 		NULL
   1723 	};
   1724 	const char * const sched_usage[] = {
   1725 		"perf sched [<options>] {record|latency|map|replay|script}",
   1726 		NULL
   1727 	};
   1728 	struct trace_sched_handler lat_ops  = {
   1729 		.wakeup_event	    = latency_wakeup_event,
   1730 		.switch_event	    = latency_switch_event,
   1731 		.runtime_event	    = latency_runtime_event,
   1732 		.migrate_task_event = latency_migrate_task_event,
   1733 	};
   1734 	struct trace_sched_handler map_ops  = {
   1735 		.switch_event	    = map_switch_event,
   1736 	};
   1737 	struct trace_sched_handler replay_ops  = {
   1738 		.wakeup_event	    = replay_wakeup_event,
   1739 		.switch_event	    = replay_switch_event,
   1740 		.fork_event	    = replay_fork_event,
   1741 	};
   1742 
   1743 	argc = parse_options(argc, argv, sched_options, sched_usage,
   1744 			     PARSE_OPT_STOP_AT_NON_OPTION);
   1745 	if (!argc)
   1746 		usage_with_options(sched_usage, sched_options);
   1747 
   1748 	/*
   1749 	 * Aliased to 'perf script' for now:
   1750 	 */
   1751 	if (!strcmp(argv[0], "script"))
   1752 		return cmd_script(argc, argv, prefix);
   1753 
   1754 	symbol__init();
   1755 	if (!strncmp(argv[0], "rec", 3)) {
   1756 		return __cmd_record(argc, argv);
   1757 	} else if (!strncmp(argv[0], "lat", 3)) {
   1758 		sched.tp_handler = &lat_ops;
   1759 		if (argc > 1) {
   1760 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
   1761 			if (argc)
   1762 				usage_with_options(latency_usage, latency_options);
   1763 		}
   1764 		setup_sorting(&sched, latency_options, latency_usage);
   1765 		return perf_sched__lat(&sched);
   1766 	} else if (!strcmp(argv[0], "map")) {
   1767 		sched.tp_handler = &map_ops;
   1768 		setup_sorting(&sched, latency_options, latency_usage);
   1769 		return perf_sched__map(&sched);
   1770 	} else if (!strncmp(argv[0], "rep", 3)) {
   1771 		sched.tp_handler = &replay_ops;
   1772 		if (argc) {
   1773 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
   1774 			if (argc)
   1775 				usage_with_options(replay_usage, replay_options);
   1776 		}
   1777 		return perf_sched__replay(&sched);
   1778 	} else {
   1779 		usage_with_options(sched_usage, sched_options);
   1780 	}
   1781 
   1782 	return 0;
   1783 }
   1784