/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

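/*
 * Compatibility shim for C libraries that lack the (non-standard)
 * on_exit() extension: handlers are kept in a bounded table and run
 * from a single atexit() hook, and exit() is wrapped in a macro so the
 * status code can be captured and passed to each handler.
 */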
#ifndef HAVE_ON_EXIT
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
#define exit(x) (exit)(__exitcode = (x))

static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

static void __handle_on_exit_funcs(void)
{
	int i;
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i](__exitcode, __on_exit_args[i]);
}
#endif

struct perf_record {
	struct perf_tool	tool;
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_cache;
	long			samples;
	off_t			post_processing_offset;
};

static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}

static int write_output(struct perf_record *rec, void *buf, size_t size)
{
	while (size) {
		int ret = write(rec->output, buf, size);

		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}

		size -= ret;
		buf += ret;

		rec->bytes_written += ret;
	}

	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	if (write_output(rec, event, event->header.size) < 0)
		return -1;

	return 0;
}

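/*
 * Drain one mmap'ed ring buffer into the output file. The kernel moves
 * the head as it produces samples; everything between our saved tail
 * (md->prev) and the head is written out (in two chunks when the data
 * wraps around the end of the buffer), and the new tail is then
 * published back so the kernel can reuse the space.
 */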
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);

out:
	return rc;
}

static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
}

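/*
 * Open one counter fd per event/cpu/thread combination. When the
 * kernel rejects an attribute, perf_evsel__fallback() may suggest a
 * usable alternative (e.g. a software clock event instead of a
 * hardware cycles event), in which case we simply retry the open.
 */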
static int perf_record__open(struct perf_record *rec)
{
	char msg[512];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose)
					ui__warning("%s\n", msg);
				goto try_again;
			}

			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.\n");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

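/*
 * Re-read the events we just recorded so that build-ids are gathered
 * only for the DSOs that actually got hits; perf_record__exit() then
 * rewrites the header with that table filled in.
 */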
static int process_buildids(struct perf_record *rec)
{
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)
		return;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

    286 
    287 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
    288 {
    289 	int err;
    290 	struct perf_tool *tool = data;
    291 	/*
    292 	 *As for guest kernel when processing subcommand record&report,
    293 	 *we arrange module mmap prior to guest kernel mmap and trigger
    294 	 *a preload dso because default guest module symbols are loaded
    295 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
    296 	 *method is used to avoid symbol missing when the first addr is
    297 	 *in module instead of in guest kernel.
    298 	 */
    299 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
    300 					     machine);
    301 	if (err < 0)
    302 		pr_err("Couldn't record guest kernel [%d]'s reference"
    303 		       " relocation symbol.\n", machine->pid);
    304 
    305 	/*
    306 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
    307 	 * have no _text sometimes.
    308 	 */
    309 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
    310 						 machine, "_text");
    311 	if (err < 0)
    312 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
    313 							 machine, "_stext");
    314 	if (err < 0)
    315 		pr_err("Couldn't record guest kernel [%d]'s reference"
    316 		       " relocation symbol.\n", machine->pid);
    317 }
    318 
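/*
 * PERF_RECORD_FINISHED_ROUND marks a point where every ring buffer has
 * been drained at least once, so a later pass (perf report) can safely
 * sort and flush all events that were buffered before the marker.
 */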
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int perf_record__mmap_read_all(struct perf_record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
}

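/*
 * The record session proper: open the output file, write the header,
 * synthesize events for pre-existing state (kernel, modules, threads),
 * then loop draining the ring buffers until the workload exits or a
 * signal asks us to stop.
 */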
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
#ifndef __APPLE__
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
	bool disabled = false;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);
	signal(SIGTERM, sig_handler);

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			char oldname[PATH_MAX];
			snprintf(oldname, sizeof(oldname), "%s.old",
				 output_name);
			unlink(oldname);
			rename(output_name, oldname);
		}
	}

	flags = O_CREAT|O_RDWR|O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    true, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, &opts->target,
						    argv, opts->pipe_output,
						    true);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	if (!evsel_list->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid &&
	    !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = &session->machines.host;

	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints, so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate the errors that currently end up
			 * calling die().
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	if (perf_target__has_task(&opts->target))
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else if (perf_target__has_cpu(&opts->target))
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);
	else /* command specified */
		err = 0;

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

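	/*
	 * Main capture loop: flush whatever the ring buffers hold, then
	 * block in poll() until the kernel signals new data. 'done' is
	 * set by the signal handler (SIGINT/SIGTERM/SIGCHLD).
	 */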
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/*
		 * When perf is starting the traced process, the events die
		 * with the process at the end and we wait for that. Thus no
		 * need to disable the events in this case.
		 */
		if (done && !disabled && !perf_target__none(&opts->target)) {
			perf_evlist__disable(evsel_list);
			disabled = true;
		}
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %lu times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
#else
	return -1;
#endif
}

#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

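/*
 * Example usage (illustrative, assuming hardware branch-stack support
 * such as Intel LBR):
 *
 *   perf record -j any_call,u -- ./workload   # user-space call branches
 *   perf record -b -- ./workload              # any taken branch (-j any)
 */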
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}

#ifdef LIBUNWIND_SUPPORT
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	do {
		if (*endptr)
			break;

		size = round_up(size, sizeof(u64));
		if (!size || size > max_size)
			break;

		*_size = size;
		return 0;

	} while (0);

	pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */

int record_parse_callchain(const char *arg, struct perf_record_opts *opts)
{
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* We need a buffer we know we can write to. */
	buf = strdup(arg);
	if (!buf)
		return -ENOMEM;

	tok = strtok_r(buf, ",", &saveptr);
	name = tok ? : buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				opts->call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

#ifdef LIBUNWIND_SUPPORT
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
			const unsigned long default_stack_dump_size = 8192;

			ret = 0;
			opts->call_graph = CALLCHAIN_DWARF;
			opts->stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				opts->stack_dump_size = size;
			}
#endif /* LIBUNWIND_SUPPORT */
		} else {
			pr_err("callchain: Unknown --call-graph option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);
	return ret;
}

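/*
 * Example usage (illustrative):
 *
 *   perf record -g -- ./workload                       # frame pointers
 *   perf record --call-graph dwarf,4096 -- ./workload
 *
 * The dwarf mode (only when built with libunwind support) dumps up to
 * 4096 bytes of user stack per sample and unwinds at report time.
 */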
static void callchain_debug(struct perf_record_opts *opts)
{
	pr_debug("callchain: type %d\n", opts->call_graph);

	if (opts->call_graph == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 opts->stack_dump_size);
}

int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	struct perf_record_opts *opts = opt->value;
	int ret;

	/* --no-call-graph */
	if (unset) {
		opts->call_graph = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = record_parse_callchain(arg, opts);
	if (!ret)
		callchain_debug(opts);

	return ret;
}

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct perf_record_opts *opts = opt->value;

	if (opts->call_graph == CALLCHAIN_NONE)
		opts->call_graph = CALLCHAIN_FP;

	callchain_debug(opts);
	return 0;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * perf_record__new, because we need access to it in perf_record__exit(),
 * which is called after cmd_record() exits, but since record_options needs
 * to be accessible to builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
};

#define CALLCHAIN_HELP "set up and enable call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "fp";
#endif

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "mode[,dump_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_END()
};

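/*
 * Typical invocations (illustrative):
 *
 *   perf record -F 1000 -a sleep 10       # whole system, 1 kHz, for 10s
 *   perf record -p 1234 -o pid.data       # attach to an existing pid
 *   perf record -e cache-misses ./prog    # a specific event for a workload
 */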
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new();
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_symbol_exit;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);

	perf_evlist__munmap(evsel_list);
	perf_evlist__close(evsel_list);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}