      1 /**
      2  * @file oprofile.c
      3  * Main driver code
      4  *
      5  * @remark Copyright 2002 OProfile authors
      6  * @remark Read the file COPYING
      7  *
      8  * @author John Levon
      9  * @author Philippe Elie
     10  */
     11 
     12 #include "oprofile.h"
     13 #include "op_util.h"
     14 #include "config.h"
     15 
     16 EXPORT_NO_SYMBOLS;
     17 
     18 MODULE_AUTHOR("John Levon (levon (at) movementarian.org)");
     19 MODULE_DESCRIPTION("Continuous Profiling Module");
     20 MODULE_LICENSE("GPL");
     21 
     22 MODULE_PARM(allow_unload, "i");
     23 MODULE_PARM_DESC(allow_unload, "Allow module to be unloaded.");
     24 #ifdef CONFIG_SMP
     25 static int allow_unload;
     26 #else
     27 static int allow_unload = 1;
     28 #endif
     29 
     30 /* sysctl settables */
     31 struct oprof_sysctl sysctl_parms;
      32 /* some of the sysctl-settable variables need to be copied, to protect
      33  * against users who try to change running parameters through
      34  * /proc/sys/dev/oprofile/ while profiling is in progress */
     35 struct oprof_sysctl sysctl;
     36 
     37 static enum oprof_state state __cacheline_aligned_in_smp = STOPPED;
     38 
     39 static int op_major;
     40 
     41 static volatile ulong oprof_opened __cacheline_aligned_in_smp;
     42 static volatile ulong oprof_note_opened __cacheline_aligned_in_smp;
     43 static DECLARE_WAIT_QUEUE_HEAD(oprof_wait);
     44 
     45 static u32 oprof_ready[NR_CPUS] __cacheline_aligned_in_smp;
     46 struct _oprof_data oprof_data[NR_CPUS] __cacheline_aligned;
     47 
     48 struct op_note * note_buffer __cacheline_aligned_in_smp;
     49 u32 note_pos __cacheline_aligned_in_smp;
     50 
     51 // the interrupt handler ops structure to use
     52 static struct op_int_operations const * int_ops;
     53 
     54 static char const * op_version = PACKAGE " " VERSION;
     55 
     56 /* ---------------- interrupt entry routines ------------------ */
     57 
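         /* true when this CPU's sample buffer has filled up to its watermark
          * and no wake-up of the daemon has been initiated yet */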
     58 inline static int need_wakeup(uint cpu, struct _oprof_data * data)
     59 {
     60 	return data->nextbuf >= (data->buf_size - data->buf_watermark) && !oprof_ready[cpu];
     61 }
     62 
     63 inline static void next_sample(struct _oprof_data * data)
     64 {
     65 	if (unlikely(++data->nextbuf == data->buf_size))
     66 		data->nextbuf = 0;
     67 }
     68 
     69 inline static void evict_op_entry(uint cpu, struct _oprof_data * data, long irq_enabled)
     70 {
     71 	next_sample(data);
     72 	if (likely(!need_wakeup(cpu, data)))
     73 		return;
     74 
     75 	/* locking rationale :
     76 	 *
     77 	 * other CPUs are not a race concern since we synch on oprof_wait->lock.
     78 	 *
     79 	 * for the current CPU, we might have interrupted another user of e.g.
     80 	 * runqueue_lock, deadlocking on SMP and racing on UP. So we check that IRQs
      81 	 * were not disabled (corresponding to the irqsave/restores in __wake_up()).
     82 	 *
     83 	 * Note that this requires all spinlocks taken by the full wake_up path
     84 	 * to have saved IRQs - otherwise we can interrupt whilst holding a spinlock
     85 	 * taken from some non-wake_up() path and deadlock. Currently this means only
     86 	 * oprof_wait->lock and runqueue_lock: all instances disable IRQs before
     87 	 * taking the lock.
     88 	 *
     89 	 * This will mean that approaching the end of the buffer, a number of the
     90 	 * evictions may fail to wake up the daemon. We simply hope this doesn't
     91 	 * take long; a pathological case could cause buffer overflow.
     92 	 *
     93 	 * Note that we use oprof_ready as our flag for whether we have initiated a
     94 	 * wake-up. Once the wake-up is received, the flag is reset as well as
     95 	 * data->nextbuf, preventing multiple wakeups.
     96 	 *
     97 	 * On 2.2, a global waitqueue_lock is used, so we must check it's not held
     98 	 * by the current CPU. We make sure that any users of the wait queue (i.e.
     99 	 * us and the code for wait_event_interruptible()) disable interrupts so it's
    100 	 * still safe to check IF_MASK.
    101 	 */
    102 	if (likely(irq_enabled)) {
    103 		oprof_ready[cpu] = 1;
    104 		wake_up(&oprof_wait);
    105 	}
    106 }
    107 
    108 inline static void
    109 fill_op_entry(struct op_sample * ops, long eip, pid_t pid, pid_t tgid, int ctr)
    110 {
    111 	ops->eip = eip;
    112 	ops->pid = pid;
    113 	ops->tgid = tgid;
    114 	ops->counter = ctr;
    115 }
    116 
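         /* record one sample for this CPU: called from the interrupt handler
          * with the interrupted PC, whether IRQs were enabled at the time,
          * and the counter number the sample belongs to */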
    117 void op_do_profile(uint cpu, long eip, long irq_enabled, int ctr)
    118 {
    119 	struct _oprof_data * data = &oprof_data[cpu];
    120 	pid_t const pid = current->pid;
    121 	pid_t const tgid = op_get_tgid();
    122 	struct op_sample * samples = &data->buffer[data->nextbuf];
    123 
    124 	data->nr_irq++;
    125 
    126 	fill_op_entry(samples, eip, pid, tgid, ctr);
    127 	evict_op_entry(cpu, data, irq_enabled);
    128 }
    129 
    130 /* ---------------- driver routines ------------------ */
    131 
     132 /* only stop and start the profiling interrupt when we are
     133  * fully running!
     134  */
    135 static void stop_cpu_perfctr(int cpu)
    136 {
    137 	if (state == RUNNING)
    138 		int_ops->stop_cpu(cpu);
    139 }
    140 
    141 static void start_cpu_perfctr(int cpu)
    142 {
    143 	if (state == RUNNING)
    144 		int_ops->start_cpu(cpu);
    145 }
    146 
    147 spinlock_t note_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
     148 /* which buffer nr. is waiting to be read? */
    149 int cpu_buffer_waiting;
    150 
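         /* check whether any CPU has a buffer ready to be read; if so, note
          * which one in cpu_buffer_waiting */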
    151 static int is_ready(void)
    152 {
    153 	uint cpu_nr;
    154 	for (cpu_nr = 0 ; cpu_nr < smp_num_cpus; cpu_nr++) {
    155 		if (oprof_ready[cpu_nr]) {
    156 			cpu_buffer_waiting = cpu_nr;
    157 			return 1;
    158 		}
    159 	}
    160 	return 0;
    161 }
    162 
    163 inline static void up_and_check_note(void)
    164 {
    165 	note_pos++;
    166 	if (likely(note_pos < (sysctl.note_size - OP_PRE_NOTE_WATERMARK(sysctl.note_size)) && !is_ready()))
    167 		return;
    168 
    169 	/* if we reach the end of the buffer, just pin
    170 	 * to the last entry until it is read. This loses
    171 	 * notes, but we have no choice. */
    172 	if (unlikely(note_pos == sysctl.note_size)) {
    173 		static int warned;
    174 		if (!warned) {
    175 			printk(KERN_WARNING "note buffer overflow: restart "
    176 			       "oprofile with a larger note buffer.\n");
    177 			warned = 1;
    178 		}
    179 		sysctl.nr_note_buffer_overflow++;
    180 		note_pos = sysctl.note_size - 1;
    181 	}
    182 
    183 	/* we just use cpu 0 as a convenient one to wake up */
    184 	oprof_ready[0] = 2;
    185 	oprof_wake_up(&oprof_wait);
    186 }
    187 
     188 /* caller must hold note_lock */
    189 void __oprof_put_note(struct op_note * onote)
    190 {
    191 	/* ignore note if we're not up and running fully */
    192 	if (state != RUNNING)
    193 		return;
    194 
    195 	memcpy(&note_buffer[note_pos], onote, sizeof(struct op_note));
    196 	up_and_check_note();
    197 }
    198 
    199 void oprof_put_note(struct op_note * onote)
    200 {
    201 	spin_lock(&note_lock);
    202 	__oprof_put_note(onote);
    203 	spin_unlock(&note_lock);
    204 }
    205 
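         /* read of the note device: snapshot the note buffer into a temporary
          * buffer under note_lock, reset it, then copy to userspace outside
          * the lock */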
    206 static ssize_t oprof_note_read(char * buf, size_t count, loff_t * ppos)
    207 {
    208 	struct op_note * mybuf;
    209 	uint num;
    210 	ssize_t max;
    211 
    212 	max = sizeof(struct op_note) * sysctl.note_size;
    213 
    214 	if (*ppos || count != max)
    215 		return -EINVAL;
    216 
    217 	mybuf = vmalloc(max);
    218 	if (!mybuf)
    219 		return -EFAULT;
    220 
    221 	spin_lock(&note_lock);
    222 
    223 	num = note_pos;
    224 
    225 	count = note_pos * sizeof(struct op_note);
    226 
    227 	if (count)
    228 		memcpy(mybuf, note_buffer, count);
    229 
    230 	note_pos = 0;
    231 
    232 	spin_unlock(&note_lock);
    233 
    234 	if (count && copy_to_user(buf, mybuf, count))
    235 		count = -EFAULT;
    236 
    237 	vfree(mybuf);
    238 	return count;
    239 }
    240 
    241 static int oprof_note_open(void)
    242 {
    243 	if (test_and_set_bit(0, &oprof_note_opened))
    244 		return -EBUSY;
    245 	INC_USE_COUNT_MAYBE;
    246 	return 0;
    247 }
    248 
    249 static int oprof_note_release(void)
    250 {
    251 	BUG_ON(!oprof_note_opened);
    252 	clear_bit(0, &oprof_note_opened);
    253 	DEC_USE_COUNT_MAYBE;
    254 	return 0;
    255 }
    256 
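         /* work out how many samples to hand to the daemon; if nextbuf is below
          * the watermark on a watermark wake-up (rather than a forced dump), the
          * buffer must have wrapped, so record the overflow and return the whole
          * buffer */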
    257 static int check_buffer_amount(int cpu_nr)
    258 {
    259 	struct _oprof_data * data = &oprof_data[cpu_nr];
    260 	int size = data->buf_size;
    261 	int num = data->nextbuf;
    262 	if (num < size - data->buf_watermark && oprof_ready[cpu_nr] != 2) {
    263 		printk(KERN_WARNING "oprofile: Detected overflow of size %d. "
    264 		       "You must increase the module buffer size with\n"
     265 		       "opcontrol --setup --buffer-size= or reduce the "
    266 		       "interrupt frequency\n", num);
    267 		data->nr_buffer_overflow += num;
    268 		num = size;
    269 	} else
    270 		data->nextbuf = 0;
    271 	return num;
    272 }
    273 
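         /* copy one CPU's buffer header and samples out to the daemon, with
          * that CPU's profiling interrupt stopped for the duration of the copy */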
    274 static int copy_buffer(char * buf, int cpu_nr)
    275 {
    276 	struct op_buffer_head head;
    277 	int ret = -EFAULT;
    278 
    279 	stop_cpu_perfctr(cpu_nr);
    280 
    281 	head.cpu_nr = cpu_nr;
    282 	head.count = check_buffer_amount(cpu_nr);
    283 	head.state = state;
    284 
    285 	oprof_ready[cpu_nr] = 0;
    286 
    287 	if (copy_to_user(buf, &head, sizeof(struct op_buffer_head)))
    288 		goto out;
    289 
    290 	if (head.count) {
    291 		size_t const size = head.count * sizeof(struct op_sample);
    292 		if (copy_to_user(buf + sizeof(struct op_buffer_head),
    293 			oprof_data[cpu_nr].buffer, size))
    294 			goto out;
    295 		ret = size + sizeof(struct op_buffer_head);
    296 	} else {
    297 		ret = sizeof(struct op_buffer_head);
    298 	}
    299 
    300 out:
    301 	start_cpu_perfctr(cpu_nr);
    302 	return ret;
    303 }
    304 
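         /* read of the main device (minor 0) blocks until a CPU buffer is
          * ready and then returns it; minor 2 reads the note buffer instead */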
    305 static ssize_t oprof_read(struct file * file, char * buf, size_t count, loff_t * ppos)
    306 {
    307 	ssize_t max;
    308 
    309 	if (!capable(CAP_SYS_PTRACE))
    310 		return -EPERM;
    311 
    312 	switch (MINOR(file->f_dentry->d_inode->i_rdev)) {
    313 		case 2: return oprof_note_read(buf, count, ppos);
    314 		case 0: break;
    315 		default: return -EINVAL;
    316 	}
    317 
    318 	max = sizeof(struct op_buffer_head) + sizeof(struct op_sample) * sysctl.buf_size;
    319 
    320 	if (*ppos || count != max)
    321 		return -EINVAL;
    322 
    323 	switch (state) {
    324 		case RUNNING:
    325 			wait_event_interruptible(oprof_wait, is_ready());
    326 			if (signal_pending(current))
    327 				return -EINTR;
    328 			break;
    329 
    330 		/* Non-obvious. If O_NONBLOCK is set, that means
    331 		 * the daemon knows it has to quit and is asking
    332 		 * for final buffer data. If it's not set, then we
    333 		 * have just transitioned to STOPPING, and we must
    334 		 * inform the daemon (which we can do just by a normal
    335 		 * operation).
    336 		 */
    337 		case STOPPING: {
    338 			int cpu;
    339 
    340 			if (!(file->f_flags & O_NONBLOCK))
    341 				break;
    342 
    343 			for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
    344 				if (oprof_data[cpu].nextbuf) {
    345 					cpu_buffer_waiting = cpu;
    346 					oprof_ready[cpu] = 2;
    347 					break;
    348 				}
    349 			}
    350 
    351 			if (cpu == smp_num_cpus)
    352 				return -EAGAIN;
    353 
    354 		}
    355 			break;
    356 
    357 		case STOPPED: BUG();
    358 	}
    359 
    360 	return copy_buffer(buf, cpu_buffer_waiting);
    361 }
    362 
    363 
    364 static int oprof_start(void);
    365 static int oprof_stop(void);
    366 
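         /* open dispatches on minor number: 1 is the hash map, 2 the note
          * device, and 0 the main sample device, which starts profiling */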
    367 static int oprof_open(struct inode * ino, struct file * file)
    368 {
    369 	int err;
    370 
    371 	if (!capable(CAP_SYS_PTRACE))
    372 		return -EPERM;
    373 
    374 	switch (MINOR(file->f_dentry->d_inode->i_rdev)) {
    375 		case 1: return oprof_hash_map_open();
    376 		case 2: return oprof_note_open();
    377 		case 0:
    378 			/* make sure the other devices are open */
    379 			if (is_map_ready())
    380 				break;
    381 		default:
    382 			return -EINVAL;
    383 	}
    384 
    385 	if (test_and_set_bit(0, &oprof_opened))
    386 		return -EBUSY;
    387 
    388 	err = oprof_start();
    389 	if (err)
    390 		clear_bit(0, &oprof_opened);
    391 	return err;
    392 }
    393 
    394 static int oprof_release(struct inode * ino, struct file * file)
    395 {
    396 	switch (MINOR(file->f_dentry->d_inode->i_rdev)) {
    397 		case 1: return oprof_hash_map_release();
    398 		case 2: return oprof_note_release();
    399 		case 0: break;
    400 		default: return -EINVAL;
    401 	}
    402 
    403 	BUG_ON(!oprof_opened);
    404 
    405 	clear_bit(0, &oprof_opened);
    406 
    407 	// FIXME: is this safe when I kill -9 the daemon ?
    408 	return oprof_stop();
    409 }
    410 
    411 static int oprof_mmap(struct file * file, struct vm_area_struct * vma)
    412 {
    413 	if (MINOR(file->f_dentry->d_inode->i_rdev) == 1)
    414 		return oprof_hash_map_mmap(file, vma);
    415 	return -EINVAL;
    416 }
    417 
    418 /* called under spinlock, cannot sleep */
    419 static void oprof_free_mem(uint num)
    420 {
    421 	uint i;
    422 	for (i=0; i < num; i++) {
    423 		if (oprof_data[i].buffer)
    424 			vfree(oprof_data[i].buffer);
    425 		oprof_data[i].buffer = NULL;
    426 	}
    427 	vfree(note_buffer);
    428 	note_buffer = NULL;
    429 }
    430 
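         /* allocate the note buffer and one sample buffer per CPU, sized
          * according to the current sysctl settings */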
    431 static int oprof_init_data(void)
    432 {
    433 	uint i, notebufsize;
    434 	ulong buf_size;
    435 	struct _oprof_data * data;
    436 
    437 	sysctl.nr_note_buffer_overflow = 0;
    438 	notebufsize = sizeof(struct op_note) * sysctl.note_size;
    439 	note_buffer = vmalloc(notebufsize);
    440  	if (!note_buffer) {
    441 		printk(KERN_ERR "oprofile: failed to allocate note buffer of %u bytes\n",
    442 			notebufsize);
    443 		return -EFAULT;
    444 	}
    445 	note_pos = 0;
    446 
     447 	// initialise each CPU's data to a safe state so a partial failure can be cleaned up
    448 	for (i = 0; i < smp_num_cpus; ++i) {
    449 		data = &oprof_data[i];
    450 		data->buf_size = 0;
    451 		data->buffer = 0;
    452 		data->buf_watermark = 0;
    453 		data->nr_buffer_overflow = 0;
    454 	}
    455 
    456 	buf_size = (sizeof(struct op_sample) * sysctl.buf_size);
    457 
    458 	for (i = 0 ; i < smp_num_cpus ; ++i) {
    459 		data = &oprof_data[i];
    460 
    461 		data->buffer = vmalloc(buf_size);
    462 		if (!data->buffer) {
    463 			printk(KERN_ERR "oprofile: failed to allocate eviction buffer of %lu bytes\n", buf_size);
    464 			oprof_free_mem(i);
    465 			return -EFAULT;
    466 		}
    467 
    468 		memset(data->buffer, 0, buf_size);
    469 
    470 		data->buf_size = sysctl.buf_size;
    471 		data->buf_watermark = OP_PRE_WATERMARK(data->buf_size);
    472 		data->nextbuf = 0;
    473 	}
    474 
    475 	return 0;
    476 }
    477 
    478 static int parms_check(void)
    479 {
    480 	int err;
    481 
    482 	if ((err = check_range(sysctl.buf_size, OP_MIN_BUF_SIZE, OP_MAX_BUF_SIZE,
    483 		"sysctl.buf_size value %d not in range (%d %d)\n")))
    484 		return err;
    485 	if ((err = check_range(sysctl.note_size, OP_MIN_NOTE_TABLE_SIZE, OP_MAX_NOTE_TABLE_SIZE,
    486 		"sysctl.note_size value %d not in range (%d %d)\n")))
    487 		return err;
    488 
    489 	if ((err = int_ops->check_params()))
    490 		return err;
    491 
    492 	return 0;
    493 }
    494 
    495 
    496 static DECLARE_MUTEX(sysctlsem);
    497 
    498 
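         /* start profiling: snapshot the sysctl parameters, allocate the
          * buffers, validate the parameters, intercept the relevant system
          * calls, start the interrupt mechanism and mark the profiler running */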
    499 static int oprof_start(void)
    500 {
    501 	int err = 0;
    502 
    503 	down(&sysctlsem);
    504 
     505 	/* save the sysctl-settable things to protect the profiler parameters
     506 	 * against changes made through sysctl while we are running */
    507 	sysctl_parms.cpu_type = sysctl.cpu_type;
    508 	sysctl = sysctl_parms;
    509 
    510 	if ((err = oprof_init_data()))
    511 		goto out;
    512 
    513 	if ((err = parms_check())) {
    514 		oprof_free_mem(smp_num_cpus);
    515 		goto out;
    516 	}
    517 
    518 	if ((err = int_ops->setup())) {
    519 		oprof_free_mem(smp_num_cpus);
    520 		goto out;
    521 	}
    522 
    523 	op_intercept_syscalls();
    524 
    525 	int_ops->start();
    526 
    527 	state = RUNNING;
    528 
    529 out:
    530 	up(&sysctlsem);
    531 	return err;
    532 }
    533 
    534 /*
    535  * stop interrupts being generated and notes arriving.
    536  * This is idempotent.
    537  */
    538 static void oprof_partial_stop(void)
    539 {
    540 	BUG_ON(state == STOPPED);
    541 
    542 	if (state == RUNNING) {
    543 		op_restore_syscalls();
    544 		int_ops->stop();
    545 	}
    546 
    547 	state = STOPPING;
    548 }
    549 
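         /* called on final release of the main device: stop the interrupt
          * source, reset the per-CPU state and free all buffers */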
    550 static int oprof_stop(void)
    551 {
    552 	uint i;
    553 	// FIXME: err not needed
    554 	int err = -EINVAL;
    555 
    556 	down(&sysctlsem);
    557 
    558 	BUG_ON(state == STOPPED);
    559 
     560 	/* here we need to:
    561 	 * bring back the old system calls
    562 	 * stop the perf counter
    563 	 * bring back the old NMI handler
    564 	 * reset the map buffer stuff and ready values
    565 	 *
    566 	 * Nothing will be able to write into the map buffer because
    567 	 * we synchronise via the spinlocks
    568 	 */
    569 
    570 	oprof_partial_stop();
    571 
    572 	spin_lock(&note_lock);
    573 
    574 	for (i = 0 ; i < smp_num_cpus; i++) {
    575 		struct _oprof_data * data = &oprof_data[i];
    576 		oprof_ready[i] = 0;
    577 		data->nextbuf = 0;
    578 	}
    579 
    580 	oprof_free_mem(smp_num_cpus);
    581 
    582 	spin_unlock(&note_lock);
    583 	err = 0;
    584 
    585 	/* FIXME: can we really say this ? */
    586 	state = STOPPED;
    587 	up(&sysctlsem);
    588 	return err;
    589 }
    590 
    591 static struct file_operations oprof_fops = {
    592 #ifdef HAVE_FILE_OPERATIONS_OWNER
    593 	owner: THIS_MODULE,
    594 #endif
    595 	open: oprof_open,
    596 	release: oprof_release,
    597 	read: oprof_read,
    598 	mmap: oprof_mmap,
    599 };
    600 
    601 /*
    602  * /proc/sys/dev/oprofile/
    603  *                        bufsize
    604  *                        notesize
    605  *                        dump
    606  *                        dump_stop
    607  *                        nr_interrupts
    608  *                        #ctr/
    609  *                          event
    610  *                          enabled
    611  *                          count
    612  *                          unit_mask
    613  *                          kernel
    614  *                          user
    615  *
    616  * #ctr is in [0-1] for PPro core, [0-3] for Athlon core
    617  *
    618  */
    619 
     620 /* These access routines are basically not safe on SMP with respect to module
     621  * unload. There is nothing we can do about it - the API is broken - so we just
     622  * make a best-effort attempt. Note the sem is needed to prevent parms_check()
     623  * being bypassed during oprof_start().
     624  */
    625 
    626 static void lock_sysctl(void)
    627 {
    628 	MOD_INC_USE_COUNT;
    629 	down(&sysctlsem);
    630 }
    631 
    632 static void unlock_sysctl(void)
    633 {
    634 	up(&sysctlsem);
    635 	MOD_DEC_USE_COUNT;
    636 }
    637 
    638 static int get_nr_interrupts(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
    639 {
    640 	uint cpu;
    641 	int ret = -EINVAL;
    642 
    643 	lock_sysctl();
    644 
    645 	if (write)
    646 		goto out;
    647 
    648 	sysctl.nr_interrupts = 0;
    649 
    650 	for (cpu = 0 ; cpu < smp_num_cpus; cpu++) {
    651 		sysctl.nr_interrupts += oprof_data[cpu].nr_irq;
    652 		oprof_data[cpu].nr_irq = 0;
    653 	}
    654 
    655 	ret =  proc_dointvec(table, write, filp, buffer, lenp);
    656 out:
    657 	unlock_sysctl();
    658 	return ret;
    659 }
    660 
    661 static int get_nr_buffer_overflow(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
    662 {
    663 	uint cpu;
    664 	int ret = -EINVAL;
    665 
    666 	lock_sysctl();
    667 
    668 	if (write)
    669 		goto out;
    670 
    671 	for (cpu = 0 ; cpu < smp_num_cpus; cpu++) {
    672 		sysctl.nr_buffer_overflow += oprof_data[cpu].nr_buffer_overflow;
    673 		oprof_data[cpu].nr_buffer_overflow = 0;
    674 	}
    675 
    676 	ret =  proc_dointvec(table, write, filp, buffer, lenp);
    677 out:
    678 	unlock_sysctl();
    679 	return ret;
    680 }
    681 
    682 int lproc_dointvec(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
    683 {
    684 	int err;
    685 
    686 	lock_sysctl();
    687 	err = proc_dointvec(table, write, filp, buffer, lenp);
    688 	unlock_sysctl();
    689 
    690 	return err;
    691 }
    692 
    693 static void do_actual_dump(void)
    694 {
    695 	uint cpu;
    696 
    697 	for (cpu = 0 ; cpu < smp_num_cpus; cpu++)
    698 		oprof_ready[cpu] = 2;
    699 	oprof_wake_up(&oprof_wait);
    700 }
    701 
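         /* writing to /proc/sys/dev/oprofile/dump flags every CPU buffer as
          * ready and wakes the daemon so the current data gets flushed */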
    702 static int sysctl_do_dump(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
    703 {
    704 	int err = -EINVAL;
    705 
    706 	lock_sysctl();
    707 
    708 	if (state != RUNNING)
    709 		goto out;
    710 
    711 	if (!write) {
    712 		err = proc_dointvec(table, write, filp, buffer, lenp);
    713 		goto out;
    714 	}
    715 
    716 	do_actual_dump();
    717 
    718 	err = 0;
    719 out:
    720 	unlock_sysctl();
    721 	return err;
    722 }
    723 
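         /* dump_stop additionally stops profiling before the dump, so the
          * daemon can collect the final buffers and then exit */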
    724 static int sysctl_do_dump_stop(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
    725 {
    726 	int err = -EINVAL;
    727 
    728 	lock_sysctl();
    729 
    730 	if (state != RUNNING)
    731 		goto out;
    732 
    733 	if (!write) {
    734 		err = proc_dointvec(table, write, filp, buffer, lenp);
    735 		goto out;
    736 	}
    737 
    738 	oprof_partial_stop();
    739 
    740 	/* also wakes up daemon */
    741 	do_actual_dump();
    742 
    743 	err = 0;
    744 out:
    745 	unlock_sysctl();
    746 	return err;
    747 }
    748 
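         /* the first nr_oprof_static entries of oprof_table below are fixed;
          * the trailing zeroed entries are free slots filled in by
          * int_ops->add_sysctls() for the per-counter settings */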
    749 static int const nr_oprof_static = 8;
    750 
    751 static ctl_table oprof_table[] = {
    752 	{ 1, "bufsize", &sysctl_parms.buf_size, sizeof(int), 0644, NULL, &lproc_dointvec, NULL, },
    753 	{ 1, "dump", &sysctl_parms.dump, sizeof(int), 0666, NULL, &sysctl_do_dump, NULL, },
    754 	{ 1, "dump_stop", &sysctl_parms.dump_stop, sizeof(int), 0644, NULL, &sysctl_do_dump_stop, NULL, },
    755 	{ 1, "nr_interrupts", &sysctl.nr_interrupts, sizeof(int), 0444, NULL, &get_nr_interrupts, NULL, },
    756 	{ 1, "notesize", &sysctl_parms.note_size, sizeof(int), 0644, NULL, &lproc_dointvec, NULL, },
    757 	{ 1, "cpu_type", &sysctl.cpu_type, sizeof(int), 0444, NULL, &lproc_dointvec, NULL, },
    758 	{ 1, "note_buffer_overflow", &sysctl.nr_note_buffer_overflow, sizeof(int), 0444, NULL, &lproc_dointvec, NULL, },
    759 	{ 1, "buffer_overflow", &sysctl.nr_buffer_overflow, sizeof(int), 0444, NULL, &get_nr_buffer_overflow, NULL, },
    760 	{ 0, }, { 0, }, { 0, }, { 0, }, { 0, }, { 0, }, { 0, }, { 0, },
    761 	{ 0, },
    762 };
    763 
    764 static ctl_table oprof_root[] = {
    765 	{1, "oprofile", NULL, 0, 0755, oprof_table},
    766  	{0, },
    767 };
    768 
    769 static ctl_table dev_root[] = {
    770 	{CTL_DEV, "dev", NULL, 0, 0555, oprof_root},
    771 	{0, },
    772 };
    773 
    774 static struct ctl_table_header * sysctl_header;
    775 
    776 /* NOTE: we do *not* support sysctl() syscall */
    777 
    778 static int __init init_sysctl(void)
    779 {
    780 	int err = 0;
    781 	ctl_table * next = &oprof_table[nr_oprof_static];
    782 
     783 	/* these sysctl parms need sensible default values */
    784 	sysctl_parms.buf_size = OP_DEFAULT_BUF_SIZE;
    785 	sysctl_parms.note_size = OP_DEFAULT_NOTE_SIZE;
    786 
    787 	if ((err = int_ops->add_sysctls(next)))
    788 		return err;
    789 
    790 	sysctl_header = register_sysctl_table(dev_root, 0);
    791 	return err;
    792 }
    793 
    794 /* not safe to mark as __exit since used from __init code */
    795 static void cleanup_sysctl(void)
    796 {
    797 	ctl_table * next = &oprof_table[nr_oprof_static];
    798 	unregister_sysctl_table(sysctl_header);
    799 
    800 	int_ops->remove_sysctls(next);
    801 
    802 	return;
    803 }
    804 
    805 static int can_unload(void)
    806 {
    807 	int can = -EBUSY;
    808 	down(&sysctlsem);
    809 
    810 	if (allow_unload && state == STOPPED && !GET_USE_COUNT(THIS_MODULE))
    811 		can = 0;
    812 	up(&sysctlsem);
    813 	return can;
    814 }
    815 
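         /* module entry point: choose the interrupt implementation (falling
          * back to the RTC driver if hardware counter setup fails), then
          * register the sysctls, the character device and the hash map */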
    816 int __init oprof_init(void)
    817 {
    818 	int err = 0;
    819 
    820 	if (sysctl.cpu_type != CPU_RTC) {
    821 		int_ops = op_int_interface();
    822 
     823 		// try to init; fall back to the RTC driver if that fails
    824 		if ((err = int_ops->init())) {
    825 			int_ops = &op_rtc_ops;
    826 			if ((err = int_ops->init()))
    827 				return err;
    828 			sysctl.cpu_type = CPU_RTC;
    829 		}
    830 	} else {
    831 		int_ops = &op_rtc_ops;
    832 		if ((err = int_ops->init()))
    833 			return err;
    834 	}
    835 
    836 	if ((err = init_sysctl()))
    837 		goto out_err;
    838 
    839 	err = op_major = register_chrdev(0, "oprof", &oprof_fops);
    840 	if (err < 0)
    841 		goto out_err2;
    842 
    843 	err = oprof_init_hashmap();
    844 	if (err < 0) {
     845 		printk(KERN_ERR "oprofile: couldn't allocate hash map!\n");
    846 		unregister_chrdev(op_major, "oprof");
    847 		goto out_err2;
    848 	}
    849 
    850 	/* module might not be unloadable */
    851 	THIS_MODULE->can_unload = can_unload;
    852 
    853 	/* do this now so we don't have to track save/restores later */
    854 	op_save_syscalls();
    855 
    856 	printk(KERN_INFO "%s loaded, major %u\n", op_version, op_major);
    857 	return 0;
    858 
    859 out_err2:
    860 	cleanup_sysctl();
    861 out_err:
    862 	int_ops->deinit();
    863 	return err;
    864 }
    865 
    866 void __exit oprof_exit(void)
    867 {
    868 	oprof_free_hashmap();
    869 
    870 	unregister_chrdev(op_major, "oprof");
    871 
    872 	cleanup_sysctl();
    873 
    874 	int_ops->deinit();
    875 }
    876 
    877 /*
    878  * "The most valuable commodity I know of is information."
    879  *      - Gordon Gekko
    880  */
    881