/**
* @file oprofile.c
* Main driver code
*
* @remark Copyright 2002 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon
* @author Philippe Elie
*/
#include "oprofile.h"
#include "op_util.h"
#include "config.h"
/* Module metadata: this module exports no symbols of its own. */
EXPORT_NO_SYMBOLS;
MODULE_AUTHOR("John Levon (levon@movementarian.org)");
MODULE_DESCRIPTION("Continuous Profiling Module");
MODULE_LICENSE("GPL");
/* "allow_unload" is an integer ("i") module parameter. */
MODULE_PARM(allow_unload, "i");
MODULE_PARM_DESC(allow_unload, "Allow module to be unloaded.");
/* Default for allow_unload: 0 when built with CONFIG_SMP, 1 otherwise.
 * The flag is consulted by can_unload(). */
#ifdef CONFIG_SMP
static int allow_unload;
#else
static int allow_unload = 1;
#endif
/* sysctl settables: the values the entries in oprof_table write into */
struct oprof_sysctl sysctl_parms;
/* Working copy of the sysctl-settable values. It is taken from
 * sysctl_parms in oprof_start(), which protects a running profiling
 * session against users changing the parameters through
 * /proc/sys/dev/oprofile/ while profiling is in progress. */
struct oprof_sysctl sysctl;
/* current driver state: STOPPED, RUNNING or STOPPING */
static enum oprof_state state __cacheline_aligned_in_smp = STOPPED;
/* character device major number returned by register_chrdev() */
static int op_major;
/* bit 0 of each word is a "device is open" flag (single opener only) */
static volatile ulong oprof_opened __cacheline_aligned_in_smp;
static volatile ulong oprof_note_opened __cacheline_aligned_in_smp;
/* wait queue oprof_read() sleeps on until is_ready() reports a buffer */
static DECLARE_WAIT_QUEUE_HEAD(oprof_wait);
/* per-CPU "buffer ready" flag: 0 = nothing pending, 1 = set by
 * evict_op_entry() at the watermark, 2 = set for dumps and note wake-ups */
static u32 oprof_ready[NR_CPUS] __cacheline_aligned_in_smp;
/* per-CPU sample buffer bookkeeping */
struct _oprof_data oprof_data[NR_CPUS] __cacheline_aligned;
/* note buffer and the index of its next free entry */
struct op_note * note_buffer __cacheline_aligned_in_smp;
u32 note_pos __cacheline_aligned_in_smp;
// the interrupt handler ops structure to use
static struct op_int_operations const * int_ops;
/* version banner printed when the module loads */
static char const * op_version = PACKAGE " " VERSION;
/* ---------------- interrupt entry routines ------------------ */
/* Tell whether the reader should be woken for this CPU: the write position
 * has reached the watermark zone at the end of the buffer and no wake-up is
 * already flagged in oprof_ready[cpu]. Returns 1 if so, 0 otherwise. */
inline static int need_wakeup(uint cpu, struct _oprof_data * data)
{
	/* still below the watermark: nothing to do */
	if (data->nextbuf < (data->buf_size - data->buf_watermark))
		return 0;

	return !oprof_ready[cpu];
}
/* Move the write position to the next sample slot, wrapping back to slot 0
 * when the end of the buffer is reached. */
inline static void next_sample(struct _oprof_data * data)
{
	data->nextbuf++;

	if (unlikely(data->nextbuf == data->buf_size))
		data->nextbuf = 0;
}
/* Advance this CPU's buffer position after a sample has been stored and,
 * once need_wakeup() reports that the watermark was reached, flag the
 * buffer as ready and wake the reader. The wake-up is skipped when
 * irq_enabled is zero; see the locking rationale below. */
inline static void evict_op_entry(uint cpu, struct _oprof_data * data, long irq_enabled)
{
next_sample(data);
if (likely(!need_wakeup(cpu, data)))
return;
/* locking rationale :
 *
 * other CPUs are not a race concern since we synchronise on oprof_wait->lock.
 *
 * for the current CPU, we might have interrupted another user of e.g.
 * runqueue_lock, deadlocking on SMP and racing on UP. So we check that IRQs
 * were not disabled (corresponding to the irqsave/restores in __wake_up()).
 *
 * Note that this requires all spinlocks taken by the full wake_up path
 * to have saved IRQs - otherwise we can interrupt whilst holding a spinlock
 * taken from some non-wake_up() path and deadlock. Currently this means only
 * oprof_wait->lock and runqueue_lock: all instances disable IRQs before
 * taking the lock.
 *
 * This will mean that approaching the end of the buffer, a number of the
 * evictions may fail to wake up the daemon. We simply hope this doesn't
 * take long; a pathological case could cause buffer overflow.
 *
 * Note that we use oprof_ready as our flag for whether we have initiated a
 * wake-up. Once the wake-up is received, the flag is reset as well as
 * data->nextbuf, preventing multiple wakeups.
 *
 * On 2.2, a global waitqueue_lock is used, so we must check it's not held
 * by the current CPU. We make sure that any users of the wait queue (i.e.
 * us and the code for wait_event_interruptible()) disable interrupts so it's
 * still safe to check IF_MASK.
 */
if (likely(irq_enabled)) {
/* flag first, so need_wakeup() stops asking for further wake-ups */
oprof_ready[cpu] = 1;
wake_up(&oprof_wait);
}
}
/* Store one sample into *ops: the sampled instruction pointer, the pid and
 * tgid it is attributed to, and the counter number that produced it. */
inline static void
fill_op_entry(struct op_sample * ops, long eip, pid_t pid, pid_t tgid, int ctr)
{
ops->eip = eip;
ops->pid = pid;
ops->tgid = tgid;
ops->counter = ctr;
}
/* Record one sample for the given CPU.
 *
 * cpu:         index into oprof_data[] of the CPU taking the sample
 * eip:         sampled instruction pointer
 * irq_enabled: non-zero if the interrupted context had IRQs enabled;
 *              passed on to evict_op_entry() to decide whether a wake-up
 *              is safe
 * ctr:         counter number that triggered the sample
 *
 * The sample is attributed to the current task's pid and tgid. */
void op_do_profile(uint cpu, long eip, long irq_enabled, int ctr)
{
struct _oprof_data * data = &oprof_data[cpu];
pid_t const pid = current->pid;
pid_t const tgid = op_get_tgid();
/* slot at the current write position; evict_op_entry() advances it */
struct op_sample * samples = &data->buffer[data->nextbuf];
data->nr_irq++;
fill_op_entry(samples, eip, pid, tgid, ctr);
evict_op_entry(cpu, data, irq_enabled);
}
/* ---------------- driver routines ------------------ */
/* The profiling interrupt of a single CPU is only stopped (and restarted,
 * see start_cpu_perfctr()) while the driver is fully running; in any other
 * state this is a no-op.
 */
static void stop_cpu_perfctr(int cpu)
{
	if (state != RUNNING)
		return;

	int_ops->stop_cpu(cpu);
}
/* Restart the profiling interrupt of a single CPU; does nothing unless the
 * driver is in the RUNNING state. */
static void start_cpu_perfctr(int cpu)
{
	if (state != RUNNING)
		return;

	int_ops->start_cpu(cpu);
}
/* lock taken around accesses to note_buffer / note_pos */
spinlock_t note_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/* which buffer nr. is waiting to be read ? Set by is_ready() and by
 * oprof_read(), and passed to copy_buffer() by oprof_read(). */
int cpu_buffer_waiting;
/* Look for a CPU whose buffer is flagged ready. The lowest-numbered such
 * CPU is remembered in cpu_buffer_waiting and 1 is returned; when no CPU is
 * flagged, cpu_buffer_waiting is left untouched and 0 is returned. */
static int is_ready(void)
{
	uint cpu;

	for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
		if (!oprof_ready[cpu])
			continue;

		cpu_buffer_waiting = cpu;
		return 1;
	}

	return 0;
}
/* Account for a note that was just stored at note_pos and wake the reader
 * when the note buffer is close to full or when a CPU buffer is already
 * flagged ready.
 * NOTE(review): presumably called with note_lock held - its only caller in
 * this file is __oprof_put_note(); confirm against other callers. */
inline static void up_and_check_note(void)
{
note_pos++;
/* common case: below the note watermark and no CPU buffer pending.
 * Note that is_ready() also updates cpu_buffer_waiting when it finds one. */
if (likely(note_pos < (sysctl.note_size - OP_PRE_NOTE_WATERMARK(sysctl.note_size)) && !is_ready()))
return;
/* if we reach the end of the buffer, just pin
 * to the last entry until it is read. This loses
 * notes, but we have no choice. */
if (unlikely(note_pos == sysctl.note_size)) {
/* only warn once per module load */
static int warned;
if (!warned) {
printk(KERN_WARNING "note buffer overflow: restart "
"oprofile with a larger note buffer.\n");
warned = 1;
}
sysctl.nr_note_buffer_overflow++;
note_pos = sysctl.note_size - 1;
}
/* we just use cpu 0 as a convenient one to wake up */
oprof_ready[0] = 2;
oprof_wake_up(&oprof_wait);
}
/* if holding note_lock */
void __oprof_put_note(struct op_note * onote)
{
/* ignore note if we're not up and running fully */
if (state != RUNNING)
return;
memcpy(¬e_buffer[note_pos], onote, sizeof(struct op_note));
up_and_check_note();
}
void oprof_put_note(struct op_note * onote)
{
spin_lock(¬e_lock);
__oprof_put_note(onote);
spin_unlock(¬e_lock);
}
/* Hand the accumulated notes to user space and empty the note buffer.
 *
 * buf:   user-space destination
 * count: must be exactly sizeof(struct op_note) * sysctl.note_size
 * ppos:  file position, must be 0
 *
 * The notes are first snapshotted into a temporary buffer under note_lock,
 * and copy_to_user() is only called after the lock has been dropped.
 *
 * Returns the number of bytes copied (possibly 0), -EINVAL for a bad
 * count/position, -ENOMEM if the temporary buffer cannot be allocated, or
 * -EFAULT if the copy to user space fails. In the -EFAULT case the notes
 * have already been removed from the note buffer. */
static ssize_t oprof_note_read(char * buf, size_t count, loff_t * ppos)
{
	struct op_note * mybuf;
	size_t const max = sizeof(struct op_note) * sysctl.note_size;
	ssize_t ret;

	if (*ppos || count != max)
		return -EINVAL;

	mybuf = vmalloc(max);
	if (!mybuf)
		return -ENOMEM;

	spin_lock(&note_lock);
	count = note_pos * sizeof(struct op_note);
	if (count)
		memcpy(mybuf, note_buffer, count);
	note_pos = 0;
	spin_unlock(&note_lock);

	ret = count;
	if (count && copy_to_user(buf, mybuf, count))
		ret = -EFAULT;

	vfree(mybuf);
	return ret;
}
/* Open handler for the note device. Only one opener is allowed at a time:
 * bit 0 of oprof_note_opened is the "open" flag. Returns 0 on success or
 * -EBUSY if the device is already open. */
static int oprof_note_open(void)
{
	int const already_open = test_and_set_bit(0, &oprof_note_opened);

	if (already_open)
		return -EBUSY;

	INC_USE_COUNT_MAYBE;
	return 0;
}
/* Release handler for the note device: clears the "open" flag set by
 * oprof_note_open() and undoes its use-count change. Always returns 0. */
static int oprof_note_release(void)
{
/* releasing a device that was never opened is a driver bug */
BUG_ON(!oprof_note_opened);
clear_bit(0, &oprof_note_opened);
DEC_USE_COUNT_MAYBE;
return 0;
}
/* Work out how many samples of the given CPU's buffer should be handed to
 * the reader.
 *
 * If the write position is below the watermark zone and the buffer was not
 * flagged with the value 2 (used for dumps and note wake-ups), the buffer
 * is treated as overflowed: a warning is logged, the write position is
 * added to the CPU's overflow counter, the whole buffer (buf_size entries)
 * is reported and the write position is left where it is. Otherwise the
 * write position is reported and reset to 0.
 *
 * Returns the number of samples to copy. */
static int check_buffer_amount(int cpu_nr)
{
	struct _oprof_data * data = &oprof_data[cpu_nr];
	int size = data->buf_size;
	int num = data->nextbuf;

	if (num < size - data->buf_watermark && oprof_ready[cpu_nr] != 2) {
		printk(KERN_WARNING "oprofile: Detected overflow of size %d. "
			"You must increase the module buffer size with\n"
			"opcontrol --setup --buffer-size= or reduce the "
			"interrupt frequency\n", num);
		data->nr_buffer_overflow += num;
		num = size;
	} else
		data->nextbuf = 0;

	return num;
}
/* Copy one CPU's sample buffer to user space, preceded by an
 * op_buffer_head describing it.
 *
 * buf:    user-space destination
 * cpu_nr: CPU whose buffer is copied
 *
 * The CPU's profiling interrupt is stopped for the duration of the copy
 * (only when the driver is RUNNING) and restarted on every exit path.
 * Returns the number of bytes written (header plus samples), or -EFAULT if
 * a copy to user space fails. */
static int copy_buffer(char * buf, int cpu_nr)
{
struct op_buffer_head head;
int ret = -EFAULT;
stop_cpu_perfctr(cpu_nr);
head.cpu_nr = cpu_nr;
head.count = check_buffer_amount(cpu_nr);
head.state = state;
/* the pending wake-up for this CPU is now being serviced */
oprof_ready[cpu_nr] = 0;
if (copy_to_user(buf, &head, sizeof(struct op_buffer_head)))
goto out;
if (head.count) {
size_t const size = head.count * sizeof(struct op_sample);
/* samples go right after the header */
if (copy_to_user(buf + sizeof(struct op_buffer_head),
oprof_data[cpu_nr].buffer, size))
goto out;
ret = size + sizeof(struct op_buffer_head);
} else {
ret = sizeof(struct op_buffer_head);
}
out:
start_cpu_perfctr(cpu_nr);
return ret;
}
/* read() handler shared by the oprofile devices.
 *
 * Requires CAP_SYS_PTRACE (-EPERM otherwise). Minor 2 is forwarded to
 * oprof_note_read(); minor 0 reads a sample buffer; any other minor gives
 * -EINVAL. For minor 0, count must equal the size of an op_buffer_head plus
 * sysctl.buf_size samples and *ppos must be 0, else -EINVAL.
 *
 * While RUNNING the call sleeps until is_ready() reports a buffer, and
 * returns -EINTR if a signal is pending afterwards. While STOPPING, see the
 * comment inside the switch. On success the result of copy_buffer() for
 * cpu_buffer_waiting is returned. */
static ssize_t oprof_read(struct file * file, char * buf, size_t count, loff_t * ppos)
{
ssize_t max;
if (!capable(CAP_SYS_PTRACE))
return -EPERM;
switch (MINOR(file->f_dentry->d_inode->i_rdev)) {
case 2: return oprof_note_read(buf, count, ppos);
case 0: break;
default: return -EINVAL;
}
/* only whole-buffer reads from offset 0 are accepted */
max = sizeof(struct op_buffer_head) + sizeof(struct op_sample) * sysctl.buf_size;
if (*ppos || count != max)
return -EINVAL;
switch (state) {
case RUNNING:
/* is_ready() records the ready CPU in cpu_buffer_waiting */
wait_event_interruptible(oprof_wait, is_ready());
if (signal_pending(current))
return -EINTR;
break;
/* Non-obvious. If O_NONBLOCK is set, that means
 * the daemon knows it has to quit and is asking
 * for final buffer data. If it's not set, then we
 * have just transitioned to STOPPING, and we must
 * inform the daemon (which we can do just by a normal
 * operation).
 */
case STOPPING: {
int cpu;
if (!(file->f_flags & O_NONBLOCK))
break;
/* pick the first CPU that still holds unread samples */
for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
if (oprof_data[cpu].nextbuf) {
cpu_buffer_waiting = cpu;
/* 2 keeps check_buffer_amount() from reporting an overflow */
oprof_ready[cpu] = 2;
break;
}
}
/* nothing left in any buffer */
if (cpu == smp_num_cpus)
return -EAGAIN;
}
break;
/* the device cannot be open for reading while STOPPED */
case STOPPED: BUG();
}
return copy_buffer(buf, cpu_buffer_waiting);
}
static int oprof_start(void);
static int oprof_stop(void);
static int oprof_open(struct inode * ino, struct file * file)
{
int err;
if (!capable(CAP_SYS_PTRACE))
return -EPERM;
switch (MINOR(file->f_dentry->d_inode->i_rdev)) {
case 1: return oprof_hash_map_open();
case 2: return oprof_note_open();
case 0:
/* make sure the other devices are open */
if (is_map_ready())
break;
default:
return -EINVAL;
}
if (test_and_set_bit(0, &oprof_opened))
return -EBUSY;
err = oprof_start();
if (err)
clear_bit(0, &oprof_opened);
return err;
}
static int oprof_release(struct inode * ino, struct file * file)
{
switch (MINOR(file->f_dentry->d_inode->i_rdev)) {
case 1: return oprof_hash_map_release();
case 2: return oprof_note_release();
case 0: break;
default: return -EINVAL;
}
BUG_ON(!oprof_opened);
clear_bit(0, &oprof_opened);
// FIXME: is this safe when I kill -9 the daemon ?
return oprof_stop();
}
/* mmap() handler: only the minor 1 device can be mapped, which is delegated
 * to oprof_hash_map_mmap(). Every other minor gets -EINVAL. */
static int oprof_mmap(struct file * file, struct vm_area_struct * vma)
{
	if (MINOR(file->f_dentry->d_inode->i_rdev) != 1)
		return -EINVAL;

	return oprof_hash_map_mmap(file, vma);
}
/* Release the sample buffers of the first num CPUs and the note buffer,
 * resetting every freed pointer to NULL.
 * called under spinlock, cannot sleep */
static void oprof_free_mem(uint num)
{
	struct _oprof_data * data = oprof_data;
	struct _oprof_data * const end = oprof_data + num;

	while (data != end) {
		if (data->buffer)
			vfree(data->buffer);
		data->buffer = NULL;
		++data;
	}

	vfree(note_buffer);
	note_buffer = NULL;
}
/* Allocate and reset the note buffer and one zeroed sample buffer per CPU,
 * sized from the working sysctl copy (sysctl.note_size / sysctl.buf_size),
 * and reset the note-overflow counter.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails. When a per-CPU
 * allocation fails, the CPU buffers allocated so far and the note buffer
 * are released through oprof_free_mem() before returning. */
static int oprof_init_data(void)
{
	uint i, notebufsize;
	ulong buf_size;
	struct _oprof_data * data;

	sysctl.nr_note_buffer_overflow = 0;
	notebufsize = sizeof(struct op_note) * sysctl.note_size;
	note_buffer = vmalloc(notebufsize);
	if (!note_buffer) {
		printk(KERN_ERR "oprofile: failed to allocate note buffer of %u bytes\n",
			notebufsize);
		return -ENOMEM;
	}
	note_pos = 0;

	// safe init: oprof_free_mem() must never see a stale buffer pointer
	for (i = 0; i < smp_num_cpus; ++i) {
		data = &oprof_data[i];
		data->buf_size = 0;
		data->buffer = NULL;
		data->buf_watermark = 0;
		data->nr_buffer_overflow = 0;
	}

	buf_size = (sizeof(struct op_sample) * sysctl.buf_size);
	for (i = 0 ; i < smp_num_cpus ; ++i) {
		data = &oprof_data[i];
		data->buffer = vmalloc(buf_size);
		if (!data->buffer) {
			printk(KERN_ERR "oprofile: failed to allocate eviction buffer of %lu bytes\n", buf_size);
			/* frees CPUs 0..i-1 and the note buffer */
			oprof_free_mem(i);
			return -ENOMEM;
		}
		memset(data->buffer, 0, buf_size);
		data->buf_size = sysctl.buf_size;
		data->buf_watermark = OP_PRE_WATERMARK(data->buf_size);
		data->nextbuf = 0;
	}

	return 0;
}
/* Validate the working sysctl copy: buf_size and note_size must lie within
 * their OP_MIN_* / OP_MAX_* limits, and the interrupt back-end must accept
 * its own parameters. Returns 0 if everything is valid, otherwise the error
 * of the first failing check. */
static int parms_check(void)
{
	int err;

	err = check_range(sysctl.buf_size, OP_MIN_BUF_SIZE, OP_MAX_BUF_SIZE,
		"sysctl.buf_size value %d not in range (%d %d)\n");
	if (err)
		return err;

	err = check_range(sysctl.note_size, OP_MIN_NOTE_TABLE_SIZE, OP_MAX_NOTE_TABLE_SIZE,
		"sysctl.note_size value %d not in range (%d %d)\n");
	if (err)
		return err;

	err = int_ops->check_params();
	return err;
}
static DECLARE_MUTEX(sysctlsem);

/* Start a profiling session. Under sysctlsem: snapshot the user-settable
 * parameters, allocate the buffers, validate the parameters, set up the
 * interrupt back-end, intercept the system calls, start the counters and
 * switch to RUNNING.
 *
 * Returns 0 on success or the error of the failing step. If validation or
 * back-end setup fails, the buffers are freed again. */
static int oprof_start(void)
{
	int err;

	down(&sysctlsem);

	/* save the sysctl settable things to protect against change through
	 * sysctl of the profiler params */
	sysctl_parms.cpu_type = sysctl.cpu_type;
	sysctl = sysctl_parms;

	err = oprof_init_data();
	if (err)
		goto out;

	err = parms_check();
	if (!err)
		err = int_ops->setup();
	if (err) {
		/* either step failed after the buffers were allocated */
		oprof_free_mem(smp_num_cpus);
		goto out;
	}

	op_intercept_syscalls();
	int_ops->start();
	state = RUNNING;

out:
	up(&sysctlsem);
	return err;
}
/*
 * stop interrupts being generated and notes arriving.
 * This is idempotent: the system calls are restored and the counters are
 * stopped only on the RUNNING -> STOPPING transition; calling it again
 * while STOPPING changes nothing. Must not be called while STOPPED.
 */
static void oprof_partial_stop(void)
{
BUG_ON(state == STOPPED);
if (state == RUNNING) {
op_restore_syscalls();
int_ops->stop();
}
state = STOPPING;
}
static int oprof_stop(void)
{
uint i;
// FIXME: err not needed
int err = -EINVAL;
down(&sysctlsem);
BUG_ON(state == STOPPED);
/* here we need to :
* bring back the old system calls
* stop the perf counter
* bring back the old NMI handler
* reset the map buffer stuff and ready values
*
* Nothing will be able to write into the map buffer because
* we synchronise via the spinlocks
*/
oprof_partial_stop();
spin_lock(¬e_lock);
for (i = 0 ; i < smp_num_cpus; i++) {
struct _oprof_data * data = &oprof_data[i];
oprof_ready[i] = 0;
data->nextbuf = 0;
}
oprof_free_mem(smp_num_cpus);
spin_unlock(¬e_lock);
err = 0;
/* FIXME: can we really say this ? */
state = STOPPED;
up(&sysctlsem);
return err;
}
/* File operations of the "oprof" character device registered in
 * oprof_init(); the handlers dispatch on the device minor number. */
static struct file_operations oprof_fops = {
/* the owner field is only set where the kernel's file_operations has one */
#ifdef HAVE_FILE_OPERATIONS_OWNER
owner: THIS_MODULE,
#endif
open: oprof_open,
release: oprof_release,
read: oprof_read,
mmap: oprof_mmap,
};
/*
 * /proc/sys/dev/oprofile/
 *   bufsize
 *   notesize
 *   dump
 *   dump_stop
 *   nr_interrupts
 *   cpu_type
 *   note_buffer_overflow
 *   buffer_overflow
 *   #ctr/
 *     event
 *     enabled
 *     count
 *     unit_mask
 *     kernel
 *     user
 *
 * #ctr is in [0-1] for PPro core, [0-3] for Athlon core
 *
 */
/* These access routines are basically not safe on SMP for module unload.
 * And there is nothing we can do about it - the API is broken. We'll just
 * make a best-efforts thing. Note the sem is needed to prevent a sysctl
 * write from bypassing parms_check() during oprof_start().
 *
 * lock_sysctl() bumps the module use count and then takes sysctlsem;
 * unlock_sysctl() undoes both in the reverse order.
 */
static void lock_sysctl(void)
{
MOD_INC_USE_COUNT;
down(&sysctlsem);
}
/* Counterpart of lock_sysctl(): releases sysctlsem, then drops the module
 * use count again. */
static void unlock_sysctl(void)
{
up(&sysctlsem);
MOD_DEC_USE_COUNT;
}
/* sysctl handler for "nr_interrupts" (read-only). On a read, the per-CPU
 * interrupt counts are summed into sysctl.nr_interrupts and reset to zero,
 * so each read reports the interrupts taken since the previous read; the
 * value is then output through proc_dointvec(). Writes fail with -EINVAL. */
static int get_nr_interrupts(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
{
	int ret = -EINVAL;
	uint cpu;

	lock_sysctl();

	if (!write) {
		sysctl.nr_interrupts = 0;
		for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
			sysctl.nr_interrupts += oprof_data[cpu].nr_irq;
			oprof_data[cpu].nr_irq = 0;
		}
		ret = proc_dointvec(table, write, filp, buffer, lenp);
	}

	unlock_sysctl();
	return ret;
}
/* sysctl handler for "buffer_overflow" (read-only). On a read, the per-CPU
 * overflow counts are added to sysctl.nr_buffer_overflow and reset to zero;
 * unlike nr_interrupts, the total is not cleared first, so it keeps
 * accumulating across reads. The value is then output through
 * proc_dointvec(). Writes fail with -EINVAL. */
static int get_nr_buffer_overflow(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
{
	int ret = -EINVAL;
	uint cpu;

	lock_sysctl();

	if (!write) {
		for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
			sysctl.nr_buffer_overflow += oprof_data[cpu].nr_buffer_overflow;
			oprof_data[cpu].nr_buffer_overflow = 0;
		}
		ret = proc_dointvec(table, write, filp, buffer, lenp);
	}

	unlock_sysctl();
	return ret;
}
/* proc_dointvec() wrapped in lock_sysctl()/unlock_sysctl(), so that integer
 * sysctl accesses are serialised against oprof_start() and oprof_stop().
 * Returns whatever proc_dointvec() returns. */
int lproc_dointvec(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
{
int err;
lock_sysctl();
err = proc_dointvec(table, write, filp, buffer, lenp);
unlock_sysctl();
return err;
}
/* Force a dump: flag every CPU's buffer as ready with the value 2 and wake
 * the reader sleeping on oprof_wait. */
static void do_actual_dump(void)
{
uint cpu;
for (cpu = 0 ; cpu < smp_num_cpus; cpu++)
oprof_ready[cpu] = 2;
oprof_wake_up(&oprof_wait);
}
/* sysctl handler for "dump". Only valid while RUNNING (-EINVAL otherwise).
 * A write triggers do_actual_dump() and returns 0; a read is passed to
 * proc_dointvec(). */
static int sysctl_do_dump(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
{
	int err = -EINVAL;

	lock_sysctl();

	if (state == RUNNING) {
		if (write) {
			do_actual_dump();
			err = 0;
		} else {
			err = proc_dointvec(table, write, filp, buffer, lenp);
		}
	}

	unlock_sysctl();
	return err;
}
/* sysctl handler for "dump_stop". Only valid while RUNNING (-EINVAL
 * otherwise). A write moves the driver to STOPPING through
 * oprof_partial_stop(), then forces a dump, and returns 0; a read is passed
 * to proc_dointvec(). */
static int sysctl_do_dump_stop(ctl_table * table, int write, struct file * filp, void * buffer, size_t * lenp)
{
	int err = -EINVAL;

	lock_sysctl();

	if (state == RUNNING) {
		if (write) {
			oprof_partial_stop();
			/* also wakes up daemon */
			do_actual_dump();
			err = 0;
		} else {
			err = proc_dointvec(table, write, filp, buffer, lenp);
		}
	}

	unlock_sysctl();
	return err;
}
/* Number of fixed entries at the start of oprof_table; init_sysctl() and
 * cleanup_sysctl() pass the slot at this index to the interrupt back-end's
 * add_sysctls() / remove_sysctls(). */
static int const nr_oprof_static = 8;
/* Entries of /proc/sys/dev/oprofile/. The writable settings point into
 * sysctl_parms, the read-only statistics into the working copy "sysctl".
 * Note that "dump" is world-writable (0666). */
static ctl_table oprof_table[] = {
{ 1, "bufsize", &sysctl_parms.buf_size, sizeof(int), 0644, NULL, &lproc_dointvec, NULL, },
{ 1, "dump", &sysctl_parms.dump, sizeof(int), 0666, NULL, &sysctl_do_dump, NULL, },
{ 1, "dump_stop", &sysctl_parms.dump_stop, sizeof(int), 0644, NULL, &sysctl_do_dump_stop, NULL, },
{ 1, "nr_interrupts", &sysctl.nr_interrupts, sizeof(int), 0444, NULL, &get_nr_interrupts, NULL, },
{ 1, "notesize", &sysctl_parms.note_size, sizeof(int), 0644, NULL, &lproc_dointvec, NULL, },
{ 1, "cpu_type", &sysctl.cpu_type, sizeof(int), 0444, NULL, &lproc_dointvec, NULL, },
{ 1, "note_buffer_overflow", &sysctl.nr_note_buffer_overflow, sizeof(int), 0444, NULL, &lproc_dointvec, NULL, },
{ 1, "buffer_overflow", &sysctl.nr_buffer_overflow, sizeof(int), 0444, NULL, &get_nr_buffer_overflow, NULL, },
/* eight empty slots left for the back-end, presumably one per counter
 * directory - TODO confirm against add_sysctls() */
{ 0, }, { 0, }, { 0, }, { 0, }, { 0, }, { 0, }, { 0, }, { 0, },
/* terminating entry */
{ 0, },
};
/* the "oprofile" directory, whose children are the oprof_table entries */
static ctl_table oprof_root[] = {
{1, "oprofile", NULL, 0, 0755, oprof_table},
{0, },
};
/* the "dev" directory (CTL_DEV) holding the "oprofile" directory; this is
 * the table handed to register_sysctl_table() in init_sysctl() */
static ctl_table dev_root[] = {
{CTL_DEV, "dev", NULL, 0, 0555, oprof_root},
{0, },
};
/* handle returned by register_sysctl_table(), needed to unregister again */
static struct ctl_table_header * sysctl_header;

/* Set default values for the user-settable sizes, let the interrupt
 * back-end append its own entries after the fixed part of oprof_table, and
 * register the whole tree. Returns 0 on success or the back-end's error.
 *
 * NOTE: we do *not* support sysctl() syscall
 * NOTE(review): the result of register_sysctl_table() is stored without
 * being checked - confirm whether a NULL header needs handling here. */
static int __init init_sysctl(void)
{
	int err;

	/* these sysctl parms need sensible value */
	sysctl_parms.buf_size = OP_DEFAULT_BUF_SIZE;
	sysctl_parms.note_size = OP_DEFAULT_NOTE_SIZE;

	err = int_ops->add_sysctls(&oprof_table[nr_oprof_static]);
	if (err)
		return err;

	sysctl_header = register_sysctl_table(dev_root, 0);
	return 0;
}
/* Undo init_sysctl(): unregister the sysctl tree first, then let the
 * interrupt back-end remove the entries it added to oprof_table.
 * not safe to mark as __exit since used from __init code */
static void cleanup_sysctl(void)
{
	unregister_sysctl_table(sysctl_header);
	int_ops->remove_sysctls(&oprof_table[nr_oprof_static]);
}
/* Module can_unload hook. Unloading is allowed only when the allow_unload
 * parameter is set, the driver is STOPPED and the module use count is zero.
 * Returns 0 if the module may be unloaded, -EBUSY otherwise. */
static int can_unload(void)
{
	int busy;

	down(&sysctlsem);
	busy = !allow_unload || state != STOPPED || GET_USE_COUNT(THIS_MODULE);
	up(&sysctlsem);

	return busy ? -EBUSY : 0;
}
/* Module initialisation.
 *
 * Selects and initialises the interrupt back-end (falling back to the RTC
 * back-end if the preferred one fails to initialise), registers the sysctl
 * tree and the "oprof" character device, allocates the hash map, installs
 * the can_unload hook and saves the original system calls.
 *
 * Returns 0 on success or a negative error; on failure the steps already
 * completed are undone. */
int __init oprof_init(void)
{
int err = 0;
if (sysctl.cpu_type != CPU_RTC) {
int_ops = op_int_interface();
// try to init, fall back to rtc if not
if ((err = int_ops->init())) {
int_ops = &op_rtc_ops;
if ((err = int_ops->init()))
return err;
/* record that we ended up on the RTC back-end */
sysctl.cpu_type = CPU_RTC;
}
} else {
int_ops = &op_rtc_ops;
if ((err = int_ops->init()))
return err;
}
if ((err = init_sysctl()))
goto out_err;
/* major 0 is passed; the returned value is kept as the device major */
err = op_major = register_chrdev(0, "oprof", &oprof_fops);
if (err < 0)
goto out_err2;
err = oprof_init_hashmap();
if (err < 0) {
printk(KERN_ERR "oprofile: couldn't allocate hash map !\n");
/* the chrdev was registered above, so drop it before unwinding */
unregister_chrdev(op_major, "oprof");
goto out_err2;
}
/* module might not be unloadable */
THIS_MODULE->can_unload = can_unload;
/* do this now so we don't have to track save/restores later */
op_save_syscalls();
printk(KERN_INFO "%s loaded, major %u\n", op_version, op_major);
return 0;
/* error unwinding, in reverse order of setup */
out_err2:
cleanup_sysctl();
out_err:
int_ops->deinit();
return err;
}
/* Module teardown: releases what oprof_init() set up, in reverse order -
 * hash map, character device, sysctl tree, then the interrupt back-end. */
void __exit oprof_exit(void)
{
oprof_free_hashmap();
unregister_chrdev(op_major, "oprof");
cleanup_sysctl();
int_ops->deinit();
}
/*
* "The most valuable commodity I know of is information."
* - Gordon Gekko
*/