From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 171 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 9 +++ include/linux/syscalls.h | 6 ++ 3 files changed, 186 insertions(+) create mode 100644 include/linux/perf_counter.h (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 00000000000..22c4469abf4 --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,171 @@ +/* + * Performance counters: + * + * Copyright(C) 2008, Thomas Gleixner + * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include + +#include +#include +#include +#include +#include + +struct task_struct; + +/* + * Generalized hardware event types, used by the hw_event_type parameter + * of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + PERF_COUNT_CYCLES, + PERF_COUNT_INSTRUCTIONS, + PERF_COUNT_CACHE_REFERENCES, + PERF_COUNT_CACHE_MISSES, + PERF_COUNT_BRANCH_INSTRUCTIONS, + PERF_COUNT_BRANCH_MISSES, + /* + * If this bit is set in the type, then trigger NMI sampling: + */ + PERF_COUNT_NMI = (1 << 30), +}; + +/* + * IRQ-notification data record type: + */ +enum perf_record_type { + PERF_RECORD_SIMPLE, + PERF_RECORD_IRQ, + PERF_RECORD_GROUP, +}; + +/** + * struct hw_perf_counter - performance counter hardware details + */ +struct hw_perf_counter { + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + s32 next_count; + u64 irq_period; +}; + +/* + * Hardcoded buffer length limit for now, for IRQ-fed events: + */ +#define PERF_DATA_BUFLEN 2048 + +/** + * struct perf_data - performance counter IRQ data sampling ... + */ +struct perf_data { + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { + struct list_head list; + int active; +#if BITS_PER_LONG == 64 + atomic64_t count; +#else + atomic_t count32[2]; +#endif + u64 __irq_period; + + struct hw_perf_counter hw; + + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * Protect attach/detach: + */ + struct mutex mutex; + + int oncpu; + int cpu; + + s32 hw_event_type; + enum perf_record_type record_type; + + /* read() / irq related data */ + wait_queue_head_t waitq; + /* optional: for NMIs */ + int wakeup_pending; + struct perf_data *irqdata; + struct perf_data *usrdata; + struct perf_data data[2]; +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { +#ifdef CONFIG_PERF_COUNTERS + /* + * Protect the list of counters: + */ + spinlock_t lock; + struct list_head counters; + int nr_counters; + int nr_active; + struct task_struct *task; +#endif +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { + struct perf_counter_context ctx; + struct perf_counter_context *task_ctx; + int active_oncpu; + int max_pertask; +}; + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +#ifdef CONFIG_PERF_COUNTERS +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_print_debug(void); +#else +static inline void +perf_counter_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_tick(struct task_struct *task, int cpu) { } +static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_print_debug(void) { } +#endif + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d11447..4c530278391 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif + struct perf_counter_context perf_counter_ctx; #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; @@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info); + + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 04fb47bfb92..6cce728a626 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); #endif -- cgit From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 22c4469abf4..5031b5614f2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void hw_perf_restore_ctrl(u64 ctrl); +extern u64 hw_perf_disable_all(void); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void hw_perf_restore_ctrl(u64 ctrl) { } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ -- cgit From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:26:59 +0100 Subject: perf counters: clean up 'raw' type API Impact: cleanup Introduce a separate hw_event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ include/linux/syscalls.h | 8 +++----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5031b5614f2..daedd7d87c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -38,6 +38,7 @@ enum hw_event_types { * If this bit is set in the type, then trigger NMI sampling: */ PERF_COUNT_NMI = (1 << 30), + PERF_COUNT_RAW = (1 << 31), }; /* @@ -49,6 +50,12 @@ enum perf_record_type { PERF_RECORD_GROUP, }; +struct perf_counter_event { + u32 hw_event_type; + u32 hw_event_period; + u64 hw_raw_ctrl; +}; + /** * struct hw_perf_counter - performance counter hardware details */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6cce728a626..3ecd73d03da 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct perf_counter_event; #include #include @@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu); +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd); #endif -- cgit From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index daedd7d87c2..1f0017673e7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -96,8 +96,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - u64 __irq_period; - + struct perf_counter_event event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -111,7 +110,6 @@ struct perf_counter { int oncpu; int cpu; - s32 hw_event_type; enum perf_record_type record_type; /* read() / irq related data */ -- cgit From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 98 ++++++++++++++++++++++++++++---------------- include/linux/syscalls.h | 12 ++++-- 2 files changed, 70 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1f0017673e7..a2b4852e2d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,65 +24,93 @@ struct task_struct; /* - * Generalized hardware event types, used by the hw_event_type parameter - * of the sys_perf_counter_open() syscall: + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: */ enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, /* - * If this bit is set in the type, then trigger NMI sampling: + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_NMI = (1 << 30), - PERF_COUNT_RAW = (1 << 31), + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* * IRQ-notification data record type: */ -enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - PERF_RECORD_GROUP, +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -struct perf_counter_event { - u32 hw_event_type; - u32 hw_event_period; - u64 hw_raw_ctrl; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + u64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; }; +/* + * Kernel-internal data types: + */ + /** - * struct hw_perf_counter - performance counter hardware details + * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - u64 prev_count; - s32 next_count; - u64 irq_period; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + u64 irq_period; + s32 next_count; }; /* * Hardcoded buffer length limit for now, for IRQ-fed events: */ -#define PERF_DATA_BUFLEN 2048 +#define PERF_DATA_BUFLEN 2048 /** * struct perf_data - performance counter IRQ data sampling ... */ struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; }; /** @@ -96,7 +124,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -110,8 +138,6 @@ struct perf_counter { int oncpu; int cpu; - enum perf_record_type record_type; - /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3ecd73d03da..a549678b7c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; -struct perf_counter_event; +struct perf_counter_hw_event; #include #include @@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd); + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd); #endif -- cgit From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a2b4852e2d7..7af7d896546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -117,7 +117,10 @@ struct perf_data { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { - struct list_head list; + struct list_head list_entry; + struct list_head sibling_list; + struct perf_counter *group_leader; + int active; #if BITS_PER_LONG == 64 atomic64_t count; @@ -158,7 +161,8 @@ struct perf_counter_context { * Protect the list of counters: */ spinlock_t lock; - struct list_head counters; + + struct list_head counter_list; int nr_counters; int nr_active; struct task_struct *task; -- cgit From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7af7d896546..27385641ecb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -113,6 +113,17 @@ struct perf_data { u8 data[PERF_DATA_BUFLEN]; }; +struct perf_counter; + +/** + * struct hw_perf_counter_ops - performance counter hw ops + */ +struct hw_perf_counter_ops { + void (*hw_perf_counter_enable) (struct perf_counter *counter); + void (*hw_perf_counter_disable) (struct perf_counter *counter); + void (*hw_perf_counter_read) (struct perf_counter *counter); +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -120,6 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; + struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -185,6 +197,9 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter); + extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -- cgit From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. (regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 27385641ecb..9a1713a1be2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -131,7 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -197,7 +197,7 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern struct hw_perf_counter_ops * +extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); @@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern void hw_perf_restore_ctrl(u64 ctrl); extern u64 hw_perf_disable_all(void); +extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); +extern u64 atomic64_counter_read(struct perf_counter *counter); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ -- cgit From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9a1713a1be2..68f6e3ad531 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -67,7 +67,7 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - u64 type; + s64 type; u64 irq_period; u32 record_type; @@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); -extern void hw_perf_restore_ctrl(u64 ctrl); -extern u64 hw_perf_disable_all(void); +extern u64 hw_perf_save_disable(void); +extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); @@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } -static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline void hw_perf_restore(u64 ctrl) { } +static inline u64 hw_perf_save_disable(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ -- cgit From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:03:20 +0100 Subject: perf counters: implement PERF_COUNT_TASK_CLOCK Impact: add new perf-counter type The 'task clock' counter counts the amount of time a task is executing, in nanoseconds. It stops ticking when a task is scheduled out either due to it blocking, sleeping or it being preempted. This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 68f6e3ad531..30c0ec8c1ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -50,8 +50,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, + /* + * Future software events: + */ + /* PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, */ }; /* -- cgit From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:59:31 +0100 Subject: perf counters: add prctl interface to disable/enable counters Add a way for self-monitoring tasks to disable/enable counters summarily, via a prctl: PR_TASK_PERF_COUNTERS_DISABLE 31 PR_TASK_PERF_COUNTERS_ENABLE 32 Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ include/linux/prctl.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 30c0ec8c1ee..97d86c293ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); +extern int perf_counter_task_disable(void); +extern int perf_counter_task_enable(void); #else static inline void @@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } +static inline int perf_counter_task_disable(void) { return -EINVAL; } +static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 48d887e3c6e..b00df4c79c6 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,4 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + #endif /* _LINUX_PRCTL_H */ -- cgit From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 97d86c293ee..8cb095fa442 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,6 +127,15 @@ struct hw_perf_counter_ops { void (*hw_perf_counter_read) (struct perf_counter *counter); }; +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { + PERF_COUNTER_STATE_OFF = -1, + PERF_COUNTER_STATE_INACTIVE = 0, + PERF_COUNTER_STATE_ACTIVE = 1, +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -136,7 +145,7 @@ struct perf_counter { struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; - int active; + enum perf_counter_active_state state; #if BITS_PER_LONG == 64 atomic64_t count; #else -- cgit From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8cb095fa442..72460289c65 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -91,14 +91,16 @@ struct perf_counter_hw_event { * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS u64 config; unsigned long config_base; unsigned long counter_base; int nmi; unsigned int idx; - u64 prev_count; + atomic64_t prev_count; u64 irq_period; - s32 next_count; + atomic64_t period_left; +#endif }; /* @@ -140,17 +142,15 @@ enum perf_counter_active_state { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; -#if BITS_PER_LONG == 64 atomic64_t count; -#else - atomic_t count32[2]; -#endif + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -172,6 +172,7 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; +#endif }; /** @@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); -extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); -extern u64 atomic64_counter_read(struct perf_counter *counter); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); -- cgit From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 72460289c65..e5d25bf8f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -75,10 +75,11 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - __reserved_1 : 29; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + __reserved_1 : 28; u64 __reserved_2; }; @@ -138,6 +139,8 @@ enum perf_counter_active_state { PERF_COUNTER_STATE_ACTIVE = 1, }; +struct file; + /** * struct perf_counter - performance counter kernel representation: */ @@ -156,7 +159,10 @@ struct perf_counter { struct perf_counter_context *ctx; struct task_struct *task; + struct file *filp; + unsigned int nr_inherited; + struct perf_counter *parent; /* * Protect attach/detach: */ @@ -210,13 +216,16 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern void +perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_init_task(struct task_struct *child); +extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); @@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void); #else static inline void +perf_counter_show(struct perf_counter *counter, char *str, int trace) { } +static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_init_task(struct task_struct *child) { } +static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } -- cgit From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:28:33 +0100 Subject: perfcounters: add context switch counter Impact: add new feature, new sw counter Add a counter that counts the number of context-switches a task is doing. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e5d25bf8f74..d2a16563415 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,8 +53,8 @@ enum hw_event_types { /* * Future software events: */ - /* PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, */ + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* -- cgit From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++--- include/linux/sched.h | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d2a16563415..f30486fc55d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,6 +42,8 @@ enum hw_event_types { PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_HW_EVENTS_MAX = 6, + /* * Special "software" counters provided by the kernel, even if * the hardware does not support performance counters. These @@ -50,11 +52,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - /* - * Future software events: - */ PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c530278391..2e15be8fc79 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1014,6 +1014,8 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1029,7 +1031,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; -- cgit From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f30486fc55d..d038450de87 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -218,8 +218,6 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern void -perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); @@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void); #else static inline void -perf_counter_show(struct perf_counter *counter, char *str, int trace) { } -static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } -- cgit From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d038450de87..984da540224 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #include #include -- cgit From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 984da540224..48f76d2e54c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,9 +128,9 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*hw_perf_counter_enable) (struct perf_counter *counter); - void (*hw_perf_counter_disable) (struct perf_counter *counter); - void (*hw_perf_counter_read) (struct perf_counter *counter); + void (*enable) (struct perf_counter *counter); + void (*disable) (struct perf_counter *counter); + void (*read) (struct perf_counter *counter); }; /** -- cgit From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..1b2e3242497 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */ +extern void curr_rq_lock_irq_save(unsigned long *flags); +extern void curr_rq_unlock_irq_restore(unsigned long *flags); +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); + extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); -- cgit From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e..54fa2fa2c8e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,14 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_ctx.counter_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -180,6 +188,7 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PERF_COUNTERS(tsk) \ } -- cgit From 78b6084c907cea15bb40a564b974e072f5163781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 15:07:49 +0100 Subject: perfcounters: fix init context lock Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 54fa2fa2c8e..467cff545c3 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -116,7 +116,9 @@ extern struct group_info init_groups; #ifdef CONFIG_PERF_COUNTERS # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.lock = \ + __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else # define INIT_PERF_COUNTERS(tsk) #endif -- cgit From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48f76d2e54c..53af11d3767 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,7 +128,7 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*enable) (struct perf_counter *counter); + int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); }; -- cgit From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:29:25 +0100 Subject: perfcounters: remove ->nr_inherited Impact: remove dead code nr_inherited was not maintained correctly (not decremented) - and also not used - remove it. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 53af11d3767..1ea08e9f31c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,7 +164,6 @@ struct perf_counter { struct task_struct *task; struct file *filp; - unsigned int nr_inherited; struct perf_counter *parent; /* * Protect attach/detach: -- cgit From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1ea08e9f31c..ec77d1643d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -36,14 +36,15 @@ enum hw_event_types { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CYCLES = 0, + PERF_COUNT_CPU_CYCLES = 0, PERF_COUNT_INSTRUCTIONS = 1, PERF_COUNT_CACHE_REFERENCES = 2, PERF_COUNT_CACHE_MISSES = 3, PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 6, + PERF_HW_EVENTS_MAX = 7, /* * Special "software" counters provided by the kernel, even if -- cgit From e44aef58ecfbe061eb4c53b939bcc35fb1bee82d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 09:02:11 +0100 Subject: perfcounters: include asm/perf_counter.h only if CONFIG_PERF_COUNTERS=y Impact: build fix on ia64 KOSAKI Motohiro reported that -tip doesnt build on ia64 because asm/perf_counter.h only exists on x86 for now. Fix it. Reported-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Acked-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ec77d1643d3..cc3a75a239a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,7 +14,10 @@ #define _LINUX_PERF_COUNTER_H #include -#include + +#ifdef CONFIG_PERF_COUNTERS +# include +#endif #include #include -- cgit From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:43:42 +1100 Subject: perf_counter: Add optional hw_perf_group_sched_in arch function Impact: extend perf_counter infrastructure This adds an optional hw_perf_group_sched_in() arch function that enables a whole group of counters in one go. It returns 1 if it added the group successfully, 0 if it did nothing (and therefore the core needs to add the counters individually), or a negative number if an error occurred. It should add all the counters and enable any software counters in the group, or else add none of them and return an error. There are a couple of related changes/improvements in the group handling here: * As an optimization, group_sched_out() and group_sched_in() now check the state of the group leader, and do nothing if the leader is not active or disabled. * We now call hw_perf_save_disable/hw_perf_restore around the complete set of counter enable/disable calls in __perf_counter_sched_in/out, to give the arch code the opportunity to defer updating the hardware state until the hw_perf_restore call if it wants. * We no longer stop adding groups after we get to a group that has more than one counter. We will ultimately add an option for a group to be exclusive. The current code doesn't really implement exclusive groups anyway, since a group could end up going on with other counters that get added before it. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cc3a75a239a..b21d1ea4c05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -236,6 +236,9 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu); #else static inline void -- cgit From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 21:00:30 +1100 Subject: perf_counter: Add support for pinned and exclusive counter groups Impact: New perf_counter features A pinned counter group is one that the user wants to have on the CPU whenever possible, i.e. whenever the associated task is running, for a per-task group, or always for a per-cpu group. If the system cannot satisfy that, it puts the group into an error state where it is not scheduled any more and reads from it return EOF (i.e. 0 bytes read). The group can be released from error state and made readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained enable/disable controls on counters we'll be able to reset the error state on individual groups. An exclusive group is one that the user wants to be the only group using the CPU performance monitor hardware whenever it is on. The counter group scheduler will not schedule an exclusive group if there are already other groups on the CPU and will not schedule other groups onto the CPU if there is an exclusive group scheduled (that statement does not apply to groups containing only software counters, which can always go on and which do not prevent an exclusive group from going on). With an exclusive group, we will be able to let users program PMU registers at a low level without the concern that those settings will perturb other measurements. Along the way this reorganizes things a little: - is_software_counter() is moved to perf_counter.h. - cpuctx->active_oncpu now records the number of hardware counters on the CPU, i.e. it now excludes software counters. Nothing was reading cpuctx->active_oncpu before, so this change is harmless. - A new cpuctx->exclusive field records whether we currently have an exclusive group on the CPU. - counter_sched_out moves higher up in perf_counter.c and gets called from __perf_counter_remove_from_context and __perf_counter_exit_task, where we used to have essentially the same code. - __perf_counter_sched_in now goes through the counter list twice, doing the pinned counters in the first loop and the non-pinned counters in the second loop, in order to give the pinned counters the best chance to be scheduled in. Note that only a group leader can be exclusive or pinned, and that attribute applies to the whole group. This avoids some awkwardness in some corner cases (e.g. where a group leader is closed and the other group members get added to the context list). If we want to relax that restriction later, we can, and it is easier to relax a restriction than to apply a new one. This doesn't yet handle the case where a pinned counter is inherited and goes into error state in the child - the error state is not propagated up to the parent when the child exits, and arguably it should. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b21d1ea4c05..7ab8e5f96f5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -86,7 +86,10 @@ struct perf_counter_hw_event { nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ - __reserved_1 : 28; + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only counter on PMU */ + + __reserved_1 : 26; u64 __reserved_2; }; @@ -141,6 +144,7 @@ struct hw_perf_counter_ops { * enum perf_counter_active_state - the states of a counter */ enum perf_counter_active_state { + PERF_COUNTER_STATE_ERROR = -2, PERF_COUNTER_STATE_OFF = -1, PERF_COUNTER_STATE_INACTIVE = 0, PERF_COUNTER_STATE_ACTIVE = 1, @@ -214,6 +218,7 @@ struct perf_cpu_context { struct perf_counter_context *task_ctx; int active_oncpu; int max_pertask; + int exclusive; }; /* @@ -240,6 +245,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 17 Jan 2009 18:10:22 +1100 Subject: perf_counter: Add counter enable/disable ioctls Impact: New perf_counter features This primarily adds a way for perf_counter users to enable and disable counters and groups. Enabling or disabling a counter or group also enables or disables all of the child counters that have been cloned from it to monitor children of the task monitored by the top-level counter. The userspace interface to enable/disable counters is via ioctl on the counter file descriptor. Along the way this extends the code that handles child counters to handle child counter groups properly. A group with multiple counters will be cloned to child tasks if and only if the group leader has the hw_event.inherit bit set - if it is set the whole group is cloned as a group in the child task. In order to be able to enable or disable all child counters of a given top-level counter, we need a way to find them all. Hence I have added a child_list field to struct perf_counter, which is the head of the list of children for a top-level counter, or the link in that list for a child counter. That list is protected by the perf_counter.mutex field. This also adds a mutex to the perf_counter_context struct. Previously the list of counters was protected just by the lock field in the context, which meant that perf_counter_init_task had to take that lock and then take whatever lock/mutex protects the top-level counter's child_list. But the counter enable/disable functions need to take that lock in order to traverse the list, then for each counter take the lock in that counter's context in order to change the counter's state safely, which will lead to a deadlock. To solve this, we now have both a mutex and a spinlock in the context, and taking either is sufficient to ensure the list of counters can't change - you have to take both before changing the list. Now perf_counter_init_task takes the mutex instead of the lock (which incidentally means that inherit_counter can use GFP_KERNEL instead of GFP_ATOMIC) and thus avoids the possible deadlock. Similarly the new enable/disable functions can take the mutex while traversing the list of child counters without incurring a possible deadlock when the counter manipulation code locks the context for a child counter. We also had an misfeature that the first counter added to a context would possibly not go on until the next sched-in, because we were using ctx->nr_active to detect if the context was running on a CPU. But nr_active is the number of active counters, and if that was zero (because the context didn't have any counters yet) it would look like the context wasn't running on a cpu and so the retry code in __perf_install_in_context wouldn't retry. So this adds an 'is_active' field that is set when the context is on a CPU, even if it has no counters. The is_active field is only used for task contexts, not for per-cpu contexts. If we enable a subsidiary counter in a group that is active on a CPU, and the arch code can't enable the counter, then we have to pull the whole group off the CPU. We do this with group_sched_out, which gets moved up in the file so it comes before all its callers. This also adds similar logic to __perf_install_in_context so that the "all on, or none" invariant of groups is preserved when adding a new counter to a group. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7ab8e5f96f5..33ba9fe0a78 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #ifdef CONFIG_PERF_COUNTERS # include @@ -94,6 +95,12 @@ struct perf_counter_hw_event { u64 __reserved_2; }; +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + /* * Kernel-internal data types: */ @@ -173,8 +180,10 @@ struct perf_counter { struct file *filp; struct perf_counter *parent; + struct list_head child_list; + /* - * Protect attach/detach: + * Protect attach/detach and child_list: */ struct mutex mutex; @@ -199,13 +208,21 @@ struct perf_counter { struct perf_counter_context { #ifdef CONFIG_PERF_COUNTERS /* - * Protect the list of counters: + * Protect the states of the counters in the list, + * nr_active, and the list: */ spinlock_t lock; + /* + * Protect the list of counters. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; struct list_head counter_list; int nr_counters; int nr_active; + int is_active; struct task_struct *task; #endif }; -- cgit From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea, it can easily lock the system due to perfcounter IRQ overload. So add throttling: if a new IRQ arrives in a shorter than PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them from the next timer tick. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 33ba9fe0a78..91f1ca4c01c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); @@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +#define PERFMON_MIN_PERIOD_NS 10000 + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 91f1ca4c01c..f55381fbcac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } -#define PERFMON_MIN_PERIOD_NS 10000 - #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b10abf77..1e5f70062a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern u64 cpu_nr_switches(int cpu); +extern u64 cpu_nr_migrations(int cpu); struct seq_file; struct cfs_rq; -- cgit From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f55381fbcac..c83f51d6e35 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,14 +83,17 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only counter on PMU */ - - __reserved_1 : 26; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + + __reserved_1 : 23; u64 __reserved_2; }; -- cgit From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Feb 2009 22:10:34 +1100 Subject: perfcounters: make context switch and migration software counters work again Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the context switch and migration software counters to report zero always. With that commit, the software counters only count events that occur between sched-in and sched-out for a task. This is necessary for the counter enable/disable prctls and ioctls to work. However, the context switch and migration counts are incremented after sched-out for one task and before sched-in for the next. Since the increment doesn't occur while a task is scheduled in (as far as the software counters are concerned) it doesn't count towards any counter. Thus the context switch and migration counters need to count events that occur at any time, provided the counter is enabled, not just those that occur while the task is scheduled in (from the perf_counter subsystem's point of view). The problem though is that the software counter code can't tell the difference between being enabled and being scheduled in, and between being disabled and being scheduled out, since we use the one pair of enable/disable entry points for both. That is, the high-level disable operation simply arranges for the counter to not be scheduled in any more, and the high-level enable operation arranges for it to be scheduled in again. One way to solve this would be to have sched_in/out operations in the hw_perf_counter_ops struct as well as enable/disable. However, this takes a simpler approach: it adds a 'prev_state' field to the perf_counter struct that allows a counter's enable method to know whether the counter was previously disabled or just inactive (scheduled out), and therefore whether the enable method is being called as a result of a high-level enable or a schedule-in operation. This then allows the context switch, migration and page fault counters to reset their hw.prev_count value in their enable functions only if they are called as a result of a high-level enable operation. Although page faults would normally only occur while the counter is scheduled in, this changes the page fault counter code too in case there are ever circumstances where page faults get counted against a task while its counters are not scheduled in. Reported-by: Jaswinder Singh Rajput Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c83f51d6e35..32cd1acb738 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -173,6 +173,7 @@ struct perf_counter { const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; + enum perf_counter_active_state prev_state; atomic64_t count; struct perf_counter_hw_event hw_event; -- cgit From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Feb 2009 22:43:46 +1100 Subject: perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 43 +++++++++++++++++++++++-------------------- include/linux/syscalls.h | 9 +++------ 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 32cd1acb738..186efaf4966 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -13,20 +13,8 @@ #ifndef _LINUX_PERF_COUNTER_H #define _LINUX_PERF_COUNTER_H -#include -#include - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include - -struct task_struct; +#include +#include /* * User-space ABI bits: @@ -78,12 +66,12 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - s64 type; + __s64 type; - u64 irq_period; - u32 record_type; + __u64 irq_period; + __u32 record_type; - u32 disabled : 1, /* off by default */ + __u32 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -95,7 +83,7 @@ struct perf_counter_hw_event { __reserved_1 : 23; - u64 __reserved_2; + __u64 __reserved_2; }; /* @@ -104,10 +92,24 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#ifdef __KERNEL__ /* - * Kernel-internal data types: + * Kernel-internal data types and definitions: */ +#ifdef CONFIG_PERF_COUNTERS +# include +#endif + +#include +#include +#include +#include +#include +#include + +struct task_struct; + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif +#endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88255d3261a..28ef2be839c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd); +asmlinkage long sys_perf_counter_open( + const struct perf_counter_hw_event __user *hw_event_uptr, + pid_t pid, int cpu, int group_fd); #endif -- cgit From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2009 20:36:51 +1100 Subject: perfcounters: provide expansion room in the ABI Impact: ABI change This expands several fields in the perf_counter_hw_event struct and adds a "flags" argument to the perf_counter_open system call, in order that features can be added in future without ABI changes. In particular the record_type field is expanded to 64 bits, and the space for flag bits has been expanded from 32 to 64 bits. This also adds some new fields: * read_format (64 bits) is intended to provide a way to specify what userspace wants to get back when it does a read() on a simple (non-interrupting) counter; * exclude_idle (1 bit) provides a way for userspace to ask that events that occur when the cpu is idle be excluded; * extra_config_len will provide a way for userspace to supply an arbitrary amount of extra machine-specific PMU configuration data immediately following the perf_counter_hw_event struct, to allow sophisticated users to program things such as instruction matching CAMs and address range registers; * __reserved_3 and __reserved_4 provide space for future expansion. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 12 +++++++++--- include/linux/syscalls.h | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 186efaf4966..c42455ab155 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -69,9 +69,10 @@ struct perf_counter_hw_event { __s64 type; __u64 irq_period; - __u32 record_type; + __u64 record_type; + __u64 read_format; - __u32 disabled : 1, /* off by default */ + __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -80,10 +81,15 @@ struct perf_counter_hw_event { exclude_user : 1, /* don't count user */ exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 23; + __reserved_1 : 55; + + __u32 extra_config_len; + __u32 __reserved_4; __u64 __reserved_2; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 28ef2be839c..ab1d7724739 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( const struct perf_counter_hw_event __user *hw_event_uptr, - pid_t pid, int cpu, int group_fd); + pid_t pid, int cpu, int group_fd, unsigned long flags); #endif -- cgit From 2485e5184452ccbda34ff83883883d9107800dff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 12:33:16 +0100 Subject: perfcounters: fix reserved bits sizing The sum of bits is 65 currently not 64 - so reduce the # of reserved bits from 55 to 54. Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c42455ab155..dde564517b6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,7 +83,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; -- cgit From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:32 +0100 Subject: perf_counter: software counter event infrastructure Provide generic software counter infrastructure that supports software events. This will be used to allow sample based profiling based on software events such as pagefaults. The current infrastructure can only provide a count of such events, no place information. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dde564517b6..3fefc3b8150 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -126,6 +126,7 @@ struct hw_perf_counter { unsigned long counter_base; int nmi; unsigned int idx; + atomic64_t count; /* software */ atomic64_t prev_count; u64 irq_period; atomic64_t period_left; @@ -283,6 +284,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -295,10 +298,13 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } +static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } + +static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) { } #endif #endif /* __KERNEL__ */ -- cgit From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3fefc3b8150..4b14a8e9dbf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -49,8 +49,10 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, PERF_COUNT_CPU_MIGRATIONS = -5, + PERF_COUNT_PAGE_FAULTS_MIN = -6, + PERF_COUNT_PAGE_FAULTS_MAJ = -7, - PERF_SW_EVENTS_MIN = -6, + PERF_SW_EVENTS_MIN = -8, }; /* -- cgit From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:35 +0100 Subject: perf_counter: hrtimer based sampling for software time events Use hrtimers to profile timer based sampling for the software time counters. This allows platforms without hardware counter support to still perform sample based profiling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4b14a8e9dbf..dfb4c7ce18b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,6 +114,7 @@ struct perf_counter_hw_event { #include #include #include +#include #include struct task_struct; @@ -123,12 +124,19 @@ struct task_struct; */ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - atomic64_t count; /* software */ + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; atomic64_t prev_count; u64 irq_period; atomic64_t period_left; -- cgit From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:36 +0100 Subject: perf_counter: add an event_list I noticed that the counter_list only includes top-level counters, thus perf_swcounter_event() will miss sw-counters in groups. Since perf_swcounter_event() also wants an RCU safe list, create a new event_list that includes all counters and uses RCU list ops and use call_rcu to free the counter structure. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dfb4c7ce18b..08c11a6afeb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -187,6 +187,7 @@ struct file; struct perf_counter { #ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; + struct list_head event_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; @@ -220,6 +221,8 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; + + struct rcu_head rcu_head; #endif }; @@ -243,6 +246,7 @@ struct perf_counter_context { struct mutex mutex; struct list_head counter_list; + struct list_head event_list; int nr_counters; int nr_active; int is_active; -- cgit From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:11 +0100 Subject: perf_counter: fix uninitialized usage of event_list Impact: fix boot crash When doing the generic context switch event I ran into some early boot hangs, which were caused by inf func recursion (event, fault, event, fault). I eventually tracked it down to event_list not being initialized at the time of the first event. Fix this. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.195392657@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 219748d0026..ca226a91abe 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,6 +124,8 @@ extern struct cred init_cred; # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.event_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ .perf_counter_ctx.lock = \ __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else -- cgit From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75b2fc5306d..7ed41f7c5ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,7 +138,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern u64 cpu_nr_switches(int cpu); extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); -- cgit From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:17 +0100 Subject: perf_counter: hook up the tracepoint events Impact: new perfcounters feature Enable usage of tracepoints as perf counter events. tracepoint event ids can be found in /debug/tracing/event/*/*/id and (for now) are represented as -65536+id in the type field. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.744044174@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 08c11a6afeb..065984c1ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,6 +53,8 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS_MAJ = -7, PERF_SW_EVENTS_MIN = -8, + + PERF_TP_EVENTS_MIN = -65536 }; /* @@ -222,6 +224,7 @@ struct perf_counter { struct perf_data *usrdata; struct perf_data data[2]; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif }; -- cgit From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 95 ++++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 065984c1ff5..8f939490550 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -21,56 +21,81 @@ */ /* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: + * hw_event.type */ -enum hw_event_types { +enum perf_event_types { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + /* - * Common hardware events, generalized by the kernel: + * available TYPE space, raw is the max value. */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_TYPE_RAW = 128, +}; +/* + * Generalized performance counter event types, used by the hw_event.event_id + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_ids { /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - PERF_COUNT_PAGE_FAULTS_MIN = -6, - PERF_COUNT_PAGE_FAULTS_MAJ = -7, - - PERF_SW_EVENTS_MIN = -8, + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, +}; - PERF_TP_EVENTS_MIN = -65536 +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { + PERF_COUNT_CPU_CLOCK = 0, + PERF_COUNT_TASK_CLOCK = 1, + PERF_COUNT_PAGE_FAULTS = 2, + PERF_COUNT_CONTEXT_SWITCHES = 3, + PERF_COUNT_CPU_MIGRATIONS = 4, + PERF_COUNT_PAGE_FAULTS_MIN = 5, + PERF_COUNT_PAGE_FAULTS_MAJ = 6, + + PERF_SW_EVENTS_MAX = 7, }; /* * IRQ-notification data record type: */ enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - __s64 type; + union { + struct { + __u64 event_id : 56, + type : 8; + }; + struct { + __u64 raw_event_id : 63, + raw_type : 1; + }; + __u64 event_config; + }; __u64 irq_period; __u64 record_type; @@ -78,7 +103,6 @@ struct perf_counter_hw_event { __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -87,7 +111,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 54; + __reserved_1 : 55; __u32 extra_config_len; __u32 __reserved_4; @@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw && counter->hw_event.type < 0; + return !counter->hw_event.raw_type && + counter->hw_event.type != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); #else static inline void @@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, +static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } #endif -- cgit From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8f939490550..a4b76c0175f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ -- cgit From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 21 Mar 2009 15:31:47 +1100 Subject: perf_counter: fix type/event_id layout on big-endian systems Impact: build fix for powerpc Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI") expanded the hw_event.type field into a union of structs containing bitfields. In particular it introduced a type field and a raw_type field, with the intention that the 1-bit raw_type field should overlay the most-significant bit of the 8-bit type field, and in fact perf_counter_alloc() now assumes that (or at least, assumes that raw_type doesn't overlay any of the bits that are 1 in the values of PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}). Unfortunately this is not true on big-endian systems such as PowerPC, where bitfields are laid out from left to right, i.e. from most significant bit to least significant. This means that setting hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1. This fixes it by making the layout depend on whether or not __BIG_ENDIAN_BITFIELD is defined. It's a bit ugly, but that's what we get for using bitfields in a user/kernel ABI. Also, that commit didn't fix up some places in arch/powerpc/kernel/ perf_counter.c where hw_event.raw and hw_event.event_id were used. This fixes them too. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a4b76c0175f..98f5990be1e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -15,6 +15,7 @@ #include #include +#include /* * User-space ABI bits: @@ -86,6 +87,7 @@ enum perf_counter_record_type { */ struct perf_counter_hw_event { union { +#ifndef __BIG_ENDIAN_BITFIELD struct { __u64 event_id : 56, type : 8; @@ -94,6 +96,16 @@ struct perf_counter_hw_event { __u64 raw_event_id : 63, raw_type : 1; }; +#else + struct { + __u64 type : 8, + event_id : 56; + }; + struct { + __u64 raw_type : 1, + raw_event_id : 63; + }; +#endif /* __BIT_ENDIAN_BITFIELD */ __u64 event_config; }; -- cgit From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 74 ++++++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98f5990be1e..56099e52970 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -82,32 +82,37 @@ enum perf_counter_record_type { PERF_RECORD_GROUP = 2, }; +#define __PERF_COUNTER_MASK(name) \ + (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ + PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW_BITS 1 +#define PERF_COUNTER_RAW_SHIFT 63 +#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) + +#define PERF_COUNTER_CONFIG_BITS 63 +#define PERF_COUNTER_CONFIG_SHIFT 0 +#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) + +#define PERF_COUNTER_TYPE_BITS 7 +#define PERF_COUNTER_TYPE_SHIFT 56 +#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) + +#define PERF_COUNTER_EVENT_BITS 56 +#define PERF_COUNTER_EVENT_SHIFT 0 +#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - union { -#ifndef __BIG_ENDIAN_BITFIELD - struct { - __u64 event_id : 56, - type : 8; - }; - struct { - __u64 raw_event_id : 63, - raw_type : 1; - }; -#else - struct { - __u64 type : 8, - event_id : 56; - }; - struct { - __u64 raw_type : 1, - raw_event_id : 63; - }; -#endif /* __BIT_ENDIAN_BITFIELD */ - __u64 event_config; - }; + /* + * The MSB of the config word signifies if the rest contains cpu + * specific (raw) counter configuration data, if unset, the next + * 7 bits are an event type and the rest of the bits are the event + * identifier. + */ + __u64 config; __u64 irq_period; __u64 record_type; @@ -157,6 +162,27 @@ struct perf_counter_hw_event { struct task_struct; +static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_RAW_MASK; +} + +static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_CONFIG_MASK; +} + +static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +{ + return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + PERF_COUNTER_TYPE_SHIFT; +} + +static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_EVENT_MASK; +} + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw_type && - counter->hw_event.type != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->hw_event) && + perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); -- cgit From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:07 +0100 Subject: perf_counter: avoid recursion Tracepoint events like lock_acquire and software counters like pagefaults can recurse into the perf counter code again, avoid that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.152096433@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 56099e52970..18dc17d0a61 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -328,6 +328,13 @@ struct perf_cpu_context { int active_oncpu; int max_pertask; int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; }; /* -- cgit From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 18:22:08 +0100 Subject: perf_counter: add an mmap method to allow userspace to read hardware counters Impact: new feature giving performance improvement This adds the ability for userspace to do an mmap on a hardware counter fd and get access to a read-only page that contains the information needed to translate a hardware counter value to the full 64-bit counter value that would be returned by a read on the fd. This is useful on architectures that allow user programs to read the hardware counters, such as PowerPC. The mmap will only succeed if the counter is a hardware counter monitoring the current process. On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter and translate it to the full 64-bit value in about 30ns using the mmapped page, compared to about 830ns for the read syscall on the counter, so this does give a significant performance improvement. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.297057964@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 18dc17d0a61..40b324e91bf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -143,6 +143,17 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -278,6 +289,9 @@ struct perf_counter { int oncpu; int cpu; + /* pointer to page shared with userspace via mmap */ + unsigned long user_page; + /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ @@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_update_userpage(struct perf_counter *counter); extern void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs); -- cgit From 9ab772cd535c4b256a577eae516f9c7462346b2d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function in which we take an hold a spin_lock if we drop the atomic to 0 this function takes and holds the mutex if we dec the atomic to 0. Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif -- cgit From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:10 +0100 Subject: perf_counter: new output ABI - part 1 Impact: Rework the perfcounter output ABI use sys_read() only for instant data and provide mmap() output for all async overflow data. The first mmap() determines the size of the output buffer. The mmap() size must be a PAGE_SIZE multiple of 1+pages, where pages must be a power of 2 or 0. Further mmap()s of the same fd must have the same size. Once all maps are gone, you can again mmap() with a new size. In case of 0 extra pages there is no data output and the first page only contains meta data. When there are data pages, a poll() event will be generated for each full page of data. Furthermore, the output is circular. This means that although 1 page is a valid configuration, its useless, since we'll start overwriting it the instant we report a full page. Future work will focus on the output format (currently maintained) where we'll likey want each entry denoted by a header which includes a type and length. Further future work will allow to splice() the fd, also containing the async overflow data -- splice() would be mutually exclusive with mmap() of the data. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.470536358@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40b324e91bf..2b5e66d5ebd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -152,6 +152,8 @@ struct perf_counter_mmap_page { __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + + __u32 data_head; /* head in the data section */ }; #ifdef __KERNEL__ @@ -218,21 +220,6 @@ struct hw_perf_counter { #endif }; -/* - * Hardcoded buffer length limit for now, for IRQ-fed events: - */ -#define PERF_DATA_BUFLEN 2048 - -/** - * struct perf_data - performance counter IRQ data sampling ... - */ -struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; -}; - struct perf_counter; /** @@ -256,6 +243,14 @@ enum perf_counter_active_state { struct file; +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; + atomic_t head; + struct perf_counter_mmap_page *user_page; + void *data_pages[0]; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -289,16 +284,15 @@ struct perf_counter { int oncpu; int cpu; - /* pointer to page shared with userspace via mmap */ - unsigned long user_page; + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; - /* read() / irq related data */ + /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ int wakeup_pending; - struct perf_data *irqdata; - struct perf_data *usrdata; - struct perf_data data[2]; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; -- cgit From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Mar 2009 13:18:16 +0100 Subject: perf_counter: fix perf_poll() Impact: fix kerneltop 100% CPU usage Only return a poll event when there's actually been one, poll_wait() doesn't actually wait for the waitq you pass it, it only enqueues you on it. Only once all FDs have been iterated and none of thm returned a poll-event will it schedule(). Also make it return POLL_HUP when there's not mmap() area to read from. Further, fix a silly bug in the write code. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arjan van de Ven Orig-LKML-Reference: <1237897096.24918.181.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b5e66d5ebd..48212c15b7d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -246,6 +246,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; + atomic_t wakeup; atomic_t head; struct perf_counter_mmap_page *user_page; void *data_pages[0]; -- cgit From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:23 +0100 Subject: perf_counter: output objects Provide a {type,size} header for each output entry. This should provide extensible output, and the ability to mix multiple streams. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.831607932@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48212c15b7d..c256635377d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,16 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +struct perf_event_header { + __u32 type; + __u32 size; +}; + +enum perf_event_type { + PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -260,6 +270,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; + int nr_siblings; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; -- cgit From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:25 +0100 Subject: perf_counter: optionally provide the pid/tid of the sampled task Allow cpu wide counters to profile userspace by providing what process the sample belongs to. This raises the first issue with the output type, lots of these options: group, tid, callchain, etc.. are non-exclusive and could be combined, suggesting a bitfield. However, things like the mmap() data stream doesn't fit in that. How to split the type field... Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.013775235@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c256635377d..7fdbdf8be77 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,8 +127,9 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ + include_tid : 1, /* include the tid */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; @@ -164,6 +165,8 @@ struct perf_event_header { enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + + __PERF_EVENT_TID = 0x100, }; #ifdef __KERNEL__ -- cgit From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Mar 2009 22:46:58 +1100 Subject: perf_counter: record time running and time enabled for each counter Impact: new functionality Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled. This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.) Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count: true_count_estimate = count * total_time_enabled / total_time_running This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0. This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly. In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running. This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses. It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.) In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation). [ v2: address review comments by Andrew Morton. ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Andrew Morton Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 53 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7fdbdf8be77..6bf67ce1762 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,16 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1, + PERF_FORMAT_TOTAL_TIME_RUNNING = 2, +}; + /* * Hardware event to monitor via a performance monitoring counter: */ @@ -281,6 +291,32 @@ struct perf_counter { enum perf_counter_active_state prev_state; atomic64_t count; + /* + * These are the total time in nanoseconds that the counter + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task counter) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the counter is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the counter was enabled + * tstamp_running: the notional time when the counter was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * counter was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -291,6 +327,13 @@ struct perf_counter { struct perf_counter *parent; struct list_head child_list; + /* + * These accumulate total time (in nanoseconds) that children + * counters have been enabled and running, respectively. + */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + /* * Protect attach/detach and child_list: */ @@ -339,6 +382,16 @@ struct perf_counter_context { int nr_active; int is_active; struct task_struct *task; + + /* + * time_now is the current time in nanoseconds since an arbitrary + * point in the past. For per-task counters, this is based on the + * task clock, and for per-cpu counters it is based on the cpu clock. + * time_lost is an offset from the task/cpu clock, used to make it + * appear that time only passes while the context is scheduled in. + */ + u64 time_now; + u64 time_lost; #endif }; -- cgit From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bf67ce1762..0d833228eee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -275,6 +275,10 @@ struct perf_mmap_data { void *data_pages[0]; }; +struct perf_wakeup_entry { + struct perf_wakeup_entry *next; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -350,7 +354,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ - int wakeup_pending; + struct perf_wakeup_entry wakeup; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; @@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); @@ -461,7 +465,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } @@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(u32 event, u64 nr, - int nmi, struct pt_regs *regs) { } +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + #endif #endif /* __KERNEL__ */ -- cgit From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:03 +0200 Subject: perf_counter: fix update_userpage() It just occured to me it is possible to have multiple contending updates of the userpage (mmap information vs overflow vs counter). This would break the seqlock logic. It appear the arch code uses this from NMI context, so we cannot possibly serialize its use, therefore separate the data_head update from it and let it return to its original use. The arch code needs to make sure there are no contending callers by disabling the counter before using it -- powerpc appears to do this nicely. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.241410660@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0d833228eee..8ac18852dcf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,10 +160,45 @@ struct perf_counter_hw_event { struct perf_counter_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * The index and offset should be read atomically using the seqlock: + * + * __u32 seq, index; + * __s64 offset; + * + * again: + * rmb(); + * seq = pc->lock; + * + * if (unlikely(seq & 1)) { + * cpu_relax(); + * goto again; + * } + * + * index = pc->index; + * offset = pc->offset; + * + * rmb(); + * if (pc->lock != seq) + * goto again; + * + * After this, index contains architecture specific counter index + 1, + * so that 0 means unavailable, offset contains the value to be added + * to the result of the raw timer read to obtain this counter's value. + */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + /* + * Control data for the mmap() data buffer. + * + * User-space reading this value should issue an rmb(), on SMP capable + * platforms, after reading this value -- see perf_counter_wakeup(). + */ __u32 data_head; /* head in the data section */ }; -- cgit From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:05 +0200 Subject: perf_counter: executable mmap() information Currently the profiling information returns userspace IPs but no way to correlate them to userspace code. Userspace could look into /proc/$pid/maps but that might not be current or even present anymore at the time of analyzing the IPs. Therefore provide means to track the mmap information and provide it in the output stream. XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Andrew Morton Orig-LKML-Reference: <20090330171023.417259499@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8ac18852dcf..037a81145ac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -137,9 +137,11 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ + include_tid : 1, /* include the tid */ + mmap : 1, /* include mmap data */ + munmap : 1, /* include munmap data */ - __reserved_1 : 54; + __reserved_1 : 52; __u32 extra_config_len; __u32 __reserved_4; @@ -211,6 +213,9 @@ enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + PERF_EVENT_MMAP = 2, + PERF_EVENT_MUNMAP = 3, + __PERF_EVENT_TID = 0x100, }; @@ -491,6 +496,12 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + +extern void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -511,6 +522,15 @@ static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + +static inline void +perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + +static inline void +perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + #endif #endif /* __KERNEL__ */ -- cgit From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:12 +0200 Subject: perf_counter: re-arrange the perf_event_type Breaks ABI yet again :-) Change the event type so that [0, 2^31-1] are regular event types, but [2^31, 2^32-1] forms a bitmask for overflow events. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.047961770@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 037a81145ac..edf5bfb7ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -210,13 +210,15 @@ struct perf_event_header { }; enum perf_event_type { - PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, PERF_EVENT_MMAP = 2, PERF_EVENT_MUNMAP = 3, - __PERF_EVENT_TID = 0x100, + PERF_EVENT_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = 1UL << 30, + __PERF_EVENT_TID = 1UL << 29, }; #ifdef __KERNEL__ -- cgit From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:14 +0200 Subject: perf_counter: provide generic callchain bits Provide the generic callchain support bits. If hw_event->callchain is set the arch specific perf_callchain() function is called upon to provide a perf_callchain_entry structure filled with the current callchain. If it does so, it is added to the overflow output event. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.254266860@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index edf5bfb7ff5..43083afffe0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,8 +140,9 @@ struct perf_counter_hw_event { include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + callchain : 1, /* add callchain data */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 __reserved_4; @@ -219,6 +220,7 @@ enum perf_event_type { PERF_EVENT_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = 1UL << 30, __PERF_EVENT_TID = 1UL << 29, + __PERF_EVENT_CALLCHAIN = 1UL << 28, }; #ifdef __KERNEL__ @@ -504,6 +506,15 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +#define MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + u64 nr; + u64 ip[MAX_STACK_DEPTH]; +}; + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:11:59 +0200 Subject: perf_counter: move the event overflow output bits to record_type Per suggestion from Paul, move the event overflow bits to record_type and sanitize the enums a bit. Breaks the ABI -- again ;-) Suggested-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.151921176@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 50 +++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 43083afffe0..06a6fba9f53 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,15 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - #define __PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -102,6 +93,17 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.record_type to request information + * in the overflow packets. + */ +enum perf_counter_record_format { + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_GROUP = 1U << 2, + PERF_RECORD_CALLCHAIN = 1U << 3, +}; + /* * Bits that can be set in hw_event.read_format to request that * reads on the counter should return the indicated quantities, @@ -125,8 +127,8 @@ struct perf_counter_hw_event { __u64 config; __u64 irq_period; - __u64 record_type; - __u64 read_format; + __u32 record_type; + __u32 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -137,12 +139,10 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ - callchain : 1, /* add callchain data */ - __reserved_1 : 51; + __reserved_1 : 53; __u32 extra_config_len; __u32 __reserved_4; @@ -212,15 +212,21 @@ struct perf_event_header { enum perf_event_type { - PERF_EVENT_GROUP = 1, - - PERF_EVENT_MMAP = 2, - PERF_EVENT_MUNMAP = 3, + PERF_EVENT_MMAP = 1, + PERF_EVENT_MUNMAP = 2, - PERF_EVENT_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = 1UL << 30, - __PERF_EVENT_TID = 1UL << 29, - __PERF_EVENT_CALLCHAIN = 1UL << 28, + /* + * Half the event type space is reserved for the counter overflow + * bitfields, as found in hw_event.record_type. + * + * These events will have types of the form: + * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + */ + PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = PERF_RECORD_IP, + __PERF_EVENT_TID = PERF_RECORD_TID, + __PERF_EVENT_GROUP = PERF_RECORD_GROUP, + __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, }; #ifdef __KERNEL__ -- cgit From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:01 +0200 Subject: perf_counter: per event wakeups By request, provide a way to request a wakeup every 'n' events instead of every page of output. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.323309784@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 06a6fba9f53..5428ba120d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -145,7 +145,7 @@ struct perf_counter_hw_event { __reserved_1 : 53; __u32 extra_config_len; - __u32 __reserved_4; + __u32 wakeup_events; /* wakeup every n events */ __u64 __reserved_2; __u64 __reserved_3; @@ -321,6 +321,7 @@ struct perf_mmap_data { int nr_pages; atomic_t wakeup; atomic_t head; + atomic_t events; struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; -- cgit From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5428ba120d7..90cce0c74a0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 254 struct perf_callchain_entry { - u64 nr; + u32 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; -- cgit From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:04 +0200 Subject: perf_counter: update mmap() counter read Paul noted that we don't need SMP barriers for the mmap() counter read because its always on the same cpu (otherwise you can't access the hw counter anyway). So remove the SMP barriers and replace them with regular compiler barriers. Further, update the comment to include a race free method of reading said hardware counter. The primary change is putting the pmc_read inside the seq-loop, otherwise we can still race and read rubbish. Noticed-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.577951445@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 90cce0c74a0..f2b914de3f0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -167,30 +167,28 @@ struct perf_counter_mmap_page { /* * Bits needed to read the hw counters in user-space. * - * The index and offset should be read atomically using the seqlock: - * - * __u32 seq, index; - * __s64 offset; + * u32 seq; + * s64 count; * * again: - * rmb(); * seq = pc->lock; - * * if (unlikely(seq & 1)) { * cpu_relax(); * goto again; * } * - * index = pc->index; - * offset = pc->offset; + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * rmb(); + * barrier(); * if (pc->lock != seq) * goto again; * - * After this, index contains architecture specific counter index + 1, - * so that 0 means unavailable, offset contains the value to be added - * to the result of the raw timer read to obtain this counter's value. + * NOTE: for obvious reason this only works on self-monitoring + * processes. */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ -- cgit From a2e87d06ddbe6e6fdb8d6d2e5e985efe4efb07dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:44:59 +0200 Subject: perf_counter: update mmap() counter read, take 2 Update the userspace read method. Paul noted that: - userspace cannot observe ->lock & 1 on the same cpu. - we need a barrier() between reading ->lock and ->index to ensure we read them in that prticular order. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.368446033@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f2b914de3f0..e22ab47a2f4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -170,22 +170,18 @@ struct perf_counter_mmap_page { * u32 seq; * s64 count; * - * again: - * seq = pc->lock; - * if (unlikely(seq & 1)) { - * cpu_relax(); - * goto again; - * } + * do { + * seq = pc->lock; * - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * barrier(); - * if (pc->lock != seq) - * goto again; + * barrier(); + * } while (pc->lock != seq); * * NOTE: for obvious reason this only works on self-monitoring * processes. -- cgit From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:00 +0200 Subject: perf_counter: add more context information Change the callchain context entries to u16, so as to gain some space. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.457320003@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e22ab47a2f4..f9d5cf0bfbd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -507,10 +507,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 254 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u32 nr, hv, kernel, user; + u16 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; -- cgit From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:01 +0200 Subject: perf_counter: SIGIO support Provide support for fcntl() I/O availability signals. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.579788800@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f9d5cf0bfbd..8d5d11b8d01 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -238,6 +238,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -398,6 +399,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; + struct fasync_struct *fasync; /* optional: for NMIs */ struct perf_wakeup_entry wakeup; -- cgit From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:02 +0200 Subject: perf_counter: generalize pending infrastructure Prepare the pending infrastructure to do more than wakeups. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.634732847@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8d5d11b8d01..977fb15a53f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -321,8 +321,9 @@ struct perf_mmap_data { void *data_pages[0]; }; -struct perf_wakeup_entry { - struct perf_wakeup_entry *next; +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); }; /** @@ -401,7 +402,7 @@ struct perf_counter { wait_queue_head_t waitq; struct fasync_struct *fasync; /* optional: for NMIs */ - struct perf_wakeup_entry wakeup; + struct perf_pending_entry pending; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; -- cgit From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 977fb15a53f..ca2d4df29e0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs); +extern int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ -- cgit From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:06 +0200 Subject: perf_counter: PERF_RECORD_TIME By popular request, provide means to log a timestamp along with the counter overflow event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.024173282@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ca2d4df29e0..928a7fae096 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,7 @@ enum perf_counter_record_format { PERF_RECORD_TID = 1U << 1, PERF_RECORD_GROUP = 1U << 2, PERF_RECORD_CALLCHAIN = 1U << 3, + PERF_RECORD_TIME = 1U << 4, }; /* @@ -221,6 +222,7 @@ enum perf_event_type { __PERF_EVENT_TID = PERF_RECORD_TID, __PERF_EVENT_GROUP = PERF_RECORD_GROUP, __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, + __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ -- cgit From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:07 +0200 Subject: perf_counter: counter overflow limit Provide means to auto-disable the counter after 'n' overflow events. Create the counter with hw_event.disabled = 1, and then issue an ioctl(fd, PREF_COUNTER_IOC_REFRESH, n); to set the limit and enable the counter. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.083139737@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 928a7fae096..ef4dcbff75a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -155,8 +155,9 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) /* * Structure of the page that can be mapped via mmap @@ -403,9 +404,14 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; - /* optional: for NMIs */ + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_disable; struct perf_pending_entry pending; + atomic_t event_limit; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif -- cgit From 0c593b3411341e3a05a61f5527df36ab02bd11e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:08 +0200 Subject: perf_counter: comment the perf_event_type stuff Describe the event format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.211174347@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ef4dcbff75a..81220188d05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -208,6 +208,20 @@ struct perf_event_header { enum perf_event_type { + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, @@ -217,6 +231,24 @@ enum perf_event_type { * * These events will have types of the form: * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && __PERF_EVENT_IP + * { u32 pid, tid; } && __PERF_EVENT_TID + * + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * + * { u64 time; } && __PERF_EVENT_TIME + * }; */ PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = PERF_RECORD_IP, -- cgit From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:09 +0200 Subject: perf_counter: change event definition Currently the definition of an event is slightly ambiguous. We have wakeup events, for poll() and SIGIO, which are either generated when a record crosses a page boundary (hw_events.wakeup_events == 0), or every wakeup_events new records. Now a record can be either a counter overflow record, or a number of different things, like the mmap PROT_EXEC region notifications. Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only considers counter overflows. This patch changes then wakeup_events and SIGIO notification to only consider overflow events. Furthermore it changes the SIGIO notification to report SIGHUP when the event limit is reached and the counter will be disabled. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.266679874@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81220188d05..0f5a4005048 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -439,6 +439,7 @@ struct perf_counter { /* delayed work for NMIs and such */ int pending_wakeup; + int pending_kill; int pending_disable; struct perf_pending_entry pending; -- cgit From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:10 +0200 Subject: perf_counter: rework context time Since perf_counter_context is switched along with tasks, we can maintain the context time without using the task runtime clock. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.353552838@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0f5a4005048..7f5d353d78a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -477,14 +477,10 @@ struct perf_counter_context { struct task_struct *task; /* - * time_now is the current time in nanoseconds since an arbitrary - * point in the past. For per-task counters, this is based on the - * task clock, and for per-cpu counters it is based on the cpu clock. - * time_lost is an offset from the task/cpu clock, used to make it - * appear that time only passes while the context is scheduled in. + * Context clock, runs when context enabled. */ - u64 time_now; - u64 time_lost; + u64 time; + u64 timestamp; #endif }; -- cgit From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:12 +0200 Subject: perf_counter: remove rq->lock usage Now that all the task runtime clock users are gone, remove the ugly rq->lock usage from perf counters, which solves the nasty deadlock seen when a software task clock counter was read from an NMI overflow context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.531137582@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b6d2887a5d8..080d1fd461d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,8 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern void curr_rq_lock_irq_save(unsigned long *flags); -extern void curr_rq_unlock_irq_restore(unsigned long *flags); extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); -- cgit From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:26 +0200 Subject: perf_counter: provide misc bits in the event header Limit the size of each record to 64k (or should we count in multiples of u64 and have a 512K limit?), this gives 16 bits or spare room in the header, which we can use for misc bits, so as to not have to grow the record with u64 every time we have a few bits to report. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.769271806@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7f5d353d78a..5bd8817b12d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,9 +201,13 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) + struct perf_event_header { __u32 type; - __u32 size; + __u16 misc; + __u16 size; }; enum perf_event_type { -- cgit From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:27 +0200 Subject: perf_counter: use misc field to widen type Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that we can have the full 32bit for PERF_RECORD_ bits. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.891867663@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5bd8817b12d..4809ae18a94 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,8 +201,9 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -230,36 +231,27 @@ enum perf_event_type { PERF_EVENT_MUNMAP = 2, /* - * Half the event type space is reserved for the counter overflow - * bitfields, as found in hw_event.record_type. - * - * These events will have types of the form: - * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * will be PERF_RECORD_* * * struct { * struct perf_event_header header; * - * { u64 ip; } && __PERF_EVENT_IP - * { u32 pid, tid; } && __PERF_EVENT_TID + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, * kernel, * user; - * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * - * { u64 time; } && __PERF_EVENT_TIME + * { u64 time; } && PERF_RECORD_TIME * }; */ - PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = PERF_RECORD_IP, - __PERF_EVENT_TID = PERF_RECORD_TID, - __PERF_EVENT_GROUP = PERF_RECORD_GROUP, - __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, - __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ -- cgit From 8740f9418c78dcad694b46ab25d1645d5aef1f5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:29 +0200 Subject: perf_counter: add some comments Add a few comments because I was forgetting what field what for what functionality. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.036984214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4809ae18a94..8bf764fc622 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -344,10 +344,12 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; - int nr_pages; - atomic_t wakeup; - atomic_t head; - atomic_t events; + int nr_pages; /* nr of data pages */ + + atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t head; /* write position */ + atomic_t events; /* event limit */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; -- cgit From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bf764fc622..a70a55f2759 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -142,8 +142,9 @@ struct perf_counter_hw_event { exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + comm : 1, /* include comm data */ - __reserved_1 : 53; + __reserved_1 : 52; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -230,6 +231,16 @@ enum perf_event_type { PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -545,6 +556,8 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +extern void perf_counter_comm(struct task_struct *tsk); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -583,6 +596,7 @@ static inline void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } +static inline void perf_counter_comm(struct task_struct *tsk) { } #endif #endif /* __KERNEL__ */ -- cgit From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:32 +0200 Subject: perf_counter: move PERF_RECORD_TIME Move PERF_RECORD_TIME so that all the fixed length items come before the variable length ones. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.307926436@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a70a55f2759..8bd1be58c93 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -100,9 +100,9 @@ enum sw_event_ids { enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, - PERF_RECORD_GROUP = 1U << 2, - PERF_RECORD_CALLCHAIN = 1U << 3, - PERF_RECORD_TIME = 1U << 4, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_GROUP = 1U << 3, + PERF_RECORD_CALLCHAIN = 1U << 4, }; /* @@ -250,6 +250,7 @@ enum perf_event_type { * * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -259,8 +260,6 @@ enum perf_event_type { * kernel, * user; * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN - * - * { u64 time; } && PERF_RECORD_TIME * }; */ }; -- cgit From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bd1be58c93..c22363a4f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -101,8 +101,9 @@ enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_GROUP = 1U << 3, - PERF_RECORD_CALLCHAIN = 1U << 4, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, }; /* @@ -251,6 +252,7 @@ enum perf_event_type { * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs); + int nmi, struct pt_regs *regs, u64 addr); /* * Return 1 for a software counter, 0 for a hardware counter */ @@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter) perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); @@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } - +perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } static inline void perf_counter_mmap(unsigned long addr, unsigned long len, -- cgit From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:45 +0200 Subject: perf_counter: sysctl for system wide perf counters Impact: add sysctl for paranoid/relaxed perfcounters policy Allow the use of system wide perf counters to everybody, but provide a sysctl to disable it for the paranoid security minded. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.514046352@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c22363a4f74..98143288530 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -568,6 +568,8 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int sysctl_perf_counter_priv; + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 20:38:21 +0200 Subject: perfcounters, sched: remove __task_delta_exec() This function was left orphan by the latest round of sw-counter cleanups. [ Impact: remove unused kernel function ] Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 080d1fd461d..a77c6007dc9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,7 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t, cputime_t); -- cgit From 829b42dd395c5801f6ae87da87ecbdcfd5ef1a6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:59 +0200 Subject: perf_counter, x86: declare perf_max_counters only for CONFIG_PERF_COUNTERS This is only needed for CONFIG_PERF_COUNTERS enabled. [ Impact: cleanup ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98143288530..be10b3ffe32 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -512,12 +512,13 @@ struct perf_cpu_context { int recursion[4]; }; +#ifdef CONFIG_PERF_COUNTERS + /* * Set by architecture code: */ extern int perf_max_counters; -#ifdef CONFIG_PERF_COUNTERS extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); -- cgit From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index be10b3ffe32..c3db52dc876 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -334,9 +334,9 @@ struct hw_perf_counter { struct perf_counter; /** - * struct hw_perf_counter_ops - performance counter hw ops + * struct pmu - generic performance monitoring unit */ -struct hw_perf_counter_ops { +struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); @@ -381,7 +381,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; enum perf_counter_active_state state; enum perf_counter_active_state prev_state; @@ -519,8 +519,7 @@ struct perf_cpu_context { */ extern int perf_max_counters; -extern const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter); +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); -- cgit From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c3db52dc876..41aed427005 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -318,7 +318,7 @@ struct hw_perf_counter { unsigned long config_base; unsigned long counter_base; int nmi; - unsigned int idx; + int idx; }; union { /* software */ atomic64_t count; -- cgit From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:16 +0200 Subject: perf_counter: fix race in perf_output_* When two (or more) contexts output to the same buffer, it is possible to observe half written output. Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing perf_counter_overflow(). If CPU1 does a wakeup and exposes head to user-space, then CPU2 can observe the data CPU0 is still writing. [ Impact: fix occasionally corrupted profiling records ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.007821627@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 41aed427005..f776851f8c4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,10 +358,13 @@ struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ - atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ atomic_t events; /* event limit */ + atomic_t wakeup_head; /* completed head */ + atomic_t lock; /* concurrent writes */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; -- cgit From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:13:30 +0200 Subject: perf_counter: initialize the per-cpu context earlier percpu scheduling for perfcounters wants to take the context lock, but that lock first needs to be initialized. Currently it is an early_initcall() - but that is too late, the task tick runs much sooner than that. Call it explicitly from the scheduler init sequence instead. [ Impact: fix access-before-init crash ] LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f776851f8c4..a356fa69796 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -573,6 +573,8 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern void perf_counter_init(void); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -600,9 +602,10 @@ perf_counter_mmap(unsigned long addr, unsigned long len, static inline void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } + unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_init(void) { } #endif #endif /* __KERNEL__ */ -- cgit From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:22 +0200 Subject: perf_counter: uncouple data_head updates from wakeups Keep data_head up-to-date irrespective of notifications. This fixes the case where you disable a counter and don't get a notification for the last few pending events, and it also allows polling usage. [ Impact: increase precision of perfcounter mmap-ed fields ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155436.925084300@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a356fa69796..17b63105f2a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -362,9 +362,11 @@ struct perf_mmap_data { atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t wakeup_head; /* completed head */ + atomic_t done_head; /* completed head */ atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; -- cgit From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:23 +0200 Subject: perf_counter: add ioctl(PERF_COUNTER_IOC_RESET) Provide a way to reset an existing counter - this eases PAPI libraries around perfcounters. Similar to read() it doesn't collapse pending child counters. [ Impact: new perfcounter fd ioctl method to reset counters ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155437.022272933@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 17b63105f2a..0fcbf34a4f7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,6 +160,7 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) /* * Structure of the page that can be mapped via mmap -- cgit From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:24 +0200 Subject: perf_counter: provide an mlock threshold Provide a threshold to relax the mlock accounting, increasing usability. Each counter gets perf_counter_mlock_kb for free. [ Impact: allow more mmap buffering ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.112113632@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0fcbf34a4f7..00081d84169 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,6 +358,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ + int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ @@ -575,6 +576,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); -- cgit From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:22 +0200 Subject: perf_counter: rework ioctl()s Corey noticed that ioctl()s on grouped counters didn't work on the whole group. This extends the ioctl() interface to take a second argument that is interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle the behaviour. Having this flag gives the greatest flexibility, allowing you to individually enable/disable/reset counters in a group, or all together. [ Impact: fix group counter enable/disable semantics ] Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090508170028.837558214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 00081d84169..88f863ec274 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -157,10 +157,14 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) +#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; /* * Structure of the page that can be mapped via mmap -- cgit From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:23 +0200 Subject: perf_counter: add PERF_RECORD_CONFIG Much like CONFIG_RECORD_GROUP records the hw_event.config to identify the values, allow to record this for all counters. [ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.923228280@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 88f863ec274..0e6303d36c6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,6 +104,7 @@ enum perf_counter_record_format { PERF_RECORD_ADDR = 1U << 3, PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, }; /* @@ -258,6 +259,7 @@ enum perf_event_type { * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP -- cgit From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:24 +0200 Subject: perf_counter: add PERF_RECORD_CPU Allow recording the CPU number the event was generated on. RFC: this leaves a u32 as reserved, should we fill in the node_id() there, or leave this open for future extention, as userspace can already easily do the cpu->node mapping if needed. [ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170029.008627711@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e6303d36c6..614f921d616 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -105,6 +105,7 @@ enum perf_counter_record_format { PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP -- cgit From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. [ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 614f921d616..e543ecc129f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); -extern u64 hw_perf_save_disable(void); -extern void hw_perf_restore(u64 ctrl); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } -static inline u64 hw_perf_save_disable(void) { return 0; } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -- cgit From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:27 +0200 Subject: perf_counter: per user mlock gift Instead of a per-process mlock gift for perf-counters, use a per-user gift so that there is less of a DoS potential. [ Impact: allow less worst-case unprivileged memory consumption ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.496182835@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d1857580a13..ff59d123151 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,6 +674,10 @@ struct user_struct { struct work_struct work; #endif #endif + +#ifdef CONFIG_PERF_COUNTERS + atomic_long_t locked_vm; +#endif }; extern int uids_sysfs_init(void); -- cgit From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e543ecc129f..004b6e162b9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -130,7 +130,11 @@ struct perf_counter_hw_event { */ __u64 config; - __u64 irq_period; + union { + __u64 irq_period; + __u64 irq_freq; + }; + __u32 record_type; __u32 read_format; @@ -146,8 +150,9 @@ struct perf_counter_hw_event { mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -337,6 +342,7 @@ struct hw_perf_counter { atomic64_t prev_count; u64 irq_period; atomic64_t period_left; + u64 interrupts; #endif }; -- cgit From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 21:48:08 +1000 Subject: perf_counter: allow arch to supply event misc flags and instruction pointer At present the values we put in overflow events for the misc flags indicating processor mode and the instruction pointer are obtained using the standard user_mode() and instruction_pointer() functions. Those functions tell you where the performance monitor interrupt was taken, which might not be exactly where the counter overflow occurred, for example because interrupts were disabled at the point where the overflow occurred, or because the processor had many instructions in flight and chose to complete some more instructions beyond the one that caused the counter overflow. Some architectures (e.g. powerpc) can supply more precise information about where the counter overflow occurred and the processor mode at that point. This introduces new functions, perf_misc_flags() and perf_instruction_pointer(), which arch code can override to provide more precise information if available. They have default implementations which are identical to the existing code. This also adds a new misc flag value, PERF_EVENT_MISC_HYPERVISOR, for the case where a counter overflow occurred in the hypervisor. We encode the processor mode in the 2 bits previously used to indicate user or kernel mode; the values for user and kernel mode are unchanged and hypervisor mode is indicated by both bits being set. [ Impact: generalize perfcounter core facilities ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 004b6e162b9..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -215,8 +215,11 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) #define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { @@ -596,6 +599,12 @@ extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ + PERF_EVENT_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix counter inheritance race Context rotation should not occur when we are in the middle of walking the counter list when inheriting counters ... [ Impact: fix occasionally incorrect perf stat results ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..13cb2fbbf33 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,6 +508,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + int rr_allowed; struct task_struct *task; /* -- cgit From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:19 +0200 Subject: perf_counter: Solve the rotate_ctx vs inherit race differently Instead of disabling RR scheduling of the counters, use a different list that does not get rotated to iterate the counters on inheritance. [ Impact: cleanup, optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.237504544@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 13cb2fbbf33..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,7 +508,6 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; - int rr_allowed; struct task_struct *task; /* -- cgit From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:20 +0200 Subject: perf_counter: Log irq_period changes For the dynamic irq_period code, log whenever we change the period so that analyzing code can normalize the event flow. [ Impact: add new feature to allow more precise profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.298769743@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..f612941ef46 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -257,6 +257,14 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u64 irq_period; + * }; + */ + PERF_EVENT_PERIOD = 4, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* -- cgit From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 13 ------------- include/linux/perf_counter.h | 4 +--- include/linux/sched.h | 6 ++++-- 3 files changed, 5 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa..d87247d2641 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46..07130900546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d123151..9714d450f41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; -- cgit From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:27:22 +1000 Subject: perf_counter: Optimize context switch between identical inherited contexts When monitoring a process and its descendants with a set of inherited counters, we can often get the situation in a context switch where both the old (outgoing) and new (incoming) process have the same set of counters, and their values are ultimately going to be added together. In that situation it doesn't matter which set of counters are used to count the activity for the new process, so there is really no need to go through the process of reading the hardware counters and updating the old task's counters and then setting up the PMU for the new task. This optimizes the context switch in this situation. Instead of scheduling out the perf_counter_context for the old task and scheduling in the new context, we simply transfer the old context to the new task and keep using it without interruption. The new context gets transferred to the old task. This means that both tasks still have a valid perf_counter_context, so no special case is introduced when the old task gets scheduled in again, either on this CPU or another CPU. The equivalence of contexts is detected by keeping a pointer in each cloned context pointing to the context it was cloned from. To cope with the situation where a context is changed by adding or removing counters after it has been cloned, we also keep a generation number on each context which is incremented every time a context is changed. When a context is cloned we take a copy of the parent's generation number, and two cloned contexts are equivalent only if they have the same parent and the same generation number. In order that the parent context pointer remains valid (and is not reused), we increment the parent context's reference count for each context cloned from it. Since we don't have individual fds for the counters in a cloned context, the only thing that can make two clones of a given parent different after they have been cloned is enabling or disabling all counters with prctl. To account for this, we keep a count of the number of enabled counters in each context. Two contexts must have the same number of enabled counters to be considered equivalent. Here are some measurements of the context switch time as measured with the lat_ctx benchmark from lmbench, comparing the times obtained with and without this patch series: -----Unmodified----- With this patch series Counters: none 2 HW 4H+4S none 2 HW 4H+4S 2 processes: Average 3.44 6.45 11.24 3.12 3.39 3.60 St dev 0.04 0.04 0.13 0.05 0.17 0.19 8 processes: Average 6.45 8.79 14.00 5.57 6.23 7.57 St dev 1.27 1.04 0.88 1.42 1.46 1.42 32 processes: Average 5.56 8.43 13.78 5.28 5.55 7.15 St dev 0.41 0.47 0.53 0.54 0.57 0.81 The numbers are the mean and standard deviation of 20 runs of lat_ctx. The "none" columns are lat_ctx run directly without any counters. The "2 HW" columns are with lat_ctx run under perfstat, counting cycles and instructions. The "4H+4S" columns are lat_ctx run under perfstat with 4 hardware counters and 4 software counters (cycles, instructions, cache references, cache misses, task clock, context switch, cpu migrations, and page faults). [ Impact: performance optimization of counter context-switches ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 07130900546..4cae01a5045 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,6 +513,7 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; + int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; @@ -522,6 +523,14 @@ struct perf_counter_context { */ u64 time; u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. + */ + struct perf_counter_context *parent_ctx; + u32 parent_gen; + u32 generation; }; /** @@ -552,7 +561,8 @@ extern int perf_max_counters; extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); -- cgit From 910431c7f2e963017d767b29c80ae706421e569f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 May 2009 12:32:15 +0200 Subject: perf_counter: fix !PERF_COUNTERS build failure Update the !CONFIG_PERF_COUNTERS prototype too, for perf_counter_task_sched_out(). [ Impact: build fix ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4cae01a5045..2eedae8498d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -625,7 +625,8 @@ extern void perf_counter_init(void); static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void -perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } -- cgit From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:55 +0200 Subject: perf_counter: Fix dynamic irq_period logging We call perf_adjust_freq() from perf_counter_task_tick() which is is called under the rq->lock causing lock recursion. However, it's no longer required to be called under the rq->lock, so remove it from under it. Also, fix up some related comments. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.476197912@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2eedae8498d..23ddd29730f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -260,6 +260,7 @@ enum perf_event_type { /* * struct { * struct perf_event_header header; + * u64 time; * u64 irq_period; * }; */ -- cgit From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:56 +0200 Subject: perf_counter: Sanitize counter->mutex s/counter->mutex/counter->child_mutex/ and make sure its only used to protect child_list. The usage in __perf_counter_exit_task() doesn't appear to be problematic since ctx->mutex also covers anything related to fd tear-down. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.533186528@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 23ddd29730f..4ab8050eb9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -452,9 +452,6 @@ struct perf_counter { struct perf_counter_context *ctx; struct file *filp; - struct perf_counter *parent; - struct list_head child_list; - /* * These accumulate total time (in nanoseconds) that children * counters have been enabled and running, respectively. @@ -465,7 +462,9 @@ struct perf_counter { /* * Protect attach/detach and child_list: */ - struct mutex mutex; + struct mutex child_mutex; + struct list_head child_list; + struct perf_counter *parent; int oncpu; int cpu; -- cgit From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:00 +0200 Subject: perf_counter: Change pctrl() behaviour Instead of en/dis-abling all counters acting on a particular task, en/dis- able all counters we created. [ v2: fix crash on first counter enable ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.916937244@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 ++++++++++ include/linux/perf_counter.h | 3 +++ include/linux/sched.h | 2 ++ 3 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d87247d2641..353c0ac7723 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,6 +108,15 @@ extern struct group_info init_groups; extern struct cred init_cred; +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ + .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -171,6 +180,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4ab8050eb9e..4159ee5940f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -469,6 +469,9 @@ struct perf_counter { int oncpu; int cpu; + struct list_head owner_entry; + struct task_struct *owner; + /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9714d450f41..bc9326dcdde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1389,6 +1389,8 @@ struct task_struct { #endif #ifdef CONFIG_PERF_COUNTERS struct perf_counter_context *perf_counter_ctxp; + struct mutex perf_counter_mutex; + struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; -- cgit From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:01 +0200 Subject: perf_counter: Remove perf_counter_context::nr_enabled now that pctrl() no longer disables other people's counters, remove the PMU cache code that deals with that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163013.032998331@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4159ee5940f..2ddf5e3c551 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -516,7 +516,6 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; - int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; -- cgit From e527ea312f31e88a7fa5472b71db71c565b0d44f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:25 +0200 Subject: perf_counter: Remove unused ABI bits extra_config_len isn't used for anything, remove it. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.116035832@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2ddf5e3c551..b1f2bac09f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -154,11 +154,11 @@ struct perf_counter_hw_event { __reserved_1 : 51; - __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ + __u32 __reserved_2; - __u64 __reserved_2; __u64 __reserved_3; + __u64 __reserved_4; }; /* -- cgit From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:27 +0200 Subject: perf_counter: Propagate inheritance failures down the fork() path Fail fork() when we fail inheritance for some reason (-ENOMEM most likely). Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.324656474@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b1f2bac09f9..d3e85de9bf1 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -566,7 +566,7 @@ extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *child); +extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); @@ -631,7 +631,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -- cgit From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d3e85de9bf1..0c160be2078 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); -extern void perf_counter_unthrottle(void); extern void __perf_disable(void); extern bool __perf_enable(void); extern void perf_disable(void); @@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -static inline void perf_counter_unthrottle(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0c160be2078..e3a7585d3e4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -266,6 +266,15 @@ enum perf_event_type { */ PERF_EVENT_PERIOD = 4, + /* + * struct { + * struct perf_event_header header; + * u64 time; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -367,6 +376,7 @@ struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); + void (*unthrottle) (struct perf_counter *counter); }; /** @@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_limit; extern void perf_counter_init(void); -- cgit From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 22:03:26 +0200 Subject: perf_counter: fix warning & lockup - remove bogus warning - fix wakeup from NMI path lockup - also fix up whitespace noise in perf_counter.h Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 78 ++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e3a7585d3e4..2b16ed37b74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,7 +73,7 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ +#define __PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -98,14 +98,14 @@ enum sw_event_ids { * in the overflow packets. */ enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -235,13 +235,13 @@ enum perf_event_type { * correlate userspace IPs to code. They have the following structure: * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; * }; */ PERF_EVENT_MMAP = 1, @@ -249,27 +249,27 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * char comm[]; + * u32 pid, tid; + * char comm[]; * }; */ PERF_EVENT_COMM = 3, /* * struct { - * struct perf_event_header header; - * u64 time; - * u64 irq_period; + * struct perf_event_header header; + * u64 time; + * u64 irq_period; * }; */ PERF_EVENT_PERIOD = 4, /* * struct { - * struct perf_event_header header; - * u64 time; + * struct perf_event_header header; + * u64 time; * }; */ PERF_EVENT_THROTTLE = 5, @@ -280,23 +280,23 @@ enum perf_event_type { * will be PERF_RECORD_* * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * { u64 ip; } && PERF_RECORD_IP - * { u32 pid, tid; } && PERF_RECORD_TID - * { u64 time; } && PERF_RECORD_TIME - * { u64 addr; } && PERF_RECORD_ADDR - * { u64 config; } && PERF_RECORD_CONFIG - * { u32 cpu, res; } && PERF_RECORD_CPU + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * - * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * - * { u16 nr, - * hv, - * kernel, - * user; - * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * }; */ }; @@ -406,7 +406,7 @@ struct perf_mmap_data { atomic_t wakeup; /* needs a wakeup */ struct perf_counter_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[0]; }; struct perf_pending_entry { @@ -422,7 +422,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; - int nr_siblings; + int nr_siblings; struct perf_counter *group_leader; const struct pmu *pmu; -- cgit From d3e78ee3d015dac1794433abb6403b6fc8e70e10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2009 11:41:50 +0200 Subject: perf_counter: Fix perf_counter_init_task() on !CONFIG_PERF_COUNTERS Pointed out by compiler warnings: tip/include/linux/perf_counter.h:644: warning: no return statement in function returning non-void Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b16ed37b74..a65ddc58051 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -641,7 +641,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline int perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { return 0; } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -- cgit From c93f7669098eb97c5376e5396e3dfb734c17df4f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 28 May 2009 22:18:17 +1000 Subject: perf_counter: Fix race in attaching counters to tasks and exiting Commit 564c2b21 ("perf_counter: Optimize context switch between identical inherited contexts") introduced a race where it is possible that a counter being attached to a task could get attached to the wrong task, if the task is one that has inherited its context from another task via fork. This happens because the optimized context switch could switch the context to another task after find_get_context has read task->perf_counter_ctxp. In fact, it's possible that the context could then get freed, if the other task then exits. This fixes the problem by protecting both the context switch and the critical code in find_get_context with spinlocks. The context switch locks the cxt->lock of both the outgoing and incoming contexts before swapping them. That means that once code such as find_get_context has obtained the spinlock for the context associated with a task, the context can't get swapped to another task. However, the context may have been swapped in the interval between reading task->perf_counter_ctxp and getting the lock, so it is necessary to check and retry. To make sure that none of the contexts being looked at in find_get_context can get freed, this changes the context freeing code to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no contexts can get freed. This part of the patch is lifted from a patch posted by Peter Zijlstra. This also adds a check to make sure that we can't add a counter to a task that is exiting. There is also a race between perf_counter_exit_task and find_get_context; this solves the race by moving the get_ctx that was in perf_counter_alloc into the locked region in find_get_context, so that once find_get_context has got the context for a task, it won't get freed even if the task calls perf_counter_exit_task. It doesn't matter if new top-level (non-inherited) counters get attached to the context after perf_counter_exit_task has detached the context from the task. They will just stay there and never get scheduled in until the counters' fds get closed, and then perf_release will remove them from the context and eventually free the context. With this, we are now doing the unclone in find_get_context rather than when a counter was added to or removed from a context (actually, we were missing the unclone_ctx() call when adding a counter to a context). We don't need to unclone when removing a counter from a context because we have no way to remove a counter from a cloned context. This also takes out the smp_wmb() in find_get_context, which Peter Zijlstra pointed out was unnecessary because the cmpxchg implies a full barrier anyway. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a65ddc58051..717bf3b59ba 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -541,8 +541,9 @@ struct perf_counter_context { * been cloned (inherited) from a common ancestor. */ struct perf_counter_context *parent_ctx; - u32 parent_gen; - u32 generation; + u64 parent_gen; + u64 generation; + struct rcu_head rcu_head; }; /** -- cgit From bbbee90829304d156c12b171c0ac7e6e1aba8b90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2009 14:25:58 +0200 Subject: perf_counter: Ammend cleanup in fork() fail When fork() fails we cannot use perf_counter_exit_task() since that assumes to operate on current. Write a new helper that cleans up unused/clean contexts. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 717bf3b59ba..519a41bba24 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -579,6 +579,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); +extern void perf_counter_free_task(struct task_struct *task); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void __perf_disable(void); @@ -644,6 +645,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline int perf_counter_init_task(struct task_struct *child) { return 0; } static inline void perf_counter_exit_task(struct task_struct *child) { } +static inline void perf_counter_free_task(struct task_struct *task) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_disable(void) { } -- cgit From 25346b93ca079080c9cb23331db5c4f6404e8530 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:48:12 +1000 Subject: perf_counter: Provide functions for locking and pinning the context for a task This abstracts out the code for locking the context associated with a task. Because the context might get transferred from one task to another concurrently, we have to check after locking the context that it is still the right context for the task and retry if not. This was open-coded in find_get_context() and perf_counter_init_task(). This adds a further function for pinning the context for a task, i.e. marking it so it can't be transferred to another task. This adds a 'pin_count' field to struct perf_counter_context to indicate that a context is pinned, instead of the previous method of setting the parent_gen count to all 1s. Pinning the context with a pin_count is easier to undo and doesn't require saving the parent_gen value. This also adds a perf_unpin_context() to undo the effect of perf_pin_task_context() and changes perf_counter_init_task to use it. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.34748.755674.596386@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 519a41bba24..81ec79c9f19 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -543,6 +543,7 @@ struct perf_counter_context { struct perf_counter_context *parent_ctx; u64 parent_gen; u64 generation; + int pin_count; struct rcu_head rcu_head; }; -- cgit From 22a4f650d686eeaac3629dae1c4294381485efdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Jun 2009 10:13:37 +0200 Subject: perf_counter: Tidy up style details - whitespace fixlets - make local variable definitions more consistent [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81ec79c9f19..0e57d8cc5a3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -562,7 +562,7 @@ struct perf_cpu_context { * * task, softirq, irq, nmi context */ - int recursion[4]; + int recursion[4]; }; #ifdef CONFIG_PERF_COUNTERS -- cgit From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:52:30 +1000 Subject: perf_counter: Fix cpu migration counter This fixes the cpu migration software counter to count correctly even when contexts get swapped from one task to another. Previously the cpu migration counts reported by perf stat were bogus, ranging from negative to several thousand for a single "lat_ctx 2 8 32" run. With this patch the cpu migration count reported for "lat_ctx 2 8 32" is almost always between 35 and 44. This fixes the problem by adding a call into the perf_counter code from set_task_cpu when tasks are migrated. This enables us to use the generic swcounter code (with some modifications) for the cpu migration counter. This modifies the swcounter code to allow a NULL regs pointer to be passed in to perf_swcounter_ctx_event() etc. The cpu migration counter does this because there isn't necessarily a pt_regs struct for the task available. In this case, the counter will not have interrupt capability - but the migration counter didn't have interrupt capability before, so this is no loss. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e57d8cc5a3..deb9acf9ad2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len, extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_task_migration(struct task_struct *task, int cpu); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len, static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } +static inline void perf_counter_task_migration(struct task_struct *task, + int cpu) { } #endif #endif /* __KERNEL__ */ -- cgit From bf4e0ed3d027ce581be18496036862131b5f32aa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:53:16 +1000 Subject: perf_counter: Remove unused prev_state field This removes the prev_state field of struct perf_counter since it is now unused. It was only used by the cpu migration counter, which doesn't use it any more. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35052.915728.626374@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index deb9acf9ad2..d970fbc16af 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -427,7 +427,6 @@ struct perf_counter { const struct pmu *pmu; enum perf_counter_active_state state; - enum perf_counter_active_state prev_state; atomic64_t count; /* -- cgit From 709e50cf870e61745b39552044aa6c7c38e4f9e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 14:13:15 +0200 Subject: perf_counter: Use PID namespaces properly Stop using task_struct::pid and start using PID namespaces. PIDs will be reported in the PID namespace of the monitoring task at the moment of counter creation. Signed-off-by: Peter Zijlstra Cc: Eric W. Biederman Cc: Oleg Nesterov Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d970fbc16af..9ec20fc6bd3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -500,6 +501,8 @@ struct perf_counter { void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; + + struct pid_namespace *ns; #endif }; -- cgit From 53e111a730ea8b002d57dd226098c12789993329 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 17:01:58 +0200 Subject: x86: Fix atomic_long_xchg() on 64bit Apparently I'm the first to use it :-) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/asm-generic/atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 3673a13b670..81d3be459ef 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) #define atomic_long_cmpxchg(l, old, new) \ (atomic64_cmpxchg((atomic64_t *)(l), (old), (new))) #define atomic_long_xchg(v, new) \ - (atomic64_xchg((atomic64_t *)(l), (new))) + (atomic64_xchg((atomic64_t *)(v), (new))) #else /* BITS_PER_LONG == 64 */ -- cgit From 8e5799b1ad2a0567fdfaaf0e91b40efee010f2c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:08:15 +0200 Subject: perf_counter: Add unique counter id Stephan raised the issue that we currently cannot distinguish between similar counters within a group (PERF_RECORD_GROUP uses the config value as identifier). Therefore, generate a new ID for each counter using a global u64 sequence counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9ec20fc6bd3..4845a214b9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,8 +114,9 @@ enum perf_counter_record_format { * in increasing order of bit value, after the counter value. */ enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1, - PERF_FORMAT_TOTAL_TIME_RUNNING = 2, + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, }; /* @@ -290,7 +291,7 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, @@ -503,6 +504,7 @@ struct perf_counter { struct rcu_head rcu_head; struct pid_namespace *ns; + u64 id; #endif }; -- cgit From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4845a214b9e..1fcd3cc9385 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -94,18 +94,18 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.record_type to request information + * Bits that can be set in hw_event.sample_type to request information * in the overflow packets. */ -enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, }; /* @@ -132,12 +132,12 @@ struct perf_counter_hw_event { __u64 config; union { - __u64 irq_period; - __u64 irq_freq; + __u64 sample_period; + __u64 sample_freq; }; - __u32 record_type; - __u32 read_format; + __u64 sample_type; + __u64 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -262,7 +262,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; - * u64 irq_period; + * u64 sample_period; * }; */ PERF_EVENT_PERIOD = 4, @@ -363,7 +363,7 @@ struct hw_perf_counter { }; }; atomic64_t prev_count; - u64 irq_period; + u64 sample_period; atomic64_t period_left; u64 interrupts; #endif -- cgit From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fcd3cc9385..cef9931793f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,7 +140,6 @@ struct perf_counter_hw_event { __u64 read_format; __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -153,7 +152,7 @@ struct perf_counter_hw_event { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 51; + __reserved_1 : 52; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -354,7 +353,6 @@ struct hw_perf_counter { u64 config; unsigned long config_base; unsigned long counter_base; - int nmi; int idx; }; union { /* software */ -- cgit From 8e3747c13c39246c7e46def7cf495d9d21d4c5f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:16:02 +0200 Subject: perf_counter: Change data head from u32 to u64 Since some people worried that 4G might not be a large enough as an mmap data window, extend it to 64 bit for capable platforms. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cef9931793f..c046f7d97cf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -212,7 +212,7 @@ struct perf_counter_mmap_page { * User-space reading this value should issue an rmb(), on SMP capable * platforms, after reading this value -- see perf_counter_wakeup(). */ - __u32 data_head; /* head in the data section */ + __u64 data_head; /* head in the data section */ }; #define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) @@ -397,10 +397,11 @@ struct perf_mmap_data { int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ - atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t done_head; /* completed head */ + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + atomic_t lock; /* concurrent writes */ atomic_t wakeup; /* needs a wakeup */ -- cgit From 08247e31ca79b8f02cce47b7e8120797a8726606 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:46:57 +0200 Subject: perf_counter: Add ioctl for changing the sample period/frequency Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c046f7d97cf..45bdd3b95d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,10 +164,11 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) -#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) -#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) enum perf_counter_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, -- cgit From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 34 +++++++++++++++++----------------- include/linux/syscalls.h | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 45bdd3b95d3..37d5541d74c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -22,7 +22,7 @@ */ /* - * hw_event.type + * attr.type */ enum perf_event_types { PERF_TYPE_HARDWARE = 0, @@ -37,10 +37,10 @@ enum perf_event_types { }; /* - * Generalized performance counter event types, used by the hw_event.event_id + * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_ids { +enum attr_ids { /* * Common hardware events, generalized by the kernel: */ @@ -94,7 +94,7 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.sample_type to request information + * Bits that can be set in attr.sample_type to request information * in the overflow packets. */ enum perf_counter_sample_format { @@ -109,7 +109,7 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in hw_event.read_format to request that + * Bits that can be set in attr.read_format to request that * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. */ @@ -122,7 +122,7 @@ enum perf_counter_read_format { /* * Hardware event to monitor via a performance monitoring counter: */ -struct perf_counter_hw_event { +struct perf_counter_attr { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -323,25 +323,25 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_raw(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_RAW_MASK; + return attr->config & PERF_COUNTER_RAW_MASK; } -static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_config(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_CONFIG_MASK; + return attr->config & PERF_COUNTER_CONFIG_MASK; } -static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_type(struct perf_counter_attr *attr) { - return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + return (attr->config & PERF_COUNTER_TYPE_MASK) >> PERF_COUNTER_TYPE_SHIFT; } -static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_id(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_EVENT_MASK; + return attr->config & PERF_COUNTER_EVENT_MASK; } /** @@ -457,7 +457,7 @@ struct perf_counter { u64 tstamp_running; u64 tstamp_stopped; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->hw_event) && - perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->attr) && + perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 79faae950e2..c6c84ad8bd7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_hw_event; +struct perf_counter_attr; #include #include @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_hw_event __user *hw_event_uptr, + const struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif -- cgit From 60313ebed739b331e8e61079da27a11ee3b73a30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 16:53:44 +0200 Subject: perf_counter: Add fork event Create a fork event so that we can easily clone the comm and dso maps without having to generate all those events. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 37d5541d74c..380247bdb91 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -276,6 +276,14 @@ enum perf_event_type { PERF_EVENT_THROTTLE = 5, PERF_EVENT_UNTHROTTLE = 6, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * }; + */ + PERF_EVENT_FORK = 7, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -618,6 +626,7 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); @@ -673,6 +682,7 @@ perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } static inline void perf_counter_task_migration(struct task_struct *task, int cpu) { } -- cgit From d99e9446200c1ffab28cb0e39b76c34a2bfafd06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 17:08:58 +0200 Subject: perf_counter: Remove munmap stuff In name of keeping it simple, only track mmap events. Userspace will have to remove old overlapping maps when it encounters them. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 380247bdb91..6ca403acd41 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -148,11 +148,10 @@ struct perf_counter_attr { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ - munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 53; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -246,7 +245,6 @@ enum perf_event_type { * }; */ PERF_EVENT_MMAP = 1, - PERF_EVENT_MUNMAP = 2, /* * struct { @@ -622,9 +620,6 @@ extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -extern void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file); - extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); @@ -677,10 +672,6 @@ static inline void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } -static inline void -perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } - static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } -- cgit From 089dd79db9264dc0da602bad45d42f1b3e7d1e07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:04:55 +0200 Subject: perf_counter: Generate mmap events for install_special_mapping() In order to track the vdso also generate mmap events for install_special_mapping(). Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6ca403acd41..40dc0e273d9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -617,8 +617,13 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); -extern void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file); +extern void __perf_counter_mmap(struct vm_area_struct *vma); + +static inline void perf_counter_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_counter_mmap(vma); +} extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); @@ -668,10 +673,7 @@ static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } -static inline void -perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } - +static inline void perf_counter_mmap(struct vm_area_struct *vma) { } static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } -- cgit From ac4bcf889469ffbca88f234d3184452886a47905 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:44:52 +0200 Subject: perf_counter: Change PERF_SAMPLE_CONFIG into PERF_SAMPLE_ID The purpose of PERF_SAMPLE_CONFIG was to identify the counters, since then we've added counter ids, use those instead. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40dc0e273d9..9cea32a0655 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,7 +104,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_ADDR = 1U << 3, PERF_SAMPLE_GROUP = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, }; -- cgit From 689802b2d0536e72281dc959ab9cb34fb3c304cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 15:05:43 +0200 Subject: perf_counter: Add PERF_SAMPLE_PERIOD In order to allow easy tracking of the period, also provide means of adding it to the sample data. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9cea32a0655..6bc25003973 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -106,6 +106,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; + * u64 id; * u64 sample_period; * }; */ -- cgit From 6a24ed6c6082ec65d19331a4bfa30c0512a1a822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 18:01:29 +0200 Subject: perf_counter: Fix frequency adjustment for < HZ Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bc25003973..4f9d39ecdc0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -373,6 +373,9 @@ struct hw_perf_counter { u64 sample_period; atomic64_t period_left; u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; #endif }; -- cgit From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 65 +++++++++----------------------------------- 1 file changed, 13 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4f9d39ecdc0..f794c69b34c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,26 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ - (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ - PERF_COUNTER_##name##_SHIFT) - -#define PERF_COUNTER_RAW_BITS 1 -#define PERF_COUNTER_RAW_SHIFT 63 -#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) - -#define PERF_COUNTER_CONFIG_BITS 63 -#define PERF_COUNTER_CONFIG_SHIFT 0 -#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) - -#define PERF_COUNTER_TYPE_BITS 7 -#define PERF_COUNTER_TYPE_SHIFT 56 -#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) - -#define PERF_COUNTER_EVENT_BITS 56 -#define PERF_COUNTER_EVENT_SHIFT 0 -#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) - /* * Bits that can be set in attr.sample_type to request information * in the overflow packets. @@ -125,10 +105,13 @@ enum perf_counter_read_format { */ struct perf_counter_attr { /* - * The MSB of the config word signifies if the rest contains cpu - * specific (raw) counter configuration data, if unset, the next - * 7 bits are an event type and the rest of the bits are the event - * identifier. + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + __u32 __reserved_1; + + /* + * Type specific configuration information. */ __u64 config; @@ -152,12 +135,11 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 53; + __reserved_2 : 53; __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_2; + __u32 __reserved_3; - __u64 __reserved_3; __u64 __reserved_4; }; @@ -278,8 +260,8 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u32 pid, ppid; + * struct perf_event_header header; + * u32 pid, ppid; * }; */ PERF_EVENT_FORK = 7, @@ -331,27 +313,6 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_RAW_MASK; -} - -static inline u64 perf_event_config(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_CONFIG_MASK; -} - -static inline u64 perf_event_type(struct perf_counter_attr *attr) -{ - return (attr->config & PERF_COUNTER_TYPE_MASK) >> - PERF_COUNTER_TYPE_SHIFT; -} - -static inline u64 perf_event_id(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_EVENT_MASK; -} - /** * struct hw_perf_counter - performance counter hardware details: */ @@ -616,8 +577,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->attr) && - perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; + return (counter->attr.type != PERF_TYPE_RAW) && + (counter->attr.type != PERF_TYPE_HARDWARE); } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); -- cgit From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f794c69b34c..3586df840f6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -28,6 +28,7 @@ enum perf_event_types { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, /* * available TYPE space, raw is the max value. @@ -55,6 +56,39 @@ enum attr_ids { PERF_HW_EVENTS_MAX = 7, }; +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum hw_cache_id { + PERF_COUNT_HW_CACHE_L1D, + PERF_COUNT_HW_CACHE_L1I, + PERF_COUNT_HW_CACHE_L2, + PERF_COUNT_HW_CACHE_DTLB, + PERF_COUNT_HW_CACHE_ITLB, + PERF_COUNT_HW_CACHE_BPU, + + PERF_COUNT_HW_CACHE_MAX, +}; + +enum hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ, + PERF_COUNT_HW_CACHE_OP_WRITE, + PERF_COUNT_HW_CACHE_OP_PREFETCH, + + PERF_COUNT_HW_CACHE_OP_MAX, +}; + +enum hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS, + PERF_COUNT_HW_CACHE_RESULT_MISS, + + PERF_COUNT_HW_CACHE_RESULT_MAX, +}; + /* * Special "software" counters provided by the kernel, even if the hardware * does not support performance counters. These counters measure various -- cgit From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3586df840f6..282d8cc4898 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -371,6 +371,7 @@ struct hw_perf_counter { u64 freq_count; u64 freq_interrupts; + u64 freq_stamp; #endif }; -- cgit From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 282d8cc4898..d8c0eb480f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -605,8 +605,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr); +struct perf_sample_data { + struct pt_regs *regs; + u64 addr; +}; + +extern int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); + /* * Return 1 for a software counter, 0 for a hardware counter */ -- cgit From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d8c0eb480f9..5b966472b45 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -366,6 +366,7 @@ struct hw_perf_counter { }; atomic64_t prev_count; u64 sample_period; + u64 last_period; atomic64_t period_left; u64 interrupts; @@ -606,8 +607,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, -- cgit From 0764771dab80d7b84b9a271bee7f1b21a04a3f0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:18:36 +0200 Subject: perf_counter: More paranoia settings Rename the perf_counter_priv knob to perf_counter_paranoia (because priv can be read as private, as opposed to privileged) and provide one more level: 0 - permissive 1 - restrict cpu counters to privilidged contexts 2 - restrict kernel-mode code counting and profiling Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5b966472b45..386be915baa 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -648,7 +648,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_paranoid; extern int sysctl_perf_counter_mlock; extern int sysctl_perf_counter_limit; -- cgit From df58ab24bf26b166874bfb18b3b5a2e0a8e63179 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:25:05 +0200 Subject: perf_counter: Rename perf_counter_limit sysctl Rename perf_counter_limit to perf_counter_max_sample_rate and prohibit creation of counters with a known higher sample frequency. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 386be915baa..95c797c480e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -650,7 +650,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_paranoid; extern int sysctl_perf_counter_mlock; -extern int sysctl_perf_counter_limit; +extern int sysctl_perf_counter_sample_rate; extern void perf_counter_init(void); -- cgit From 1c432d899d32d36371ee4ee310fa3609cf0e5742 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 13:19:29 +0200 Subject: perf_counter: Rename enums Rename the perf enums to be in the 'perf_' namespace and strictly enumerate the ABI bits. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 53 +++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 95c797c480e..d5911b02bc8 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,24 +24,21 @@ /* * attr.type */ -enum perf_event_types { +enum perf_type_id { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, - /* - * available TYPE space, raw is the max value. - */ - - PERF_TYPE_RAW = 128, + PERF_TYPE_MAX, /* non ABI */ }; /* * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum attr_ids { +enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ @@ -53,7 +50,7 @@ enum attr_ids { PERF_COUNT_BRANCH_MISSES = 5, PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_HW_EVENTS_MAX, /* non ABI */ }; /* @@ -63,30 +60,30 @@ enum attr_ids { * { read, write, prefetch } x * { accesses, misses } */ -enum hw_cache_id { - PERF_COUNT_HW_CACHE_L1D, - PERF_COUNT_HW_CACHE_L1I, - PERF_COUNT_HW_CACHE_L2, - PERF_COUNT_HW_CACHE_DTLB, - PERF_COUNT_HW_CACHE_ITLB, - PERF_COUNT_HW_CACHE_BPU, - - PERF_COUNT_HW_CACHE_MAX, +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non ABI */ }; -enum hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ, - PERF_COUNT_HW_CACHE_OP_WRITE, - PERF_COUNT_HW_CACHE_OP_PREFETCH, +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - PERF_COUNT_HW_CACHE_OP_MAX, + PERF_COUNT_HW_CACHE_OP_MAX, /* non ABI */ }; -enum hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS, - PERF_COUNT_HW_CACHE_RESULT_MISS, +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - PERF_COUNT_HW_CACHE_RESULT_MAX, + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non ABI */ }; /* @@ -95,7 +92,7 @@ enum hw_cache_op_result_id { * physical and sw events of the kernel (and allow the profiling of them as * well): */ -enum sw_event_ids { +enum perf_sw_ids { PERF_COUNT_CPU_CLOCK = 0, PERF_COUNT_TASK_CLOCK = 1, PERF_COUNT_PAGE_FAULTS = 2, @@ -104,7 +101,7 @@ enum sw_event_ids { PERF_COUNT_PAGE_FAULTS_MIN = 5, PERF_COUNT_PAGE_FAULTS_MAJ = 6, - PERF_SW_EVENTS_MAX = 7, + PERF_SW_EVENTS_MAX, /* non ABI */ }; /* -- cgit From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d5911b02bc8..887df88a9c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,15 +42,15 @@ enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non ABI */ }; /* @@ -93,15 +93,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, - - PERF_SW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non ABI */ }; /* -- cgit From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:19:11 +0200 Subject: perf_counter: Rename L2 to LL cache The top (fastest) and last level (biggest) caches are the most interesting ones, performance wise. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: [ Fixed the Nehalem LL table to LLC Reference/Miss events ] Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 887df88a9c2..20cf5af27ad 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -56,14 +56,14 @@ enum perf_hw_id { /* * Generalized hardware cache counters: * - * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x * { read, write, prefetch } x * { accesses, misses } */ enum perf_hw_cache_id { PERF_COUNT_HW_CACHE_L1D = 0, PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_LL = 2, PERF_COUNT_HW_CACHE_DTLB = 3, PERF_COUNT_HW_CACHE_ITLB = 4, PERF_COUNT_HW_CACHE_BPU = 5, -- cgit From a308444ceb576d3089f9ca0dfd097eba6f1e623f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Jun 2009 14:44:26 +0200 Subject: perf_counter: Better align code Whitespace and comment bits. Also update copyrights. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: --- include/linux/perf_counter.h | 165 ++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 20cf5af27ad..1fa1a26cb1b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -1,12 +1,13 @@ /* * Performance counters: * - * Copyright(C) 2008, Thomas Gleixner - * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * - * Started by: Thomas Gleixner and Ingo Molnar + * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ @@ -25,18 +26,19 @@ * attr.type */ enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, - PERF_TYPE_MAX, /* non ABI */ + PERF_TYPE_MAX, /* non-ABI */ }; /* - * Generalized performance counter event types, used by the attr.event_id - * parameter of the sys_perf_counter_open() syscall: + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: */ enum perf_hw_id { /* @@ -50,7 +52,7 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, - PERF_COUNT_HW_MAX, /* non ABI */ + PERF_COUNT_HW_MAX, /* non-ABI */ }; /* @@ -61,29 +63,29 @@ enum perf_hw_id { * { accesses, misses } */ enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ }; enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - PERF_COUNT_HW_CACHE_OP_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ }; enum perf_hw_cache_op_result_id { PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ }; /* @@ -93,15 +95,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - - PERF_COUNT_SW_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ }; /* @@ -109,15 +111,15 @@ enum perf_sw_ids { * in the overflow packets. */ enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, }; /* @@ -126,9 +128,9 @@ enum perf_counter_sample_format { * in increasing order of bit value, after the counter value. */ enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, }; /* @@ -229,12 +231,12 @@ struct perf_counter_mmap_page { __u64 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -351,14 +353,14 @@ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS union { struct { /* hardware */ - u64 config; - unsigned long config_base; - unsigned long counter_base; - int idx; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int idx; }; union { /* software */ - atomic64_t count; - struct hrtimer hrtimer; + atomic64_t count; + struct hrtimer hrtimer; }; }; atomic64_t prev_count; @@ -523,37 +525,37 @@ struct perf_counter_context { * Protect the states of the counters in the list, * nr_active, and the list: */ - spinlock_t lock; + spinlock_t lock; /* * Protect the list of counters. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ - struct mutex mutex; + struct mutex mutex; - struct list_head counter_list; - struct list_head event_list; - int nr_counters; - int nr_active; - int is_active; - atomic_t refcount; - struct task_struct *task; + struct list_head counter_list; + struct list_head event_list; + int nr_counters; + int nr_active; + int is_active; + atomic_t refcount; + struct task_struct *task; /* * Context clock, runs when context enabled. */ - u64 time; - u64 timestamp; + u64 time; + u64 timestamp; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ - struct perf_counter_context *parent_ctx; - u64 parent_gen; - u64 generation; - int pin_count; - struct rcu_head rcu_head; + struct perf_counter_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; }; /** @@ -604,9 +606,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; - u64 period; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, @@ -636,11 +638,14 @@ extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u16 nr, hv, kernel, user; - u64 ip[MAX_STACK_DEPTH]; + u16 nr; + u16 hv; + u16 kernel; + u16 user; + u64 ip[MAX_STACK_DEPTH]; }; extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -- cgit From cca3f454a85ff42d426401bce7ac804541b2bd03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:57:55 +0200 Subject: perf_counter: Add counter->id to the throttle event So as to be able to distuinguish between multiple counters. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fa1a26cb1b..6e133954e2e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -286,6 +286,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; + * u64 id; * }; */ PERF_EVENT_THROTTLE = 5, -- cgit