Discussion:
[RFC PATCH 12/22 -v2] separate out the percpu data into a percpu struct
Steven Rostedt
2008-01-09 23:40:06 UTC
For better cacheline performance, this patch creates a separate
struct for each CPU with the percpu data grouped together.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 42 +++++++++++++++++++++++-------------------
lib/tracing/tracer.h | 12 ++++++++----
2 files changed, 31 insertions(+), 23 deletions(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:14:43.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:28.000000000 -0500
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/linkage.h>
#include <linux/seq_file.h>
+#include <linux/percpu.h>
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
@@ -25,6 +26,7 @@
#include "tracer_interface.h"

static struct mctracer_trace mctracer_trace;
+static DEFINE_PER_CPU(struct mctracer_trace_cpu, mctracer_trace_cpu);

static inline notrace void
mctracer_add_trace_entry(struct mctracer_trace *tr,
@@ -35,21 +37,22 @@ mctracer_add_trace_entry(struct mctracer
unsigned long idx, idx_next;
struct mctracer_entry *entry;
struct task_struct *tsk = current;
+ struct mctracer_trace_cpu *data = tr->data[cpu];

- idx = tr->trace_idx[cpu];
+ idx = data->trace_idx;
idx_next = idx + 1;

if (unlikely(idx_next >= tr->entries)) {
- atomic_inc(&tr->underrun[cpu]);
+ atomic_inc(&data->underrun);
idx_next = 0;
}

- tr->trace_idx[cpu] = idx_next;
+ data->trace_idx = idx_next;

- if (unlikely(idx_next != 0 && atomic_read(&tr->underrun[cpu])))
- atomic_inc(&tr->underrun[cpu]);
+ if (unlikely(idx_next != 0 && atomic_read(&data->underrun)))
+ atomic_inc(&data->underrun);

- entry = tr->trace[cpu] + idx * MCTRACER_ENTRY_SIZE;
+ entry = data->trace + idx * MCTRACER_ENTRY_SIZE;
entry->idx = atomic_inc_return(&tr->cnt);
entry->ip = ip;
entry->parent_ip = parent_ip;
@@ -69,11 +72,11 @@ static notrace void trace_function(const

tr = &mctracer_trace;

- atomic_inc(&tr->disabled[cpu]);
- if (likely(atomic_read(&tr->disabled[cpu]) == 1))
+ atomic_inc(&tr->data[cpu]->disabled);
+ if (likely(atomic_read(&tr->data[cpu]->disabled) == 1))
mctracer_add_trace_entry(tr, cpu, ip, parent_ip);

- atomic_dec(&tr->disabled[cpu]);
+ atomic_dec(&tr->data[cpu]->disabled);

raw_local_irq_restore(flags);
}
@@ -83,8 +86,8 @@ static notrace void mctracer_reset(struc
int cpu;

for_each_online_cpu(cpu) {
- tr->trace_idx[cpu] = 0;
- atomic_set(&tr->underrun[cpu], 0);
+ tr->data[cpu]->trace_idx = 0;
+ atomic_set(&tr->data[cpu]->underrun, 0);
}
}

@@ -105,16 +108,16 @@ static struct mctracer_entry *mctracer_e
unsigned long idx,
int cpu)
{
- struct mctracer_entry *array = tr->trace[cpu];
+ struct mctracer_entry *array = tr->data[cpu]->trace;
unsigned long underrun;

if (idx >= tr->entries)
return NULL;

- underrun = atomic_read(&tr->underrun[cpu]);
+ underrun = atomic_read(&tr->data[cpu]->underrun);
if (underrun)
idx = ((underrun - 1) + idx) % tr->entries;
- else if (idx >= tr->trace_idx[cpu])
+ else if (idx >= tr->data[cpu]->trace_idx)
return NULL;

return &array[idx];
@@ -129,7 +132,7 @@ static void *find_next_entry(struct mctr
int i;

for_each_possible_cpu(i) {
- if (!tr->trace[i])
+ if (!tr->data[i]->trace)
continue;
ent = mctracer_entry_idx(tr, iter->next_idx[i], i);
if (ent && (!next || next->idx > ent->idx)) {
@@ -452,6 +455,7 @@ static notrace int mctracer_alloc_buffer
int i;

for_each_possible_cpu(i) {
+ mctracer_trace.data[i] = &per_cpu(mctracer_trace_cpu, i);
array = (struct mctracer_entry *)
__get_free_pages(GFP_KERNEL, order);
if (array == NULL) {
@@ -459,7 +463,7 @@ static notrace int mctracer_alloc_buffer
" %ld bytes for trace buffer!\n", size);
goto free_buffers;
}
- mctracer_trace.trace[i] = array;
+ mctracer_trace.data[i]->trace = array;
}

/*
@@ -478,10 +482,10 @@ static notrace int mctracer_alloc_buffer

free_buffers:
for (i-- ; i >= 0; i--) {
- if (mctracer_trace.trace[i]) {
- free_pages((unsigned long)mctracer_trace.trace[i],
+ if (mctracer_trace.data[i] && mctracer_trace.data[i]->trace) {
+ free_pages((unsigned long)mctracer_trace.data[i]->trace,
order);
- mctracer_trace.trace[i] = NULL;
+ mctracer_trace.data[i]->trace = NULL;
}
}
return -ENOMEM;
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:14:02.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:28.000000000 -0500
@@ -12,15 +12,19 @@ struct mctracer_entry {
pid_t pid;
};

+struct mctracer_trace_cpu {
+ void *trace;
+ unsigned long trace_idx;
+ atomic_t disabled;
+ atomic_t underrun;
+};
+
struct mctracer_trace {
- void *trace[NR_CPUS];
- unsigned long trace_idx[NR_CPUS];
unsigned long entries;
long ctrl;
unsigned long iter_flags;
atomic_t cnt;
- atomic_t disabled[NR_CPUS];
- atomic_t underrun[NR_CPUS];
+ struct mctracer_trace_cpu *data[NR_CPUS];
};

#endif /* _LINUX_MCOUNT_TRACER_H */
Steven Rostedt
2008-01-09 23:40:07 UTC
Add "notrace" annotation to x86_64 specific files.

Signed-off-by: Arnaldo Carvalho de Melo <***@ghostprotocols.net>
Signed-off-by: Steven Rostedt <***@redhat.com>
---
arch/x86/kernel/head64.c | 2 +-
arch/x86/kernel/setup64.c | 4 ++--
arch/x86/kernel/smpboot_64.c | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)

Index: linux-compile-i386.git/arch/x86/kernel/head64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/head64.c 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/head64.c 2008-01-09 14:10:14.000000000 -0500
@@ -46,7 +46,7 @@ static void __init copy_bootdata(char *r
}
}

-void __init x86_64_start_kernel(char * real_mode_data)
+notrace void __init x86_64_start_kernel(char *real_mode_data)
{
int i;

Index: linux-compile-i386.git/arch/x86/kernel/setup64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/setup64.c 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/setup64.c 2008-01-09 14:10:14.000000000 -0500
@@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void)
}
}

-void pda_init(int cpu)
+notrace void pda_init(int cpu)
{
struct x8664_pda *pda = cpu_pda(cpu);

@@ -197,7 +197,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
-void __cpuinit cpu_init (void)
+notrace void __cpuinit cpu_init(void)
{
int cpu = stack_smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
Index: linux-compile-i386.git/arch/x86/kernel/smpboot_64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/smpboot_64.c 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/smpboot_64.c 2008-01-09 14:10:14.000000000 -0500
@@ -317,7 +317,7 @@ static inline void set_cpu_sibling_map(i
/*
* Setup code on secondary processor (after comming out of the trampoline)
*/
-void __cpuinit start_secondary(void)
+notrace __cpuinit void start_secondary(void)
{
/*
* Dont put anything before smp_callin(), SMP
Steven Rostedt
2008-01-09 23:40:07 UTC
This annotates the NMI functions with notrace. Some tracers may be able
to cope with being called from NMI context, but some cannot, so we turn off NMI tracing.

One solution might be to add a notrace_nmi annotation that turns off
NMI tracing only when a trace utility actually needs it off.
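A rough sketch of that idea (illustrative only, not part of this patch;
the config symbol name is made up):

/*
 * Hypothetical notrace_nmi: expands to notrace only when the selected
 * tracer cannot cope with being called from NMI context.
 */
#ifdef CONFIG_TRACER_CANNOT_TRACE_NMI
# define notrace_nmi notrace
#else
# define notrace_nmi
#endif

notrace_nmi __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);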

Signed-off-by: Arnaldo Carvalho de Melo <***@ghostprotocols.net>
Signed-off-by: Steven Rostedt <***@redhat.com>

---
arch/x86/kernel/nmi_32.c | 2 +-
arch/x86/kernel/nmi_64.c | 2 +-
arch/x86/kernel/traps_32.c | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)

Index: linux-compile-i386.git/arch/x86/kernel/nmi_32.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/nmi_32.c 2008-01-09 14:07:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/nmi_32.c 2008-01-09 14:10:29.000000000 -0500
@@ -323,7 +323,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog);

extern void die_nmi(struct pt_regs *, const char *msg);

-__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{

/*
Index: linux-compile-i386.git/arch/x86/kernel/traps_32.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/traps_32.c 2008-01-09 14:07:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/traps_32.c 2008-01-09 14:10:29.000000000 -0500
@@ -722,7 +722,7 @@ void __kprobes die_nmi(struct pt_regs *r
do_exit(SIGSEGV);
}

-static __kprobes void default_do_nmi(struct pt_regs * regs)
+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;

@@ -762,7 +762,7 @@ static __kprobes void default_do_nmi(str

static int ignore_nmis;

-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+notrace fastcall __kprobes void do_nmi(struct pt_regs *regs, long error_code)
{
int cpu;

Index: linux-compile-i386.git/arch/x86/kernel/nmi_64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/nmi_64.c 2008-01-09 14:07:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/nmi_64.c 2008-01-09 14:10:29.000000000 -0500
@@ -314,7 +314,7 @@ void touch_nmi_watchdog(void)
touch_softlockup_watchdog();
}

-int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{
int sum;
int touched = 0;
Steven Rostedt
2008-01-09 23:40:07 UTC
Add the notrace annotations to some of the vsyscall functions.

Note: checkpatch errors out on the define of vsyscall_fn because it thinks
that it is a complex macro that needs parentheses. Unfortunately
we cannot put parentheses around this macro.
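For reference, the macro ends up in front of function definitions rather
than inside an expression, which is why its expansion cannot be wrapped
in parentheses. A sketch of a typical user (illustrative only; the real
vread implementations differ in detail):

static cycle_t __vsyscall_fn vread_tsc(void)
{
	/* sketch: the real version uses a synchronized TSC read */
	return (cycle_t)get_cycles();
}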

Signed-off-by: Steven Rostedt <***@redhat.com>
---
arch/x86/kernel/vsyscall_64.c | 3 ++-
arch/x86/vdso/vclock_gettime.c | 15 ++++++++-------
arch/x86/vdso/vgetcpu.c | 3 ++-
include/asm-x86/vsyscall.h | 3 ++-
4 files changed, 14 insertions(+), 10 deletions(-)

Index: linux-compile-i386.git/arch/x86/vdso/vclock_gettime.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/vdso/vclock_gettime.c 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/arch/x86/vdso/vclock_gettime.c 2008-01-09 14:10:20.000000000 -0500
@@ -24,7 +24,7 @@

#define gtod vdso_vsyscall_gtod_data

-static long vdso_fallback_gettime(long clock, struct timespec *ts)
+static long notrace vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
asm("syscall" : "=a" (ret) :
@@ -32,7 +32,7 @@ static long vdso_fallback_gettime(long c
return ret;
}

-static inline long vgetns(void)
+static inline long notrace vgetns(void)
{
long v;
cycles_t (*vread)(void);
@@ -41,7 +41,7 @@ static inline long vgetns(void)
return (v * gtod->clock.mult) >> gtod->clock.shift;
}

-static noinline int do_realtime(struct timespec *ts)
+static noinline int notrace do_realtime(struct timespec *ts)
{
unsigned long seq, ns;
do {
@@ -55,7 +55,8 @@ static noinline int do_realtime(struct t
}

/* Copy of the version in kernel/time.c which we cannot directly access */
-static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+static void notrace
+vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
{
while (nsec >= NSEC_PER_SEC) {
nsec -= NSEC_PER_SEC;
@@ -69,7 +70,7 @@ static void vset_normalized_timespec(str
ts->tv_nsec = nsec;
}

-static noinline int do_monotonic(struct timespec *ts)
+static noinline int notrace do_monotonic(struct timespec *ts)
{
unsigned long seq, ns, secs;
do {
@@ -83,7 +84,7 @@ static noinline int do_monotonic(struct
return 0;
}

-int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+int notrace __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
if (likely(gtod->sysctl_enabled && gtod->clock.vread))
switch (clock) {
@@ -97,7 +98,7 @@ int __vdso_clock_gettime(clockid_t clock
int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias("__vdso_clock_gettime")));

-int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+int notrace __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
long ret;
if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
Index: linux-compile-i386.git/arch/x86/vdso/vgetcpu.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/vdso/vgetcpu.c 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/arch/x86/vdso/vgetcpu.c 2008-01-09 14:10:20.000000000 -0500
@@ -13,7 +13,8 @@
#include <asm/vgtod.h>
#include "vextern.h"

-long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+long notrace
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int dummy, p;

Index: linux-compile-i386.git/include/asm-x86/vsyscall.h
===================================================================
--- linux-compile-i386.git.orig/include/asm-x86/vsyscall.h 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/include/asm-x86/vsyscall.h 2008-01-09 14:10:20.000000000 -0500
@@ -24,7 +24,8 @@ enum vsyscall_num {
((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
#define __section_vsyscall_clock __attribute__ \
((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
+#define __vsyscall_fn __attribute__ \
+ ((unused, __section__(".vsyscall_fn"))) notrace

#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2
Index: linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:09:35.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c 2008-01-09 15:17:34.000000000 -0500
@@ -42,7 +42,8 @@
#include <asm/topology.h>
#include <asm/vgtod.h>

-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","rcx","memory"
#define __pa_vsymbol(x) \
({unsigned long v; \
Steven Rostedt
2008-01-09 23:40:09 UTC
Handle accurate time even if there's a long delay between
accumulated clock cycles.

Signed-off-by: John Stultz <***@us.ibm.com>
Signed-off-by: Steven Rostedt <***@redhat.com>
---
arch/x86/kernel/vsyscall_64.c | 5 ++-
include/asm-x86/vgtod.h | 2 -
include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
kernel/time/timekeeping.c | 35 +++++++++++++------------
4 files changed, 80 insertions(+), 20 deletions(-)

linux-2.6.21-rc5_cycles-accumulated_C7.patch
============================================
Index: linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:10:20.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:17:53.000000000 -0500
@@ -86,6 +86,7 @@ void update_vsyscall(struct timespec *wa
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
@@ -121,7 +122,7 @@ static __always_inline long time_syscall

static __always_inline void do_vgettimeofday(struct timeval * tv)
{
- cycle_t now, base, mask, cycle_delta;
+ cycle_t now, base, accumulated, mask, cycle_delta;
unsigned seq;
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
@@ -135,6 +136,7 @@ static __always_inline void do_vgettimeo
}
now = vread();
base = __vsyscall_gtod_data.clock.cycle_last;
+ accumulated = __vsyscall_gtod_data.clock.cycle_accumulated;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
shift = __vsyscall_gtod_data.clock.shift;
@@ -145,6 +147,7 @@ static __always_inline void do_vgettimeo

/* calculate interval: */
cycle_delta = (now - base) & mask;
+ cycle_delta += accumulated;
/* convert to nsecs: */
nsec += (cycle_delta * mult) >> shift;

Index: linux-compile-i386.git/include/linux/clocksource.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/clocksource.h 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/include/linux/clocksource.h 2008-01-09 15:17:33.000000000 -0500
@@ -50,8 +50,12 @@ struct clocksource;
* @flags: flags describing special properties
* @vread: vsyscall based read
* @resume: resume function for the clocksource, if necessary
+ * @cycle_last: Used internally by timekeeping core, please ignore.
+ * @cycle_accumulated: Used internally by timekeeping core, please ignore.
* @cycle_interval: Used internally by timekeeping core, please ignore.
* @xtime_interval: Used internally by timekeeping core, please ignore.
+ * @xtime_nsec: Used internally by timekeeping core, please ignore.
+ * @error: Used internally by timekeeping core, please ignore.
*/
struct clocksource {
/*
@@ -82,7 +86,10 @@ struct clocksource {
* Keep it in a different cache line to dirty no
* more than one cache line.
*/
- cycle_t cycle_last ____cacheline_aligned_in_smp;
+ struct {
+ cycle_t cycle_last, cycle_accumulated;
+ } ____cacheline_aligned_in_smp;
+
u64 xtime_nsec;
s64 error;

@@ -168,11 +175,44 @@ static inline cycle_t clocksource_read(s
}

/**
+ * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * @cs: pointer to clocksource being read
+ * @now: current cycle value
+ *
+ * Uses the clocksource to return the current cycle_t value.
+ * NOTE!!!: This is different from clocksource_read, because it
+ * returns the accumulated cycle value! Must hold xtime lock!
+ */
+static inline cycle_t
+clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+{
+ cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ offset += cs->cycle_accumulated;
+ return offset;
+}
+
+/**
+ * clocksource_accumulate: - Accumulates clocksource cycles
+ * @cs: pointer to clocksource being read
+ * @now: current cycle value
+ *
+ * Used to avoids clocksource hardware overflow by periodically
+ * accumulating the current cycle delta. Must hold xtime write lock!
+ */
+static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
+{
+ cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ cs->cycle_last = now;
+ cs->cycle_accumulated += offset;
+}
+
+/**
* cyc2ns - converts clocksource cycles to nanoseconds
* @cs: Pointer to clocksource
* @cycles: Cycles
*
* Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds.
+ * Must hold xtime lock!
*
* XXX - This could use some mult_lxl_ll() asm optimization
*/
@@ -184,13 +224,27 @@ static inline s64 cyc2ns(struct clocksou
}

/**
+ * ns2cyc - converts nanoseconds to clocksource cycles
+ * @cs: Pointer to clocksource
+ * @nsecs: Nanoseconds
+ */
+static inline cycle_t ns2cyc(struct clocksource *cs, u64 nsecs)
+{
+ cycle_t ret = nsecs << cs->shift;
+
+ do_div(ret, cs->mult + 1);
+
+ return ret;
+}
+
+/**
* clocksource_calculate_interval - Calculates a clocksource interval struct
*
* @c: Pointer to clocksource.
* @length_nsec: Desired interval length in nanoseconds.
*
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
+ * pair and interval request. Must hold xtime_lock!
*
* Unless you're the timekeeping code, you should not be using this!
*/
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:31.000000000 -0500
@@ -66,16 +66,10 @@ static struct clocksource *clock; /* poi
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_now, cycle_delta;
+ cycle_t cycle_delta;
s64 ns_offset;

- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
+ cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
ns_offset = cyc2ns(clock, cycle_delta);

return ns_offset;
@@ -195,7 +189,7 @@ static void change_clocksource(void)

clock = new;
clock->cycle_last = now;
-
+ clock->cycle_accumulated = 0;
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -205,9 +199,15 @@ static void change_clocksource(void)
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
}
+
+void timekeeping_accumulate(void)
+{
+ clocksource_accumulate(clock, clocksource_read(clock));
+}
#else
static inline void change_clocksource(void) { }
static inline s64 __get_nsec_offset(void) { return 0; }
+void timekeeping_accumulate(void) { }
#endif

/**
@@ -302,6 +302,7 @@ static int timekeeping_resume(struct sys
timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
/* re-base the last cycle value */
clock->cycle_last = clocksource_read(clock);
+ clock->cycle_accumulated = 0;
clock->error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -448,27 +449,29 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t offset;
+ cycle_t cycle_now, offset;

/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;

#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ cycle_now = clocksource_read(clock);
#else
- offset = clock->cycle_interval;
+ cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
+ offset = (cycle_now - clock->cycle_last) & clock->mask;
+ clocksource_accumulate(clock, cycle_now);
+
clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;

/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (clock->cycle_accumulated >= clock->cycle_interval) {
/* accumulate one interval */
clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
+ clock->cycle_accumulated -= clock->cycle_interval;

if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
@@ -482,7 +485,7 @@ void update_wall_time(void)
}

/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ clocksource_adjust(clock->cycle_accumulated);

/* store full nanoseconds into xtime */
xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
Index: linux-compile-i386.git/include/asm-x86/vgtod.h
===================================================================
--- linux-compile-i386.git.orig/include/asm-x86/vgtod.h 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/include/asm-x86/vgtod.h 2008-01-09 14:17:53.000000000 -0500
@@ -15,7 +15,7 @@ struct vsyscall_gtod_data {
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
cycle_t (*vread)(void);
- cycle_t cycle_last;
+ cycle_t cycle_last, cycle_accumulated;
cycle_t mask;
u32 mult;
u32 shift;
john stultz
2008-01-10 00:10:06 UTC
plain text document attachment (rt-time-starvation-fix.patch)
Handle accurate time even if there's a long delay between
accumulated clock cycles.
---
arch/x86/kernel/vsyscall_64.c | 5 ++-
include/asm-x86/vgtod.h | 2 -
include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
kernel/time/timekeeping.c | 35 +++++++++++++------------
4 files changed, 80 insertions(+), 20 deletions(-)
linux-2.6.21-rc5_cycles-accumulated_C7.patch
^^ An oldie but a goodie?

I was just reminded that in the time since 2.6.21-rc5, other arches
beside x86_64 have gained vgettimeofday implementations, and thus will
need similar update_vsyscall() tweaks as seen below to get the correct
time.
Index: linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:10:20.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:17:53.000000000 -0500
@@ -86,6 +86,7 @@ void update_vsyscall(struct timespec *wa
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
@@ -121,7 +122,7 @@ static __always_inline long time_syscall
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
- cycle_t now, base, mask, cycle_delta;
+ cycle_t now, base, accumulated, mask, cycle_delta;
unsigned seq;
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
@@ -135,6 +136,7 @@ static __always_inline void do_vgettimeo
}
now = vread();
base = __vsyscall_gtod_data.clock.cycle_last;
+ accumulated = __vsyscall_gtod_data.clock.cycle_accumulated;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
shift = __vsyscall_gtod_data.clock.shift;
@@ -145,6 +147,7 @@ static __always_inline void do_vgettimeo
/* calculate interval: */
cycle_delta = (now - base) & mask;
+ cycle_delta += accumulated;
/* convert to nsecs: */
nsec += (cycle_delta * mult) >> shift;
Tony: ia64 also needs something like this, but I found the fsyscall asm
bits a little difficult to grasp. So I'll need some assistance on how to
include the accumulated cycles into the final calculation.


The following is a quick and dirty fix for powerpc so it includes
cycle_accumulated in its calculation. It relies on the fact that the
powerpc clocksource is a 64bit counter (don't have to worry about
multiple overflows), so the subtraction should be safe.

Signed-off-by: John Stultz <***@us.ibm.com>
Index: 2.6.24-rc5/arch/powerpc/kernel/time.c
===================================================================
--- 2.6.24-rc5.orig/arch/powerpc/kernel/time.c 2008-01-09 15:17:32.000000000 -0800
+++ 2.6.24-rc5/arch/powerpc/kernel/time.c 2008-01-09 15:17:43.000000000 -0800
@@ -773,7 +773,7 @@ void update_vsyscall(struct timespec *wa
stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
do_div(stamp_xsec, 1000000000);
stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
- update_gtod(clock->cycle_last, stamp_xsec, t2x);
+ update_gtod(clock->cycle_last-clock->cycle_accumulated, stamp_xsec, t2x);
}

void update_vsyscall_tz(void)





Steven Rostedt
2008-01-10 00:20:07 UTC
Post by john stultz
Post by Steven Rostedt
---
arch/x86/kernel/vsyscall_64.c | 5 ++-
include/asm-x86/vgtod.h | 2 -
include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
kernel/time/timekeeping.c | 35 +++++++++++++------------
4 files changed, 80 insertions(+), 20 deletions(-)
linux-2.6.21-rc5_cycles-accumulated_C7.patch
^^ An oldie but a goodie?
Hehe, I got this directly from the RT queue.
Post by john stultz
I was just reminded that in the time since 2.6.21-rc5, other arches
beside x86_64 have gained vgettimeofday implementations, and thus will
need similar update_vsyscall() tweaks as seen below to get the correct
time.
Thanks! I'll fold this into this patch for the next release.

-- Steve
Tony Luck
2008-01-10 20:00:21 UTC
Post by john stultz
Tony: ia64 also needs something like this, but I found the fsyscall asm
bits a little difficult to grasp. So I'll need some assistance on how to
include the accumulated cycles into the final calculation.
I'm trying to figure out all the ramifications of the new
"cycle_accumulated" field. Does it really need to be
propagated all the way to the low level assembler (which
I don't want to mess with unless I really, really have to).
Can't I do the necessary calculations in update_vsyscall()
[Where I can do them in C :-)] and keep the same low
level assembly code. I think I must be missing some
important bit of what is going on here.

-Tony
Steven Rostedt
2008-01-10 20:20:20 UTC
Post by Tony Luck
I'm trying to figure out all the ramifications of the new
"cycle_accumulated" field. Does it really need to be
John,

Before we hardcode these names, can we change them? Later in the series I
use something called 'cycle_raw' which really should be called
'cycle_accumulated'. Since cycle_accumulated IIRC can go backwards.

Or do you think I should rename cycle_raw to cycle_monotonic?

-- Steve

john stultz
2008-01-10 20:50:14 UTC
Post by Steven Rostedt
Post by Tony Luck
I'm trying to figure out all the ramifications of the new
"cycle_accumulated" field. Does it really need to be
John,
Before we hardcode these names, can we change them? Later in the series I
use something called 'cycle_raw' which really should be called
'cycle_accumulated'. Since cycle_accumulated IIRC can go backwards.
Err... I don't think the current cycle_accumulated can go backwards. Or
maybe I'm missing what you mean.
Post by Steven Rostedt
Or do you think I should rename cycle_raw to cycle_monotonic?
That's fine by me either way.

thanks
-john


john stultz
2008-01-10 20:30:09 UTC
Post by Tony Luck
Post by john stultz
Tony: ia64 also needs something like this, but I found the fsyscall asm
bits a little difficult to grasp. So I'll need some assistance on how to
include the accumulated cycles into the final calculation.
I'm trying to figure out all the ramifications of the new
"cycle_accumulated" field. Does it really need to be
propagated all the way to the low level assembler (which
I don't want to mess with unless I really, really have to).
Can't I do the necessary calculations in update_vsyscall()
[Where I can do them in C :-)] and keep the same low
level assembly code. I think I must be missing some
important bit of what is going on here.
(Added Bob Picco to the mail, as he was involved in the ia64 clocksource
work).

So the background on the patch is this:

Some clocksources wrap frequently (every few seconds, for example). This
can cause issues if we defer the update_wall_time() function where we
accumulate time for too long (this really only happens on -rt systems
right now).

To avoid that issue, we've added the cycle_accumulated value, which acts
as a midpoint, where we can quickly accumulate cycles off of the
counter, without doing the more expensive update_wall_time() function.
This avoids issues with the clocksource wrapping, but requires that
cycle_accumulated be added into the gettimeofday() calculation.

If you noticed in my email, the fix for ppc was a bit easier, as it has
only a 64bit counter that is quite unlikely to wrap twice between calls
to update_wall_time(). There we could decrement the cycles_last value by
cycles_accumulated and get the same effect of adding it in.
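(To spell out why that works -- my arithmetic, assuming a full-width
64-bit counter that never wraps twice between updates: with
mask == 2^64 - 1,

	((now - cycle_last) & mask) + cycle_accumulated
		== (now - (cycle_last - cycle_accumulated)) & mask

modulo 2^64, so passing update_gtod() a base of
cycle_last - cycle_accumulated lets the unchanged vsyscall math see the
accumulated cycles.)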

Unfortunately on ia64, I suspect it will be necessary to do similar to
the x86_64 code and add in the cycles accumulated value in
vgettime/fgettime function, since there is the possibility of quickly
wrapping clocksources on that architecture.

So unless someone can point out a nicer trick, it likely means adding a
new cycles_accumulated value to the fsyscall structure and the asm to do
the addition. :(

thanks
-john


Mathieu Desnoyers
2008-01-10 20:50:12 UTC
Post by john stultz
Post by Tony Luck
Post by john stultz
Tony: ia64 also needs something like this, but I found the fsyscall asm
bits a little difficult to grasp. So I'll need some assistance on how to
include the accumulated cycles into the final calculation.
I'm trying to figure out all the ramifications of the new
"cycle_accumulated" field. Does it really need to be
propagated all the way to the low level assembler (which
I don't want to mess with unless I really, really have to).
Can't I do the necessary calculations in update_vsyscall()
[Where I can do them in C :-)] and keep the same low
level assembly code. I think I must be missing some
important bit of what is going on here.
(Added Bob Picco to the mail, as he was involved in the ia64 clocksource
work).
Some clocksources wrap frequently (every few seconds, for example). This
can cause issues if we defer the update_wall_time() function where we
accumulate time for too long (this really only happens on -rt systems
right now).
To avoid that issue, we've added the cycle_accumulated value, which acts
as a midpoint, where we can quickly accumulate cycles off of the
counter, without doing the more expensive update_wall_time() function.
This avoids issues with the clocksource wrapping, but requires that
cycle_accumulated be added into the gettimeofday() calculation.
If you noticed in my email, the fix for ppc was a bit easier, as it has
only a 64bit counter that is quite unlikely to wrap twice between calls
to update_wall_time(). There we could decrement the cycles_last value by
cycles_accumulated and get the same effect of adding it in.
Unfortunately on ia64, I suspect it will be necessary to do similar to
the x86_64 code and add in the cycles accumulated value in
vgettime/fgettime function, since there is the possibility of quickly
wrapping clocksources on that architecture.
So unless someone can point out a nicer trick, it likely means adding a
new cycles_accumulated value to the fsyscall structure and the asm to do
the addition. :(
I think it's about time I introduce the approach I have taken for LTTng
timestamping. Basically, one of the main issues with the clock sources
is the xtime lock : having a read seqlock nested over a write seqlock is
a really, really bad idea. This can happen with NMIs. Basically, it
would cause a deadlock.
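To illustrate (a sketch of mine, not code from any patch here), the
single-CPU case looks like this:

	unsigned long flags, seq;
	s64 ns;

	write_seqlock_irqsave(&xtime_lock, flags);	/* sequence is now odd */

		/* NMI arrives on this CPU before the unlock; a tracer in
		 * the NMI handler tries to read the time: */
		do {
			seq = read_seqbegin(&xtime_lock); /* spins while the sequence is odd */
			ns = __get_nsec_offset();
		} while (read_seqretry(&xtime_lock, seq));
		/* never reached: the writer can only finish once the NMI returns */

	write_sequnlock_irqrestore(&xtime_lock, flags);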

What I have done is an RCU algorithm that extends a 32 bits TSC (that's
the case on MIPS, for instance) to 64 bits. The update of the MSBs is
done by a periodical timer (fired often enough to make sure we always
detect the 32 LSBs wrap-around) and the read-side only has to disable
preemption.

I use a 2 slots array, each of them keeping, alternatively, the last 64
bits counter value, to implement the RCU algorithm.

Since we are discussing time source modification, this is one that I
would really like to see in the Linux kernel : it would provide the kind
of time source needed for function entry/exit tracing and for generic
kernel tracing as well.

Mathieu

Here is the patch, for reference. It applies on 2.6.24-rc7, after some
other LTTng patches in my patchset.


LTTng timestamp

LTTng synthetic TSC code for timestamping. Extracts 64 bits tsc from a 32 bits
counter, kept up to date by periodical timer interrupt. Lockless.

Signed-off-by: Mathieu Desnoyers <***@polymtl.ca>
---
init/Kconfig | 2
ltt/Kconfig | 17 ++++
ltt/Makefile | 1
ltt/ltt-timestamp.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 218 insertions(+)

Index: linux-2.6-lttng/ltt/ltt-timestamp.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/ltt-timestamp.c 2007-12-05 20:54:48.000000000 -0500
@@ -0,0 +1,198 @@
+/*
+ * (C) Copyright 2006,2007 -
+ * Mathieu Desnoyers (***@polymtl.ca)
+ *
+ * notes : ltt-timestamp timer-based clock cannot be used for early tracing in
+ * the boot process, as it depends on timer interrupts.
+ *
+ * The timer needs to be only on one CPU to support hotplug.
+ * We have the choice between schedule_delayed_work_on and an IPI to get each
+ * CPU to write the heartbeat. IPI has been chosen because it is considered
+ * faster than passing through the timer to get the work scheduled on all the
+ * CPUs.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/ltt.h>
+#include <linux/smp.h>
+#include <linux/sched.h> /* FIX for m68k local_irq_enable in on_each_cpu */
+
+atomic_t lttng_generic_clock;
+EXPORT_SYMBOL(lttng_generic_clock);
+
+/* Expected maximum interrupt latency in ms : 15ms, *2 for security */
+#define EXPECTED_INTERRUPT_LATENCY 30
+
+static struct timer_list stsc_timer;
+static unsigned int precalc_expire;
+
+/* For architectures with 32 bits TSC */
+static struct synthetic_tsc_struct {
+ u32 tsc[2][2]; /* a pair of 2 32 bits. [0] is the MSB, [1] is LSB */
+ unsigned int index; /* Index of the current synth. tsc. */
+} ____cacheline_aligned synthetic_tsc[NR_CPUS];
+
+/* Called from IPI : either in interrupt or process context */
+static void ltt_update_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u32 tsc;
+
+ preempt_disable();
+ cpu_synth = &synthetic_tsc[smp_processor_id()];
+ tsc = ltt_get_timestamp32(); /* We deal with a 32 LSB TSC */
+
+ if (tsc < cpu_synth->tsc[cpu_synth->index][1]) {
+ unsigned int new_index = cpu_synth->index ? 0 : 1; /* 0 <-> 1 */
+ /*
+ * Overflow
+ * Non atomic update of the non current synthetic TSC, followed
+ * by an atomic index change. There is no write concurrency,
+ * so the index read/write does not need to be atomic.
+ */
+ cpu_synth->tsc[new_index][1] = tsc; /* LSB update */
+ cpu_synth->tsc[new_index][0] =
+ cpu_synth->tsc[cpu_synth->index][0]+1; /* MSB update */
+ cpu_synth->index = new_index; /* atomic change of index */
+ } else {
+ /*
+ * No overflow : we can simply update the 32 LSB of the current
+ * synthetic TSC as it's an atomic write.
+ */
+ cpu_synth->tsc[cpu_synth->index][1] = tsc;
+ }
+ preempt_enable();
+}
+
+/* Called from buffer switch : in _any_ context (even NMI) */
+u64 ltt_read_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 ret;
+ unsigned int index;
+ u32 tsc;
+
+ preempt_disable();
+ cpu_synth = &synthetic_tsc[smp_processor_id()];
+ index = cpu_synth->index; /* atomic read */
+ tsc = ltt_get_timestamp32(); /* We deal with a 32 LSB TSC */
+
+ if (tsc < cpu_synth->tsc[index][1]) {
+ /* Overflow */
+ ret = ((u64)(cpu_synth->tsc[index][0]+1) << 32) | ((u64)tsc);
+ } else {
+ /* no overflow */
+ ret = ((u64)cpu_synth->tsc[index][0] << 32) | ((u64)tsc);
+ }
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_read_synthetic_tsc);
+
+static void synthetic_tsc_ipi(void *info)
+{
+ ltt_update_synthetic_tsc();
+}
+
+/* We need to be in process context to do an IPI */
+static void synthetic_tsc_work(struct work_struct *work)
+{
+ on_each_cpu(synthetic_tsc_ipi, NULL, 1, 1);
+}
+static DECLARE_WORK(stsc_work, synthetic_tsc_work);
+
+/*
+ * stsc_timer : - Timer function synchronizing synthetic TSC.
+ * @data: unused
+ *
+ * Guarantees at least 1 execution before low word of TSC wraps.
+ */
+static void stsc_timer_fct(unsigned long data)
+{
+ PREPARE_WORK(&stsc_work, synthetic_tsc_work);
+ schedule_work(&stsc_work);
+
+ mod_timer(&stsc_timer, jiffies + precalc_expire);
+}
+
+/*
+ * precalc_stsc_interval: - Precalculates the interval between the 32 bits TSC
+ * wraparounds.
+ */
+static int __init precalc_stsc_interval(void)
+{
+ unsigned long mask;
+
+ mask = 0xFFFFFFFFUL;
+ precalc_expire =
+ (mask/((ltt_frequency() / HZ * ltt_freq_scale()) << 1)
+ - 1 - (EXPECTED_INTERRUPT_LATENCY*HZ/1000)) >> 1;
+ WARN_ON(precalc_expire == 0);
+ printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n",
+ precalc_expire);
+ return 0;
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Sets the new CPU's current synthetic TSC to the same value as the
+ * currently running CPU.
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 local_count;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ cpu_synth = &synthetic_tsc[hotcpu];
+ local_count = ltt_read_synthetic_tsc();
+ cpu_synth->tsc[0][1] = (u32)local_count; /* LSB */
+ cpu_synth->tsc[0][0] = (u32)(local_count >> 32); /* MSB */
+ cpu_synth->index = 0;
+ smp_wmb(); /* Writing in data of CPU about to come up */
+ break;
+ case CPU_ONLINE:
+ /*
+ * FIXME : heartbeat events are currently broken with CPU
+ * hotplug : events can be recorded before heartbeat, heartbeat
+ * too far from trace start and are broken with trace stop/start
+ * as well.
+ */
+ /* As we are preemptible, make sure it runs on the right cpu */
+ smp_call_function_single(hotcpu, synthetic_tsc_ipi, NULL, 1, 0);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/* Called from one CPU, before any tracing starts, to init each structure */
+static int __init ltt_init_synthetic_tsc(void)
+{
+ int cpu;
+ hotcpu_notifier(hotcpu_callback, 3);
+ precalc_stsc_interval();
+ init_timer(&stsc_timer);
+ stsc_timer.function = stsc_timer_fct;
+ stsc_timer.expires = jiffies + precalc_expire;
+ add_timer(&stsc_timer);
+ return 0;
+}
+
+__initcall(ltt_init_synthetic_tsc);
Index: linux-2.6-lttng/ltt/Kconfig
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/Kconfig 2007-12-05 20:54:48.000000000 -0500
@@ -0,0 +1,17 @@
+menu "Linux Trace Toolkit"
+
+config LTT_TIMESTAMP
+ bool "LTTng fine-grained timestamping"
+ default y
+ help
+ Allow fine-grained timestamps to be taken from tracing applications.
+
+config HAVE_LTT_CLOCK
+ def_bool n
+
+config HAVE_LTT_SYNTHETIC_TSC
+ bool
+ default y if (!HAVE_LTT_CLOCK)
+ default n if HAVE_LTT_CLOCK
+
+endmenu
Index: linux-2.6-lttng/ltt/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/Makefile 2007-12-05 20:54:48.000000000 -0500
@@ -0,0 +1 @@
+obj-$(CONFIG_HAVE_LTT_SYNTHETIC_TSC) += ltt-timestamp.o
Index: linux-2.6-lttng/init/Kconfig
===================================================================
--- linux-2.6-lttng.orig/init/Kconfig 2007-12-05 20:53:35.000000000 -0500
+++ linux-2.6-lttng/init/Kconfig 2007-12-05 20:54:48.000000000 -0500
@@ -682,6 +682,8 @@ config MARKERS
Place an empty function call at each marker site. Can be
dynamically changed for a probe function.

+source "ltt/Kconfig"
+
source "arch/Kconfig"

config DISABLE_IMMEDIATE
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
john stultz
2008-01-10 21:30:21 UTC
Post by Mathieu Desnoyers
I think it's about time I introduce the approach I have taken for LTTng
timestamping. Basically, one of the main issues with the clock sources
is the xtime lock : having a read seqlock nested over a write seqlock is
a really, really bad idea. This can happen with NMIs. Basically, it
would cause a deadlock.
What I have done is an RCU algorithm that extends a 32 bits TSC (that's
the case on MIPS, for instance) to 64 bits. The update of the MSBs is
done by a periodical timer (fired often enough to make sure we always
detect the 32 LSBs wrap-around) and the read-side only has to disable
preemption.
I use a 2 slots array, each of them keeping, alternatively, the last 64
bits counter value, to implement the RCU algorithm.
Since we are discussing time source modification, this is one that I
would really like to see in the Linux kernel : it would provide the kind
of time source needed for function entry/exit tracing and for generic
kernel tracing as well.
Hmm. I know powerpc has had a similar lock-free dual structure method
and for just a raw cycles based method you've shown below (or for some
of the bits Steven is working on), I think it should be fine.

The concern I've had with this method for general timekeeping, is that
I'm not sure it can handle the frequency corrections made by NTP. Since
we have to make sure time does not jump backwards, consider this
exaggerated situation:

time = base + (now - last)*mult;

So we have two structures:
base: 60 base: 180
last: 10 last: 30
mult: 06 mult: 05

Where the second structure has just been updated lock-free, however just
before the atomic pointer switch we were preempted, or somehow delayed,
and some time has past.

Now imagine two cpus now race to get the time. Both read the same now
value, but get different structure pointer values. (Note: You can create
the same race if you reverse the order and grab the pointer first, then
the cycle. However I think this example makes it easier to understand).

now = 50
cpu1:
60 + (50-10)*6 = 300
cpu2:
180 + (50-30)*5 = 280


Alternatively:
now=50: 60 + (50-10)*6 = 300
now=51: 180 + (51-30)*5 = 285

Eek. That's not good.

I'm not sure how this can be avoided, but I'd be very interested in
hearing ideas! Bounding the issue is a possibility, but then starts to
run amok with NO_HZ and -rt deferment.

thanks
-john


Mathieu Desnoyers
2008-01-10 22:10:21 UTC
Post by john stultz
Post by Mathieu Desnoyers
I think it's about time I introduce the approach I have taken for LTTng
timestamping. Basically, one of the main issues with the clock sources
is the xtime lock : having a read seqlock nested over a write seqlock is
a really, really bad idea. This can happen with NMIs. Basically, it
would cause a deadlock.
What I have done is an RCU algorithm that extends a 32 bits TSC (that's
the case on MIPS, for instance) to 64 bits. The update of the MSBs is
done by a periodical timer (fired often enough to make sure we always
detect the 32 LSBs wrap-around) and the read-side only has to disable
preemption.
I use a 2 slots array, each of them keeping, alternatively, the last 64
bits counter value, to implement the RCU algorithm.
Since we are discussing time source modification, this is one that I
would really like to see in the Linux kernel : it would provide the kind
of time source needed for function entry/exit tracing and for generic
kernel tracing as well.
Hmm. I know powerpc has had a similar lock-free dual structure method
and for just a raw cycles based method you've shown below (or for some
of the bits Steven is working on), I think it should be fine.
The concern I've had with this method for general timekeeping, is that
I'm not sure it can handle the frequency corrections made by NTP. Since
we have to make sure time does not jump backwards, consider this
time = base + (now - last)*mult;
base: 60 base: 180
last: 10 last: 30
mult: 06 mult: 05
Where the second structure has just been updated lock-free, however just
before the atomic pointer switch we were preempted, or somehow delayed,
and some time has past.
Now imagine two cpus now race to get the time. Both read the same now
value, but get different structure pointer values. (Note: You can create
the same race if you reverse the order and grab the pointer first, then
the cycle. However I think this example makes it easier to understand).
now = 50
60 + (50-10)*6 = 300
180 + (50-30)*5 = 280
now=50: 60 + (50-10)*6 = 300
now=51: 180 + (51-30)*5 = 285
Eek. That's not good.
I'm not sure how this can be avoided, but I'd be very interested in
hearing ideas! Bounding the issue is a possibility, but then starts to
run amok with NO_HZ and -rt deferment.
thanks
-john
I suggest we try to see the problem differently (and see how far we can
get with this) :

Let's suppose we have a 32 bits cycles counter given by the
architecture. We use the lockless algorithm to extend it to 64 bits : we
therefore have a 64 bits cycle counter that can be read locklessly.

Then, from this 64 bits counter (let's call it "now") (which has the
advantage of never overflowing, or after enough years so nobody
cares...), we can calculate the current time with :

time = base + (now - last) * mul

NTP would adjust time by modifying mul, last and base; base would be
recalculated from the formula with : base + (now - last) * mul each time
we modify the clock rate (mul), we also read the current "now" value
(which is saved as "last"). This would be done system-wide and would be
kept in a data structure separate from the 2 64 bits slots array.
Ideally, this NTP correction update could also be done atomically with a
2 slots array.

Whenever we need to read the time, we then have to insure that the "now"
value we use is consistent with the current NTP time correction. We want
to eliminate races where we would use value from the wrong NTP "window"
with a "now" value not belonging to this window. (an NTP window would be
defined by a tuple of base, last and mul values)

If "now" is lower than "last", we are using an old timestamp with a
new copy of the structure and must therefore re-read the "now" value.

If, when we are about to return the "time" value calculated, we figure
out that the current NTP window pointer have changed, we must make sure
that the time value we are about to return is lower than the new base or
otherwise time could go backward. If we detect that time is higher than
the new base, we re-read the "now" value and re-do the calculation.
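A rough read-side sketch of what I mean (the struct ntp_window and
current_window names are made up for illustration):

struct ntp_window {
	u64 base;	/* time at "last" */
	u64 last;	/* 64-bit extended cycle value at the last rate change */
	u32 mul;
};

static struct ntp_window *current_window;	/* flipped between 2 slots */

u64 read_time(void)
{
	struct ntp_window *win;
	u64 now, time;

	for (;;) {
		win = rcu_dereference(current_window);
		now = ltt_read_synthetic_tsc();
		if (now < win->last)
			continue;	/* old "now" vs. a newer window: re-read */
		time = win->base + (now - win->last) * win->mul;
		if (rcu_dereference(current_window) == win)
			break;		/* window did not change under us */
		if (time <= rcu_dereference(current_window)->base)
			break;		/* changed, but we cannot run past the new base */
	}
	return time;
}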

Am I only partially crazy ? ;)

Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-10 22:50:09 UTC
Post by Mathieu Desnoyers
Am I only partially crazy ? ;)
Not at all, you should just do some more shopping!

-- Steve

john stultz
2008-01-10 23:00:20 UTC
Post by Mathieu Desnoyers
Post by john stultz
Post by Mathieu Desnoyers
I think it's about time I introduce the approach I have taken for LTTng
timestamping. Basically, one of the main issues with the clock sources
is the xtime lock : having a read seqlock nested over a write seqlock is
a really, really bad idea. This can happen with NMIs. Basically, it
would cause a deadlock.
What I have done is an RCU algorithm that extends a 32 bits TSC (that's
the case on MIPS, for instance) to 64 bits. The update of the MSBs is
done by a periodical timer (fired often enough to make sure we always
detect the 32 LSBs wrap-around) and the read-side only has to disable
preemption.
I use a 2 slots array, each of them keeping, alternatively, the last 64
bits counter value, to implement the RCU algorithm.
Since we are discussing time source modification, this is one that I
would really like to see in the Linux kernel : it would provide the kind
of time source needed for function entry/exit tracing and for generic
kernel tracing as well.
Hmm. I know powerpc has had a similar lock-free dual structure method
and for just a raw cycles based method you've shown below (or for some
of the bits Steven is working on), I think it should be fine.
The concern I've had with this method for general timekeeping, is that
I'm not sure it can handle the frequency corrections made by NTP. Since
we have to make sure time does not jump backwards, consider this
time = base + (now - last)*mult;
base: 60 base: 180
last: 10 last: 30
mult: 06 mult: 05
Where the second structure has just been updated lock-free, however just
before the atomic pointer switch we were preempted, or somehow delayed,
and some time has past.
Now imagine two cpus now race to get the time. Both read the same now
value, but get different structure pointer values. (Note: You can create
the same race if you reverse the order and grab the pointer first, then
the cycle. However I think this example makes it easier to understand).
now = 50
60 + (50-10)*6 = 300
180 + (50-30)*5 = 280
now=50: 60 + (50-10)*6 = 300
now=51: 180 + (51-30)*5 = 285
Eek. That's not good.
I'm not sure how this can be avoided, but I'd be very interested in
hearing ideas! Bounding the issue is a possibility, but then starts to
run amok with NO_HZ and -rt deferment.
thanks
-john
I suggest we try to see the problem differently (and see how far we can
Let's suppose we have a 32 bits cycles counter given by the
architecture. We use the lockless algorithm to extend it to 64 bits : we
therefore have a 64 bits cycle counter that can be read locklessly.
Then, from this 64 bits counter (let's call it "now") (which has the
advantage of never overflowing, or after enough years so nobody
Hmm. Maybe I'm missing something here. I'm not sure I'm following the
importance of the 64bit extension.

The clocksource code deals with counters in a range of widths (from
64bit TSC to 24bit ACPI PM). The only requirement there is that we
accumulate often enough that the counter doesn't wrap twice between
accumulations. Currently this is done in update_wall_time(), but the
patch Steven sent that started this thread adds an interim step using
cycle_accumulated, allowing update_wall_time() to be deferred for longer
periods of time.

I do see how the method you're describing could be applied to just the
cycle_accumulated management, and maybe that's the whole point?

However my concern is that when we do the frequency adjustment in
update_wall_time, I'm not sure the method works. Thus we still would
have to have a lock in there for gettimeofday().

But let's continue...
Post by Mathieu Desnoyers
time = base + (now - last) * mul
NTP would adjust time by modifying mul, last and base: each time we
modify the clock rate (mul), base would be recalculated from the formula
as base + (now - last) * mul, and we would also read the current "now"
value (which is saved as "last"). This would be done system-wide and
would be kept in a data structure separate from the 2-slot array of
64-bit values.
Ideally, this NTP correction update could also be done atomically with a
2 slots array.
Whenever we need to read the time, we then have to ensure that the "now"
value we use is consistent with the current NTP time correction. We want
to eliminate races where we would use a value from the wrong NTP "window"
with a "now" value not belonging to that window (an NTP window being
defined by a tuple of base, last and mul values).
If "now" is lower than "last", we are using an old timestamp with a
new copy of the structure and must therefore re-read the "now" value.
Ok, that would avoid one type of error, but in both of my examples in
the last mail, now was greater than last.
Post by Mathieu Desnoyers
If, when we are about to return the "time" value calculated, we figure
out that the current NTP window pointer has changed, we must make sure
that the time value we are about to return is lower than the new base, or
otherwise time could go backward. If we detect that the time is higher
than the new base, we re-read the "now" value and redo the calculation.
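Spelling that proposal out as a sketch may help. The names here are made
up, read_synthetic_tsc() is the lockless 64-bit counter from the earlier
sketch, memory barriers are omitted, and whether the final check really
closes the race is exactly what is questioned below:

#include <linux/types.h>

/* One NTP "window": the (base, last, mul) tuple described above. */
struct ntp_window {
	u64 base;
	u64 last;
	u32 mul;
};

static struct ntp_window ntp_windows[2];
static struct ntp_window *cur_window = &ntp_windows[0];

extern u64 read_synthetic_tsc(void);

static u64 read_time(void)
{
	struct ntp_window *win;
	u64 now, t;

retry:
	now = read_synthetic_tsc();
	win = cur_window;
	if (now < win->last)
		goto retry;     /* "now" predates this window: re-read */
	t = win->base + (now - win->last) * win->mul;
	if (win != cur_window && t > cur_window->base)
		goto retry;     /* window switched and we overshot its base */
	return t;
}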
Again, I'm not sure I see how this resolves the previous example given,
as in that case the update code was delayed in between its reading of
now and the final pointer change.

The issue is that the race isn't just between the readers and the
writer, but that time races against the writer as well. So if you don't
lock the readers out during the write, I'm not sure how you can avoid the
window for inconsistencies.

thanks
-john


john stultz
2008-01-10 23:10:19 UTC
Permalink
Post by john stultz
The issue is that the race isn't just between the readers and the
writer, but that time races against the writer as well. So if you don't
lock the readers out during the write, I'm not sure how you can avoid the
window for inconsistencies.
Maybe to state it more clearly, the issue is that in order to be atomic,
the writer must atomically access the clock, and make all the updates in
one step.

So if a reader accesses the clock after the writer accesses the clock,
but before the writer finishes the update, there is the possibility time
could go backwards.

Now, not to completely throw water on it, it is possible to set up a
bounds argument, and say that as long as NTP adjustments are less than X,
and the window between the writer starting and finishing the update is
less than Y, the resulting inconsistency is limited to Z. And if Z is
less than a nanosecond, then you're OK.
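(To put made-up numbers on that, purely for illustration: NTP's slew is
capped at 500 ppm, so if the writer can be held up for Y = 10 us between
reading the clock and switching the pointer, the two windows can disagree
by up to roughly Z = 10 us * 5e-4 = 5 ns over that span, already past the
one-nanosecond mark, and Y only grows under virtualization or -rt
preemption.)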

However, items like virtualization and the realtime patch can cause Y to
be stretched quite a bit, so finding a way to handle that would be
needed as well.

thanks
-john


Luck, Tony
2008-01-10 21:40:10 UTC
Permalink
Post by john stultz
If you noticed in my email, the fix for ppc was a bit easier, as it has
only a 64bit counter that is quite unlikely to wrap twice between calls
to update_wall_time().
"quite unlikely" ...

Hmmm, just how fast are you driving the clocks on your ppc? Even at 100GHz
it is almost SIX YEARS between wrap-arounds of a 64-bit counter.
Perhaps you could just have a cron job that forces a call to update_wall_time()
every January 1st, rather than add extra code overhead to a hot path :-) ?
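(For reference, that works out to 2^64 cycles / 10^11 Hz ~= 1.8 x 10^8
seconds, or about 5.8 years between wrap-arounds.)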

But I agree that narrower counters are a problem.

-Tony
john stultz
2008-01-10 00:30:13 UTC
Permalink
plain text document attachment (rt-time-starvation-fix.patch)
Handle accurate time even if there's a long delay between
accumulated clock cycles.
Hrmm.. One more item I just noticed.
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:31.000000000 -0500
@@ -448,27 +449,29 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t offset;
+ cycle_t cycle_now, offset;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ cycle_now = clocksource_read(clock);
#else
- offset = clock->cycle_interval;
+ cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
+ offset = (cycle_now - clock->cycle_last) & clock->mask;
It seems this offset addition was to merge against the colliding
xtime_cache changes in mainline. However, I don't think it's quite right,
and it might be causing incorrect time() or vtime() results if NO_HZ is
enabled.
+ clocksource_accumulate(clock, cycle_now);
+
clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (clock->cycle_accumulated >= clock->cycle_interval) {
/* accumulate one interval */
clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
+ clock->cycle_accumulated -= clock->cycle_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
@@ -482,7 +485,7 @@ void update_wall_time(void)
}
/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ clocksource_adjust(clock->cycle_accumulated);
I suspect the following is needed, but haven't been able to test it yet.

thanks
-john


Fixup merge between xtime_cache and timekeeping starvation fix.

Signed-off-by: John Stultz <***@us.ibm.com>

Index: 2.6/kernel/time/timekeeping.c
===================================================================
--- 2.6.orig/kernel/time/timekeeping.c 2008-01-09 16:12:49.000000000 -0800
+++ 2.6/kernel/time/timekeeping.c 2008-01-09 16:13:18.000000000 -0800
@@ -449,7 +449,7 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t cycle_now, offset;
+ cycle_t cycle_now;

/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -460,7 +460,6 @@ void update_wall_time(void)
#else
cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
- offset = (cycle_now - clock->cycle_last) & clock->mask;
clocksource_accumulate(clock, cycle_now);

clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
@@ -491,7 +490,7 @@ void update_wall_time(void)
xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;

- update_xtime_cache(cyc2ns(clock, offset));
+ update_xtime_cache(cyc2ns(clock, clock->cycle_accumulated));

/* check to see if there is a new clocksource to use */
change_clocksource();


Steven Rostedt
2008-01-10 00:30:15 UTC
Permalink
Post by john stultz
Post by Steven Rostedt
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:31.000000000 -0500
@@ -448,27 +449,29 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t offset;
+ cycle_t cycle_now, offset;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ cycle_now = clocksource_read(clock);
#else
- offset = clock->cycle_interval;
+ cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
+ offset = (cycle_now - clock->cycle_last) & clock->mask;
It seems this offset addition was to merge against the colliding
xtime_cache changes in mainline. However, I don't think it's quite right,
and it might be causing incorrect time() or vtime() results if NO_HZ is
enabled.
Yeah, this has had a few clashes in its life in the RT kernel.
Post by john stultz
Post by Steven Rostedt
+ clocksource_accumulate(clock, cycle_now);
+
clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (clock->cycle_accumulated >= clock->cycle_interval) {
/* accumulate one interval */
clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
+ clock->cycle_accumulated -= clock->cycle_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
@@ -482,7 +485,7 @@ void update_wall_time(void)
}
/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ clocksource_adjust(clock->cycle_accumulated);
I suspect the following is needed, but haven't been able to test it yet.
Thanks, I'll pull it in and start testing it.

-- Steve

Steven Rostedt
2008-01-09 23:40:09 UTC
Permalink
If CONFIG_MCOUNT is selected and /proc/sys/kernel/mcount_enabled is set to a
non-zero value, the mcount routine will be called every time we enter a kernel
function that is not marked with the "notrace" attribute.

The mcount routine will then call a registered function, if one
happens to be registered.

[This code has been highly hacked by Steven Rostedt, so don't
blame Arnaldo for all of this ;-) ]

Signed-off-by: Arnaldo Carvalho de Melo <***@ghostprotocols.net>
Signed-off-by: Steven Rostedt <***@redhat.com>
---
Makefile | 4 ++
arch/x86/Kconfig | 4 ++
arch/x86/kernel/Makefile_32 | 1
arch/x86/kernel/entry_64.S | 40 ++++++++++++++++++++
arch/x86/kernel/mcount-wrapper.S | 25 ++++++++++++
include/linux/linkage.h | 2 +
include/linux/mcount.h | 21 ++++++++++
kernel/sysctl.c | 11 +++++
lib/Kconfig.debug | 2 +
lib/Makefile | 2 +
lib/tracing/Kconfig | 7 +++
lib/tracing/Makefile | 3 +
lib/tracing/mcount.c | 77 +++++++++++++++++++++++++++++++++++++++
13 files changed, 199 insertions(+)
create mode 100644 arch/i386/kernel/mcount-wrapper.S
create mode 100644 lib/tracing/Kconfig
create mode 100644 lib/tracing/Makefile
create mode 100644 lib/tracing/mcount.c
create mode 100644 lib/tracing/mcount.h

Index: linux-compile-i386.git/Makefile
===================================================================
--- linux-compile-i386.git.orig/Makefile 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/Makefile 2008-01-09 14:10:07.000000000 -0500
@@ -509,6 +509,10 @@ endif

include $(srctree)/arch/$(SRCARCH)/Makefile

+# MCOUNT expects frame pointer
+ifdef CONFIG_MCOUNT
+KBUILD_CFLAGS += -pg
+endif
ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
else
Index: linux-compile-i386.git/arch/x86/Kconfig
===================================================================
--- linux-compile-i386.git.orig/arch/x86/Kconfig 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/Kconfig 2008-01-09 14:10:07.000000000 -0500
@@ -28,6 +28,10 @@ config GENERIC_CMOS_UPDATE
bool
default y

+config ARCH_HAS_MCOUNT
+ bool
+ default y
+
config CLOCKSOURCE_WATCHDOG
bool
default y
Index: linux-compile-i386.git/arch/x86/kernel/Makefile_32
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/Makefile_32 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/Makefile_32 2008-01-09 14:10:07.000000000 -0500
@@ -23,6 +23,7 @@ obj-$(CONFIG_APM) += apm_32.o
obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
obj-$(CONFIG_SMP) += smpcommon_32.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
+obj-$(CONFIG_MCOUNT) += mcount-wrapper.o
obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
Index: linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S 2008-01-09 14:10:07.000000000 -0500
@@ -0,0 +1,25 @@
+/*
+ * linux/arch/x86/mcount-wrapper.S
+ *
+ * Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+mcount:
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %ebp
+ mov %esp, %ebp
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ call __mcount
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ popl %ebp
+out:
+ ret
Index: linux-compile-i386.git/include/linux/linkage.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/linkage.h 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/include/linux/linkage.h 2008-01-09 14:10:07.000000000 -0500
@@ -3,6 +3,8 @@

#include <asm/linkage.h>

+#define notrace __attribute__((no_instrument_function))
+
#ifdef __cplusplus
#define CPP_ASMLINKAGE extern "C"
#else
Index: linux-compile-i386.git/kernel/sysctl.c
===================================================================
--- linux-compile-i386.git.orig/kernel/sysctl.c 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/kernel/sysctl.c 2008-01-09 14:10:07.000000000 -0500
@@ -46,6 +46,7 @@
#include <linux/nfs_fs.h>
#include <linux/acpi.h>
#include <linux/reboot.h>
+#include <linux/mcount.h>

#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -470,6 +471,16 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#ifdef CONFIG_MCOUNT
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "mcount_enabled",
+ .data = &mcount_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_KMOD
{
.ctl_name = KERN_MODPROBE,
Index: linux-compile-i386.git/lib/Kconfig.debug
===================================================================
--- linux-compile-i386.git.orig/lib/Kconfig.debug 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/lib/Kconfig.debug 2008-01-09 14:10:07.000000000 -0500
@@ -517,4 +517,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER
help
Provide stacktrace filter for fault-injection capabilities

+source lib/tracing/Kconfig
+
source "samples/Kconfig"
Index: linux-compile-i386.git/lib/Makefile
===================================================================
--- linux-compile-i386.git.orig/lib/Makefile 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/lib/Makefile 2008-01-09 14:10:07.000000000 -0500
@@ -66,6 +66,8 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o

+obj-$(CONFIG_MCOUNT) += tracing/
+
lib-$(CONFIG_GENERIC_BUG) += bug.o

hostprogs-y := gen_crc32table
Index: linux-compile-i386.git/lib/tracing/Kconfig
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/Kconfig 2008-01-09 15:17:48.000000000 -0500
@@ -0,0 +1,7 @@
+
+# MCOUNT itself is useless, or will just be added overhead.
+# It needs something to register a function with it.
+config MCOUNT
+ bool
+ depends on DEBUG_KERNEL
+ select FRAME_POINTER
Index: linux-compile-i386.git/lib/tracing/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/Makefile 2008-01-09 15:17:48.000000000 -0500
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MCOUNT) += libmcount.o
+
+libmcount-y := mcount.o
Index: linux-compile-i386.git/lib/tracing/mcount.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/mcount.c 2008-01-09 14:10:07.000000000 -0500
@@ -0,0 +1,77 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007 Arnaldo Carvalho de Melo <***@redhat.com>
+ *
+ * Converted to be more generic:
+ * Copyright (C) 2007-2008 Steven Rostedt <***@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/module.h>
+#include <linux/mcount.h>
+
+/*
+ * Since we have nothing protecting between the test of
+ * mcount_trace_function and the call to it, we can't
+ * set it to NULL without risking a race that will have
+ * the kernel call the NULL pointer. Instead, we just
+ * set the function pointer to a dummy function.
+ */
+notrace void dummy_mcount_tracer(unsigned long ip,
+ unsigned long parent_ip)
+{
+ /* do nothing */
+}
+
+mcount_func_t mcount_trace_function __read_mostly = dummy_mcount_tracer;
+int mcount_enabled __read_mostly;
+
+/** __mcount - hook for profiling
+ *
+ * This routine is called from the arch specific mcount routine, that in turn is
+ * called from code inserted by gcc -pg.
+ */
+notrace void __mcount(void)
+{
+ mcount_trace_function(CALLER_ADDR1, CALLER_ADDR2);
+}
+EXPORT_SYMBOL_GPL(mcount);
+/*
+ * The above EXPORT_SYMBOL is for the gcc call of mcount and not the
+ * function __mcount that it is underneath. I put the export there
+ * to fool checkpatch.pl. It wants that export to be with the
+ * function, but that function happens to be in assembly.
+ */
+
+/**
+ * register_mcount_function - register a function for profiling
+ * @func - the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @func and all the functions it calls must be labeled
+ * with "notrace", otherwise it will go into a
+ * recursive loop.
+ */
+int register_mcount_function(mcount_func_t func)
+{
+ mcount_trace_function = func;
+ return 0;
+}
+
+/**
+ * clear_mcount_function - reset the mcount function
+ *
+ * This NULLs the mcount function and in essence stops
+ * tracing. There may be lag
+ */
+void clear_mcount_function(void)
+{
+ mcount_trace_function = dummy_mcount_tracer;
+}
Index: linux-compile-i386.git/include/linux/mcount.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/include/linux/mcount.h 2008-01-09 15:17:20.000000000 -0500
@@ -0,0 +1,21 @@
+#ifndef _LINUX_MCOUNT_H
+#define _LINUX_MCOUNT_H
+
+#ifdef CONFIG_MCOUNT
+extern int mcount_enabled;
+
+#include <linux/linkage.h>
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+
+typedef void (*mcount_func_t)(unsigned long ip, unsigned long parent_ip);
+
+extern void mcount(void);
+
+int register_mcount_function(mcount_func_t func);
+void clear_mcount_function(void);
+
+#endif /* CONFIG_MCOUNT */
+#endif /* _LINUX_MCOUNT_H */
Index: linux-compile-i386.git/arch/x86/kernel/entry_64.S
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/entry_64.S 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/entry_64.S 2008-01-09 14:10:07.000000000 -0500
@@ -53,6 +53,46 @@

.code64

+#ifdef CONFIG_MCOUNT
+
+ENTRY(mcount)
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %rbp
+ mov %rsp,%rbp
+
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rdx
+ push %rcx
+ push %rax
+
+ mov 0x0(%rbp),%rax
+ mov 0x8(%rbp),%rdi
+ mov 0x8(%rax),%rsi
+
+ call *mcount_trace_function
+
+ pop %rax
+ pop %rcx
+ pop %rdx
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+
+ pop %rbp
+out:
+ ret
+#endif
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
--
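As an illustration of the interface this patch exports: the tracer body
below (my_tracer) is made up, and a real user also has to write 1 to
/proc/sys/kernel/mcount_enabled before the hook ever fires.

#include <linux/init.h>
#include <linux/mcount.h>

/*
 * The hook and everything it calls must be marked notrace,
 * otherwise mcount would recurse into itself.
 */
static notrace void my_tracer(unsigned long ip, unsigned long parent_ip)
{
	/* ip is the traced function, parent_ip its caller */
}

static int __init my_tracer_init(void)
{
	return register_mcount_function(my_tracer);
}
device_initcall(my_tracer_init);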
Sam Ravnborg
2008-01-10 18:30:15 UTC
Permalink
Hi Steven.
Post by Steven Rostedt
Index: linux-compile-i386.git/arch/x86/Kconfig
===================================================================
--- linux-compile-i386.git.orig/arch/x86/Kconfig 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/Kconfig 2008-01-09 14:10:07.000000000 -0500
@@ -28,6 +28,10 @@ config GENERIC_CMOS_UPDATE
bool
default y
+config ARCH_HAS_MCOUNT
+ bool
+ default y
+
Please use the following scheme:

arch/x86/Kconfig:
config X86
+ select HAVE_MCOUNT

lib/tracing/Kconfig

+ # ARCH shall select HAVE_MCOUNT if they provide this function
+ config HAVE_MCOUNT
+ bool
+
+ config MCOUNT
+ bool
+ select FRAME_POINTER

And then in your later patches:
+config MCOUNT_TRACER
+ bool "Profiler instrumentation based tracer"
+ depends on DEBUG_KERNEL && HAVE_MCOUNT
+ select MCOUNT
+ help
+ Use profiler....

The "default n" is a noop since this is the default.
And note that the depends on is removed from MCOUNT
because you use it as a select target (so dependencies
are not checked anyway).

With this scheme implemented you:
- Use new naming convention (HAVE_*)
- Avoid defining one config variable per arch
- Do not have dependencies on selected symbols
- More compact representation in arch Kconfig files

Sam
Steven Rostedt
2008-01-10 19:20:11 UTC
Permalink
Hi Sam,
Post by Sam Ravnborg
Hi Steven.
Post by Steven Rostedt
Index: linux-compile-i386.git/arch/x86/Kconfig
===================================================================
--- linux-compile-i386.git.orig/arch/x86/Kconfig 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/Kconfig 2008-01-09 14:10:07.000000000 -0500
@@ -28,6 +28,10 @@ config GENERIC_CMOS_UPDATE
bool
default y
+config ARCH_HAS_MCOUNT
+ bool
+ default y
+
config X86
+ select HAVE_MCOUNT
lib/tracing/Kconfig
+ # ARCH shall select HAVE_MCOUNT if they provide this function
+ config HAVE_MCOUNT
+ bool
+
+ config MCOUNT
+ bool
+ select FRAME_POINTER
+config MCOUNT_TRACER
+ bool "Profiler instrumentation based tracer"
+ depends on DEBUG_KERNEL && HAVE_MCOUNT
+ select MCOUNT
+ help
+ Use profiler....
Thanks, this does look like a cleaner approach. I'll implement it into
my next series.

-- Steve

Steven Rostedt
2008-01-10 20:00:17 UTC
Permalink
Post by Steven Rostedt
Index: linux-compile-i386.git/Makefile
===================================================================
--- linux-compile-i386.git.orig/Makefile 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/Makefile 2008-01-09 14:10:07.000000000 -0500
@@ -509,6 +509,10 @@ endif
include $(srctree)/arch/$(SRCARCH)/Makefile
+# MCOUNT expects frame pointer
This comment looks stray.
Actually it's not ;-)

The original code had something like this:

#if CONFIG_MCOUNT
KBUILD_CFLAGS += ...
#else
#if CONFIG_FRAME_POINTER
KBUILD_CFLAGS += ...
#else
KBUILD_CFLAGS += ...
#endif
#endif

And Sam Ravnborg suggested putting that logic into the Kbuild system,
which I did, but I put that comment there just to let others know that
MCOUNT expects the flags of FRAME_POINTER. But I guess we can nuke that
comment anyway; it just leads to confusion.
Post by Steven Rostedt
+ifdef CONFIG_MCOUNT
+KBUILD_CFLAGS += -pg
+endif
ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
else
Index: linux-compile-i386.git/arch/x86/Kconfig
===================================================================
--- linux-compile-i386.git.orig/arch/x86/Kconfig 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/Kconfig 2008-01-09 14:10:07.000000000 -0500
@@ -28,6 +28,10 @@ config GENERIC_CMOS_UPDATE
bool
default y
+config ARCH_HAS_MCOUNT
+ bool
+ default y
+
config CLOCKSOURCE_WATCHDOG
bool
default y
Index: linux-compile-i386.git/arch/x86/kernel/Makefile_32
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/Makefile_32 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/Makefile_32 2008-01-09 14:10:07.000000000 -0500
@@ -23,6 +23,7 @@ obj-$(CONFIG_APM) += apm_32.o
obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
obj-$(CONFIG_SMP) += smpcommon_32.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
+obj-$(CONFIG_MCOUNT) += mcount-wrapper.o
So far the code organization is different for 32 and 64 bit. I would
suggest to either
o move both trampolines into entry_*.S or
o put them in something like mcount-wrapper_32/64.S.
Yeah, that's a relic from -rt. I never liked that, but I was just too lazy
to change it. I think I'll move the mcount_wrapper into entry_32.S
Post by Steven Rostedt
obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
Index: linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S 2008-01-09 14:10:07.000000000 -0500
@@ -0,0 +1,25 @@
+/*
+ * linux/arch/x86/mcount-wrapper.S
+ *
+ * Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %ebp
+ mov %esp, %ebp
What is the benefit of having a call frame in this trampoline? We used
to carry this in the i386 mcount tracer for Adeos/I-pipe too (it was
derived from the -rt code), but I just successfully tested a removal
patch. Also glibc [1] doesn't include it.
Hmm, what about having frame pointers on? Isn't that a requirement?
Post by Steven Rostedt
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ call __mcount
I think this indirection should be avoided, just like the 64-bit version
and glibc do.
I thought about that too, but didn't have the time to look into the
calling convention for that. <does a quick look at glibc>

# objdump --start-address 0x`nm /lib/libc-2.7.so | sed -ne '/ mcount$/s/^\([0-9a-f]*\).*/\1/p'` -D /lib/libc-2.7.so |head -28 |tail -12
49201cd0 <_mcount>:
49201cd0: 50 push %eax
49201cd1: 51 push %ecx
49201cd2: 52 push %edx
49201cd3: 8b 54 24 0c mov 0xc(%esp),%edx
49201cd7: 8b 45 04 mov 0x4(%ebp),%eax
49201cda: e8 91 f4 ff ff call 49201170
<__mcount_internal>
49201cdf: 5a pop %edx
49201ce0: 59 pop %ecx
49201ce1: 58 pop %eax
49201ce2: c3 ret
49201ce3: 90 nop

Until I find out about the frame pointers, I'll leave the ebp copy in.
Post by Steven Rostedt
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ popl %ebp
+ ret
....
[...]
Post by Steven Rostedt
+/** __mcount - hook for profiling
+ *
+ * This routine is called from the arch specific mcount routine, that in turn is
+ * called from code inserted by gcc -pg.
+ */
+notrace void __mcount(void)
+{
+ mcount_trace_function(CALLER_ADDR1, CALLER_ADDR2);
+}
mcount_trace_function should always be called from the assembly
trampoline, IMO.
I'll try that.
Post by Steven Rostedt
Index: linux-compile-i386.git/include/linux/mcount.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/include/linux/mcount.h 2008-01-09 15:17:20.000000000 -0500
@@ -0,0 +1,21 @@
+#ifndef _LINUX_MCOUNT_H
+#define _LINUX_MCOUNT_H
+
+#ifdef CONFIG_MCOUNT
+extern int mcount_enabled;
+
+#include <linux/linkage.h>
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
Still used when __mcount would be gone?
Will be used later on by the tracers. Actually, I wish this was in a more
generic kernel header, since I find myself typing
"__builtin_return_address" quite often.
Post by Steven Rostedt
+
+typedef void (*mcount_func_t)(unsigned long ip, unsigned long parent_ip);
+
+extern void mcount(void);
+
+int register_mcount_function(mcount_func_t func);
+void clear_mcount_function(void);
+
+#endif /* CONFIG_MCOUNT */
+#endif /* _LINUX_MCOUNT_H */
Index: linux-compile-i386.git/arch/x86/kernel/entry_64.S
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/entry_64.S 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/entry_64.S 2008-01-09 14:10:07.000000000 -0500
@@ -53,6 +53,46 @@
.code64
+#ifdef CONFIG_MCOUNT
+
+ENTRY(mcount)
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %rbp
+ mov %rsp,%rbp
Same as for x86_32.
Same checking for frame pointers too.
Post by Steven Rostedt
+
+ push %r11
+ push %r10
glibc [2] doesn't save those two, and we were also happy without them so
far. Or are there nasty corner-cases in the kernel?
Probably not. I'll see what happens without them.
Post by Steven Rostedt
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rdx
+ push %rcx
+ push %rax
SAVE_ARGS/RESTORE_ARGS and glibc use explicit rsp manipulation + movq
instead of push/pop. I wonder if there is a small advantage, but I'm not
that deep into this arch.
Yeah, it's probably a bit faster to do the mov instead. I'll add that.
Post by Steven Rostedt
+
+ mov 0x0(%rbp),%rax
+ mov 0x8(%rbp),%rdi
+ mov 0x8(%rax),%rsi
See [2] for saving one instruction here. :)
hehe, yeah, will do.
Post by Steven Rostedt
+
+ call *mcount_trace_function
+
+ pop %rax
+ pop %rcx
+ pop %rdx
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+
+ pop %rbp
+ ret
+#endif
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
This generic approach is very much appreciated here as well. It would take
away the burden of maintaining the arch-dependent stubs within I-pipe.
What we could contribute later on is a blackfin trampoline; there is
just still a bug in their toolchain that breaks mcount for modules. But
I could check with the bfin guys again about the progress and underline
the importance of this long-pending issue.
Thanks,

-- Steve

Steven Rostedt
2008-01-10 23:10:22 UTC
Permalink
Post by Steven Rostedt
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S 2008-01-09 14:10:07.000000000 -0500
@@ -0,0 +1,25 @@
+/*
+ * linux/arch/x86/mcount-wrapper.S
+ *
+ * Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %ebp
+ mov %esp, %ebp
What is the benefit of having a call frame in this trampoline? We used
to carry this in the i386 mcount tracer for Adeos/I-pipe too (it was
derived from the -rt code), but I just successfully tested a removal
patch. Also glibc [1] doesn't include it.
OK, I just tried this out on i386, and it works fine.
Post by Steven Rostedt
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ call __mcount
I think this indirection should be avoided, just like the 64-bit version
and glibc do.
I also did this too.
Post by Steven Rostedt
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ popl %ebp
+ ret
I'll go try the updates on x86_64 now.

Thanks for the tips!

-- Steve

Jan Kiszka
2008-01-11 00:10:13 UTC
Permalink
Post by Steven Rostedt
Index: linux-compile-i386.git/Makefile
===================================================================
--- linux-compile-i386.git.orig/Makefile 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/Makefile 2008-01-09 14:10:07.000000000 -0500
@@ -509,6 +509,10 @@ endif
include $(srctree)/arch/$(SRCARCH)/Makefile
+# MCOUNT expects frame pointer
This comment looks stray.
Post by Steven Rostedt
+ifdef CONFIG_MCOUNT
+KBUILD_CFLAGS += -pg
+endif
ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
else
Index: linux-compile-i386.git/arch/x86/Kconfig
===================================================================
--- linux-compile-i386.git.orig/arch/x86/Kconfig 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/Kconfig 2008-01-09 14:10:07.000000000 -0500
@@ -28,6 +28,10 @@ config GENERIC_CMOS_UPDATE
bool
default y
+config ARCH_HAS_MCOUNT
+ bool
+ default y
+
config CLOCKSOURCE_WATCHDOG
bool
default y
Index: linux-compile-i386.git/arch/x86/kernel/Makefile_32
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/Makefile_32 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/Makefile_32 2008-01-09 14:10:07.000000000 -0500
@@ -23,6 +23,7 @@ obj-$(CONFIG_APM) += apm_32.o
obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
obj-$(CONFIG_SMP) += smpcommon_32.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
+obj-$(CONFIG_MCOUNT) += mcount-wrapper.o
So far the code organization is different for 32 and 64 bit. I would
suggest to either

o move both trampolines into entry_*.S or
o put them in something like mcount-wrapper_32/64.S.
Post by Steven Rostedt
obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
Index: linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/arch/x86/kernel/mcount-wrapper.S 2008-01-09 14:10:07.000000000 -0500
@@ -0,0 +1,25 @@
+/*
+ * linux/arch/x86/mcount-wrapper.S
+ *
+ * Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %ebp
+ mov %esp, %ebp
What is the benefit of having a call frame in this trampoline? We used
to carry this in the i386 mcount tracer for Adeos/I-pipe too (it was
derived from the -rt code), but I just successfully tested a removal
patch. Also glibc [1] doesn't include it.
Post by Steven Rostedt
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ call __mcount
I think this indirection should be avoided, just like the 64-bit version
and glibc do.
Post by Steven Rostedt
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ popl %ebp
+ ret
...
Post by Steven Rostedt
Index: linux-compile-i386.git/lib/tracing/mcount.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/mcount.c 2008-01-09 14:10:07.000000000 -0500
@@ -0,0 +1,77 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ *
+ *
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/module.h>
+#include <linux/mcount.h>
+
+/*
+ * Since we have nothing protecting between the test of
+ * mcount_trace_function and the call to it, we can't
+ * set it to NULL without risking a race that will have
+ * the kernel call the NULL pointer. Instead, we just
+ * set the function pointer to a dummy function.
+ */
+notrace void dummy_mcount_tracer(unsigned long ip,
+ unsigned long parent_ip)
+{
+ /* do nothing */
+}
+
+mcount_func_t mcount_trace_function __read_mostly = dummy_mcount_tracer;
+int mcount_enabled __read_mostly;
+
+/** __mcount - hook for profiling
+ *
+ * This routine is called from the arch specific mcount routine, that in turn is
+ * called from code inserted by gcc -pg.
+ */
+notrace void __mcount(void)
+{
+ mcount_trace_function(CALLER_ADDR1, CALLER_ADDR2);
+}
mcount_trace_function should always be called from the assembly
trampoline, IMO.
Post by Steven Rostedt
+EXPORT_SYMBOL_GPL(mcount);
+/*
+ * The above EXPORT_SYMBOL is for the gcc call of mcount and not the
+ * function __mcount that it is underneath. I put the export there
+ * to fool checkpatch.pl. It wants that export to be with the
+ * function, but that function happens to be in assembly.
+ */
+
+/**
+ * register_mcount_function - register a function for profiling
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * with "notrace", otherwise it will go into a
+ * recursive loop.
+ */
+int register_mcount_function(mcount_func_t func)
+{
+ mcount_trace_function = func;
+ return 0;
+}
+
+/**
+ * clear_mcount_function - reset the mcount function
+ *
+ * This NULLs the mcount function and in essence stops
+ * tracing. There may be lag
+ */
+void clear_mcount_function(void)
+{
+ mcount_trace_function = dummy_mcount_tracer;
+}
Index: linux-compile-i386.git/include/linux/mcount.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/include/linux/mcount.h 2008-01-09 15:17:20.000000000 -0500
@@ -0,0 +1,21 @@
+#ifndef _LINUX_MCOUNT_H
+#define _LINUX_MCOUNT_H
+
+#ifdef CONFIG_MCOUNT
+extern int mcount_enabled;
+
+#include <linux/linkage.h>
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
Still used when __mcount would be gone?
Post by Steven Rostedt
+
+typedef void (*mcount_func_t)(unsigned long ip, unsigned long parent_ip);
+
+extern void mcount(void);
+
+int register_mcount_function(mcount_func_t func);
+void clear_mcount_function(void);
+
+#endif /* CONFIG_MCOUNT */
+#endif /* _LINUX_MCOUNT_H */
Index: linux-compile-i386.git/arch/x86/kernel/entry_64.S
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/entry_64.S 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/entry_64.S 2008-01-09 14:10:07.000000000 -0500
@@ -53,6 +53,46 @@
.code64
+#ifdef CONFIG_MCOUNT
+
+ENTRY(mcount)
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %rbp
+ mov %rsp,%rbp
Same as for x86_32.
Post by Steven Rostedt
+
+ push %r11
+ push %r10
glibc [2] doesn't save those two, and we were also happy without them so
far. Or are there nasty corner-cases in the kernel?
Post by Steven Rostedt
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rdx
+ push %rcx
+ push %rax
SAVE_ARGS/RESTORE_ARGS and glibc use explicit rsp manipulation + movq
instead of push/pop. I wonder if there is a small advantage, but I'm not
that deep into this arch.
Post by Steven Rostedt
+
+ mov 0x0(%rbp),%rax
+ mov 0x8(%rbp),%rdi
+ mov 0x8(%rax),%rsi
See [2] for saving one instruction here. :)
Post by Steven Rostedt
+
+ call *mcount_trace_function
+
+ pop %rax
+ pop %rcx
+ pop %rdx
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+
+ pop %rbp
+ ret
+#endif
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
This generic approach is very much appreciated here as well. It would take
away the burden of maintaining the arch-dependent stubs within I-pipe.

What we could contribute later on is a blackfin trampoline; there is
just still a bug in their toolchain that breaks mcount for modules. But
I could check with the bfin guys again about the progress and underline
the importance of this long-pending issue.

Jan

[1]http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/sysdeps/i386/i386-mcount.S?rev=1.6&content-type=text/x-cvsweb-markup&cvsroot=glibc
[2]http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/sysdeps/x86_64/_mcount.S?rev=1.5&content-type=text/x-cvsweb-markup&cvsroot=glibc

Steven Rostedt
2008-01-09 23:40:10 UTC
Permalink
get_monotonic_cycles() needs to produce a monotonic counter as output.

This patch adds a cycle_raw field to produce an accumulating counter.
Unfortunately there is already a cycle_accumulated variable, but that is
used to avoid clocksource overflow and can also be decremented
(probably that name should be changed and we should use it for this
patch).


Signed-off-by: Steven Rostedt <***@goodmis.org>
Acked-by: John Stultz <***@us.ibm.com>

---
include/linux/clocksource.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux-compile-i386.git/include/linux/clocksource.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/clocksource.h 2008-01-09 14:23:29.000000000 -0500
+++ linux-compile-i386.git/include/linux/clocksource.h 2008-01-09 15:17:31.000000000 -0500
@@ -87,7 +87,7 @@ struct clocksource {
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated;
+ cycle_t cycle_last, cycle_accumulated, cycle_raw;
} ____cacheline_aligned_in_smp;

u64 xtime_nsec;
@@ -204,6 +204,7 @@ static inline void clocksource_accumulat
cycle_t offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
cs->cycle_accumulated += offset;
+ cs->cycle_raw += offset;
}

/**
--
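For illustration, a consumer could derive a never-decreasing cycle count
from cycle_raw roughly like this. This is a sketch only, not the actual
get_monotonic_cycles() from the series, and any locking against a
concurrent clocksource_accumulate() is omitted:

#include <linux/clocksource.h>

static cycle_t monotonic_cycles_sketch(struct clocksource *cs)
{
	/*
	 * Cycles already folded into cycle_raw by clocksource_accumulate(),
	 * plus whatever has elapsed since the last accumulation.
	 */
	cycle_t delta = (clocksource_read(cs) - cs->cycle_last) & cs->mask;

	return cs->cycle_raw + delta;
}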
Steven Rostedt
2008-01-09 23:40:11 UTC
Permalink
This adds the task comm and pid to the trace output, giving output
like:

CPU 0: sshd:2605 [<ffffffff80251858>] remove_wait_queue+0xc/0x4a <-- [<ffffffff802ad7be>] free_poll_entry+0x1e/0x2a
CPU 2: bash:2610 [<ffffffff8038c3aa>] tty_check_change+0x9/0xb6 <-- [<ffffffff8038d295>] tty_ioctl+0x59f/0xcdd
CPU 0: sshd:2605 [<ffffffff80491ec6>] _spin_lock_irqsave+0xe/0x81 <-- [<ffffffff80251863>] remove_wait_queue+0x17/0x4a
CPU 2: bash:2610 [<ffffffff8024e2f7>] find_vpid+0x9/0x24 <-- [<ffffffff8038d325>] tty_ioctl+0x62f/0xcdd
CPU 0: sshd:2605 [<ffffffff804923ec>] _spin_unlock_irqrestore+0x9/0x3a <-- [<ffffffff80251891>] remove_wait_queue+0x45/0x4a
CPU 0: sshd:2605 [<ffffffff802a18b3>] fput+0x9/0x1b <-- [<ffffffff802ad7c6>] free_poll_entry+0x26/0x2a


Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 6 +++++-
lib/tracing/tracer.h | 3 +++
2 files changed, 8 insertions(+), 1 deletion(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:12:52.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:39.000000000 -0500
@@ -34,6 +34,7 @@ mctracer_add_trace_entry(struct mctracer
{
unsigned long idx, idx_next;
struct mctracer_entry *entry;
+ struct task_struct *tsk = current;

idx = tr->trace_idx[cpu];
idx_next = idx + 1;
@@ -52,6 +53,8 @@ mctracer_add_trace_entry(struct mctracer
entry->idx = atomic_inc_return(&tr->cnt);
entry->ip = ip;
entry->parent_ip = parent_ip;
+ entry->pid = tsk->pid;
+ memcpy(entry->comm, tsk->comm, TASK_COMM_LEN);
}

static notrace void trace_function(const unsigned long ip,
@@ -217,7 +220,8 @@ static int s_show(struct seq_file *m, vo
if (iter->ent == NULL) {
seq_printf(m, "mctracer:\n");
} else {
- seq_printf(m, " CPU %d: ", iter->cpu);
+ seq_printf(m, "CPU %d: ", iter->cpu);
+ seq_printf(m, "%s:%d ", iter->ent->comm, iter->ent->pid);
seq_print_ip_sym(m, iter->ent->ip);
if (iter->ent->parent_ip) {
seq_printf(m, " <-- ");
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:11:35.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:39.000000000 -0500
@@ -2,11 +2,14 @@
#define _LINUX_MCOUNT_TRACER_H

#include <asm/atomic.h>
+#include <linux/sched.h>

struct mctracer_entry {
unsigned long idx;
unsigned long ip;
unsigned long parent_ip;
+ char comm[TASK_COMM_LEN];
+ pid_t pid;
};

struct mctracer_trace {
--
Steven Rostedt
2008-01-09 23:40:12 UTC
Permalink
The latency tracer can call clocksource_read very early in bootup,
before the clock source variable has been initialized. This results in a
crash at boot up (even before earlyprintk is initialized), since the
clock->read function pointer is NULL.

This patch simply initializes the clock to use clocksource_jiffies, so
that any early user of clocksource_read will not crash.

Signed-off-by: Steven Rostedt <***@goodmis.org>
Acked-by: John Stultz <***@us.ibm.com>
---
include/linux/clocksource.h | 3 +++
kernel/time/timekeeping.c | 9 +++++++--
2 files changed, 10 insertions(+), 2 deletions(-)

Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:17:53.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:30.000000000 -0500
@@ -53,8 +53,13 @@ static inline void update_xtime_cache(u6
timespec_add_ns(&xtime_cache, nsec);
}

-static struct clocksource *clock; /* pointer to current clocksource */
-
+/*
+ * pointer to current clocksource
+ * Just in case we use clocksource_read before we initialize
+ * the actual clock source. Instead of calling a NULL read pointer
+ * we return jiffies.
+ */
+static struct clocksource *clock = &clocksource_jiffies;

#ifdef CONFIG_GENERIC_TIME
/**
Index: linux-compile-i386.git/include/linux/clocksource.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/clocksource.h 2008-01-09 14:23:42.000000000 -0500
+++ linux-compile-i386.git/include/linux/clocksource.h 2008-01-09 15:17:30.000000000 -0500
@@ -274,6 +274,9 @@ extern struct clocksource* clocksource_g
extern void clocksource_change_rating(struct clocksource *cs, int rating);
extern void clocksource_resume(void);

+/* used to initialize clock */
+extern struct clocksource clocksource_jiffies;
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL
extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
extern void update_vsyscall_tz(void);
--
Steven Rostedt
2008-01-09 23:40:14 UTC
Permalink
This patch resets the trace when it is started by the user.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:14:02.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:36.000000000 -0500
@@ -78,6 +78,16 @@ static notrace void trace_function(const
raw_local_irq_restore(flags);
}

+static notrace void mctracer_reset(struct mctracer_trace *tr)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ tr->trace_idx[cpu] = 0;
+ atomic_set(&tr->underrun[cpu], 0);
+ }
+}
+
#ifdef CONFIG_DEBUG_FS
enum trace_iterator {
TRACE_ITER_SYM_ONLY = 1,
@@ -324,6 +334,10 @@ static ssize_t mctracer_ctrl_write(struc

val = !!simple_strtoul(buf, NULL, 10);

+ /* When starting a new trace, reset the buffers */
+ if (val)
+ mctracer_reset(tr);
+
if (tr->ctrl ^ val) {
if (val)
register_mcount_function(trace_function);
--
Steven Rostedt
2008-01-09 23:40:14 UTC
Permalink
Several different types of tracing need to use the
same core functions. This patch separates the core
functions from the more specific ones to allow for
future tracing methods.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/Kconfig | 6
lib/tracing/Makefile | 3
lib/tracing/trace_function.c | 211 ++++++++++++++++++
lib/tracing/tracer.c | 457 ++++++++++++++---------------------------
lib/tracing/tracer.h | 55 ++++
lib/tracing/tracer_interface.h | 14 -
6 files changed, 430 insertions(+), 316 deletions(-)

Index: linux-compile-i386.git/lib/tracing/trace_function.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/trace_function.c 2008-01-09 15:17:20.000000000 -0500
@@ -0,0 +1,211 @@
+/*
+ * ring buffer based mcount tracer
+ *
+ * Copyright (C) 2007 Steven Rostedt <***@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/mcount.h>
+
+#include "tracer.h"
+
+static struct tracing_trace function_trace;
+static DEFINE_PER_CPU(struct tracing_trace_cpu, function_trace_cpu);
+
+static notrace void function_trace_reset(struct tracing_trace *tr)
+{
+ int cpu;
+
+ tr->time_start = now();
+ tr->saved_latency = 0;
+ tr->critical_start = 0;
+ tr->critical_end = 0;
+
+ for_each_online_cpu(cpu) {
+ tr->data[cpu]->trace_idx = 0;
+ atomic_set(&tr->data[cpu]->underrun, 0);
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+static ssize_t function_trace_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct tracing_trace *tr = filp->private_data;
+ char buf[16];
+ int r;
+
+ r = sprintf(buf, "%ld\n", tr->ctrl);
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, r);
+}
+
+static void notrace function_trace_call(unsigned long ip,
+ unsigned long parent_ip)
+{
+ struct tracing_trace *tr = &function_trace;
+
+ tracing_function_trace(tr, ip, parent_ip);
+}
+
+static ssize_t function_trace_ctrl_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct tracing_trace *tr = filp->private_data;
+ long val;
+ char buf[16];
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ val = !!simple_strtoul(buf, NULL, 10);
+
+ /* When starting a new trace, reset the buffers */
+ if (val)
+ function_trace_reset(tr);
+ else {
+ /* pretty meaningless for now */
+ tr->time_end = now();
+ tr->saved_latency = tr->time_end - tr->time_start;
+ memcpy(tr->comm, current->comm, TASK_COMM_LEN);
+ tr->pid = current->pid;
+ tr->uid = current->uid;
+ tr->nice = current->static_prio - 20 - MAX_RT_PRIO;
+ tr->policy = current->policy;
+ tr->rt_priority = current->rt_priority;
+ }
+
+ if (tr->ctrl ^ val) {
+ if (val)
+ register_mcount_function(function_trace_call);
+ else
+ clear_mcount_function();
+ tr->ctrl = val;
+ }
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static struct file_operations function_trace_ctrl_fops = {
+ .open = tracing_open_generic,
+ .read = function_trace_ctrl_read,
+ .write = function_trace_ctrl_write,
+};
+
+static __init void function_trace_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("fn_trace_ctrl", 0644, d_tracer,
+ &function_trace, &function_trace_ctrl_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'ctrl' entry\n");
+
+ entry = debugfs_create_file("function_trace", 0444, d_tracer,
+ &function_trace, &tracing_lt_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'function_trace' entry\n");
+
+ entry = debugfs_create_file("trace", 0444, d_tracer,
+ &function_trace, &tracing_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'trace' entry\n");
+
+}
+
+#else
+static __init void function_trace_init_debugfs(void)
+{
+ /*
+ * No way to turn on or off the trace function
+ * without debugfs, so we just turn it on.
+ */
+ register_mcount_function(trace_function);
+}
+#endif
+
+static void function_trace_open(struct tracing_iterator *iter)
+{
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl)
+ clear_mcount_function();
+}
+
+static void function_trace_close(struct tracing_iterator *iter)
+{
+ if (iter->tr->ctrl)
+ register_mcount_function(function_trace_call);
+}
+
+static notrace int page_order(const unsigned long size)
+{
+ const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+ return ilog2(roundup_pow_of_two(nr_pages));
+}
+
+__init static int function_trace_alloc_buffers(void)
+{
+ const int order = page_order(TRACING_NR_ENTRIES * TRACING_ENTRY_SIZE);
+ const unsigned long size = (1UL << order) << PAGE_SHIFT;
+ struct tracing_entry *array;
+ int i;
+
+ for_each_possible_cpu(i) {
+ function_trace.data[i] = &per_cpu(function_trace_cpu, i);
+ array = (struct tracing_entry *)
+ __get_free_pages(GFP_KERNEL, order);
+ if (array == NULL) {
+ printk(KERN_ERR "function tracer: failed to allocate"
+ " %ld bytes for trace buffer!\n", size);
+ goto free_buffers;
+ }
+ function_trace.data[i]->trace = array;
+ }
+
+ /*
+ * Since we allocate by orders of pages, we may be able to
+ * round up a bit.
+ */
+ function_trace.entries = size / TRACING_ENTRY_SIZE;
+
+ pr_info("function tracer: %ld bytes allocated for %ld",
+ size, TRACING_NR_ENTRIES);
+ pr_info(" entries of %d bytes\n", TRACING_ENTRY_SIZE);
+ pr_info(" actual entries %ld\n", function_trace.entries);
+
+ function_trace_init_debugfs();
+
+ function_trace.open = function_trace_open;
+ function_trace.close = function_trace_close;
+
+ return 0;
+
+ free_buffers:
+ for (i-- ; i >= 0; i--) {
+ if (function_trace.data[i] && function_trace.data[i]->trace) {
+ free_pages((unsigned long)function_trace.data[i]->trace,
+ order);
+ function_trace.data[i]->trace = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+device_initcall(function_trace_alloc_buffers);
Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:49:52.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:20.000000000 -0500
@@ -19,22 +19,21 @@
#include <linux/percpu.h>
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
-#include <linux/clocksource.h>
#include <linux/utsrelease.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/mcount.h>

#include "tracer.h"
-#include "tracer_interface.h"

-static inline notrace cycle_t now(void)
+enum trace_type
{
- return get_monotonic_cycles();
-}
+ __TRACE_FIRST_TYPE = 0,
+
+ TRACE_FN,

-static struct mctracer_trace mctracer_trace;
-static DEFINE_PER_CPU(struct mctracer_trace_cpu, mctracer_trace_cpu);
+ __TRACE_LAST_TYPE
+};

enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@@ -45,18 +44,12 @@ enum trace_flag_type {
TRACE_FLAG_IRQS_HARD_OFF = 0x20,
};

-static inline notrace void
-mctracer_add_trace_entry(struct mctracer_trace *tr,
- int cpu,
- const unsigned long ip,
- const unsigned long parent_ip,
- unsigned long flags)
+static inline notrace struct tracing_entry *
+tracing_get_trace_entry(struct tracing_trace *tr,
+ struct tracing_trace_cpu *data)
{
unsigned long idx, idx_next;
- struct mctracer_entry *entry;
- struct task_struct *tsk = current;
- struct mctracer_trace_cpu *data = tr->data[cpu];
- unsigned long pc;
+ struct tracing_entry *entry;

idx = data->trace_idx;
idx_next = idx + 1;
@@ -71,12 +64,21 @@ mctracer_add_trace_entry(struct mctracer
if (unlikely(idx_next != 0 && atomic_read(&data->underrun)))
atomic_inc(&data->underrun);

+ entry = data->trace + idx * TRACING_ENTRY_SIZE;
+
+ return entry;
+}
+
+static inline notrace void
+tracing_generic_entry_update(struct tracing_entry *entry,
+ unsigned long flags)
+{
+ struct task_struct *tsk = current;
+ unsigned long pc;
+
pc = preempt_count();

- entry = data->trace + idx * MCTRACER_ENTRY_SIZE;
entry->preempt_count = pc & 0xff;
- entry->ip = ip;
- entry->parent_ip = parent_ip;
entry->pid = tsk->pid;
entry->t = now();
entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -86,42 +88,33 @@ mctracer_add_trace_entry(struct mctracer
memcpy(entry->comm, tsk->comm, TASK_COMM_LEN);
}

-static notrace void trace_function(const unsigned long ip,
- const unsigned long parent_ip)
+notrace void tracing_function_trace(struct tracing_trace *tr,
+ unsigned long ip,
+ unsigned long parent_ip)
{
unsigned long flags;
- struct mctracer_trace *tr;
int cpu;

raw_local_irq_save(flags);
cpu = raw_smp_processor_id();

- tr = &mctracer_trace;
-
atomic_inc(&tr->data[cpu]->disabled);
- if (likely(atomic_read(&tr->data[cpu]->disabled) == 1))
- mctracer_add_trace_entry(tr, cpu, ip, parent_ip, flags);
+ if (likely(atomic_read(&tr->data[cpu]->disabled) == 1)) {
+ struct tracing_entry *entry;
+ struct tracing_trace_cpu *data = tr->data[cpu];
+
+ entry = tracing_get_trace_entry(tr, data);
+ tracing_generic_entry_update(entry, flags);
+ entry->type = TRACE_FN;
+ entry->fn.ip = ip;
+ entry->fn.parent_ip = parent_ip;
+ }

atomic_dec(&tr->data[cpu]->disabled);

raw_local_irq_restore(flags);
}

-static notrace void mctracer_reset(struct mctracer_trace *tr)
-{
- int cpu;
-
- tr->time_start = now();
- tr->saved_latency = 0;
- tr->critical_start = 0;
- tr->critical_end = 0;
-
- for_each_online_cpu(cpu) {
- tr->data[cpu]->trace_idx = 0;
- atomic_set(&tr->data[cpu]->underrun, 0);
- }
-}
-
#ifdef CONFIG_DEBUG_FS
enum trace_iterator {
TRACE_ITER_SYM_ONLY = 1,
@@ -135,25 +128,17 @@ static const char *trace_options[] = {
NULL
};

+static unsigned trace_flags;
+
enum trace_file_type {
TRACE_FILE_LAT_FMT = 1,
};

-struct mctracer_iterator {
- struct mctracer_trace *tr;
- struct mctracer_entry *ent;
- unsigned long iter_flags;
- loff_t pos;
- unsigned long next_idx[NR_CPUS];
- int cpu;
- int idx;
-};
-
-static struct mctracer_entry *mctracer_entry_idx(struct mctracer_trace *tr,
- unsigned long idx,
- int cpu)
+static struct tracing_entry *tracing_entry_idx(struct tracing_trace *tr,
+ unsigned long idx,
+ int cpu)
{
- struct mctracer_entry *array = tr->data[cpu]->trace;
+ struct tracing_entry *array = tr->data[cpu]->trace;
unsigned long underrun;

if (idx >= tr->entries)
@@ -168,18 +153,18 @@ static struct mctracer_entry *mctracer_e
return &array[idx];
}

-static struct notrace mctracer_entry *
-find_next_entry(struct mctracer_iterator *iter, int *ent_cpu)
+static struct notrace tracing_entry *
+find_next_entry(struct tracing_iterator *iter, int *ent_cpu)
{
- struct mctracer_trace *tr = iter->tr;
- struct mctracer_entry *ent, *next = NULL;
+ struct tracing_trace *tr = iter->tr;
+ struct tracing_entry *ent, *next = NULL;
int next_cpu = -1;
int cpu;

for_each_possible_cpu(cpu) {
if (!tr->data[cpu]->trace)
continue;
- ent = mctracer_entry_idx(tr, iter->next_idx[cpu], cpu);
+ ent = tracing_entry_idx(tr, iter->next_idx[cpu], cpu);
if (ent && (!next || next->t > ent->t)) {
next = ent;
next_cpu = cpu;
@@ -192,9 +177,9 @@ find_next_entry(struct mctracer_iterator
return next;
}

-static void *find_next_entry_inc(struct mctracer_iterator *iter)
+static void *find_next_entry_inc(struct tracing_iterator *iter)
{
- struct mctracer_entry *next;
+ struct tracing_entry *next;
int next_cpu = -1;

next = find_next_entry(iter, &next_cpu);
@@ -212,7 +197,7 @@ static void *find_next_entry_inc(struct
static void notrace *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct mctracer_iterator *iter = m->private;
+ struct tracing_iterator *iter = m->private;
void *ent;
void *last_ent = iter->ent;
int i = (int)*pos;
@@ -241,15 +226,11 @@ s_next(struct seq_file *m, void *v, loff

static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct mctracer_iterator *iter = m->private;
+ struct tracing_iterator *iter = m->private;
void *p = NULL;
loff_t l = 0;
int i;

- /* stop the trace while dumping */
- if (iter->tr->ctrl)
- clear_mcount_function();
-
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -271,9 +252,6 @@ static void *s_start(struct seq_file *m,

static void s_stop(struct seq_file *m, void *p)
{
- struct mctracer_iterator *iter = m->private;
- if (iter->tr->ctrl)
- register_mcount_function(trace_function);
}

#ifdef CONFIG_KALLSYMS
@@ -322,13 +300,13 @@ static void notrace print_help_header(st
}

static void notrace print_trace_header(struct seq_file *m,
- struct mctracer_iterator *iter)
+ struct tracing_iterator *iter)
{
- struct mctracer_trace *tr = iter->tr;
+ struct tracing_trace *tr = iter->tr;
unsigned long underruns = 0;
unsigned long underrun;
unsigned long entries = 0;
- int sym_only = !!(tr->iter_flags & TRACE_ITER_SYM_ONLY);
+ int sym_only = !!(trace_flags & TRACE_ITER_SYM_ONLY);
int cpu;

for_each_possible_cpu(cpu) {
@@ -388,7 +366,7 @@ static void notrace print_trace_header(s


static void notrace
-lat_print_generic(struct seq_file *m, struct mctracer_entry *entry, int cpu)
+lat_print_generic(struct seq_file *m, struct tracing_entry *entry, int cpu)
{
int hardirq, softirq;

@@ -415,7 +393,7 @@ lat_print_generic(struct seq_file *m, st
}

if (entry->preempt_count)
- seq_printf(m, "%lx", entry->preempt_count);
+ seq_printf(m, "%x", entry->preempt_count);
else
seq_puts(m, ".");
}
@@ -436,15 +414,15 @@ lat_print_timestamp(struct seq_file *m,
}

static void notrace
-print_lat_fmt(struct seq_file *m, struct mctracer_iterator *iter,
+print_lat_fmt(struct seq_file *m, struct tracing_iterator *iter,
unsigned int trace_idx, int cpu)
{
- struct mctracer_entry *entry = iter->ent;
- struct mctracer_entry *next_entry = find_next_entry(iter, NULL);
+ struct tracing_entry *entry = iter->ent;
+ struct tracing_entry *next_entry = find_next_entry(iter, NULL);
unsigned long abs_usecs;
unsigned long rel_usecs;
- int sym_only = !!(iter->tr->iter_flags & TRACE_ITER_SYM_ONLY);
- int verbose = !!(iter->tr->iter_flags & TRACE_ITER_VERBOSE);
+ int sym_only = !!(trace_flags & TRACE_ITER_SYM_ONLY);
+ int verbose = !!(trace_flags & TRACE_ITER_VERBOSE);

if (!next_entry)
next_entry = entry;
@@ -452,7 +430,7 @@ print_lat_fmt(struct seq_file *m, struct
abs_usecs = cycles_to_usecs(entry->t - iter->tr->time_start);

if (verbose) {
- seq_printf(m, "%16s %5d %d %ld %08lx %08x [%08lx]"
+ seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]"
" %ld.%03ldms (+%ld.%03ldms): ",
entry->comm,
entry->pid, cpu, entry->flags,
@@ -464,18 +442,22 @@ print_lat_fmt(struct seq_file *m, struct
lat_print_generic(m, entry, cpu);
lat_print_timestamp(m, abs_usecs, rel_usecs);
}
- seq_print_ip_sym(m, entry->ip, sym_only);
- seq_puts(m, " (");
- seq_print_ip_sym(m, entry->parent_ip, sym_only);
- seq_puts(m, ")\n");
+ switch (entry->type) {
+ case TRACE_FN:
+ seq_print_ip_sym(m, entry->fn.ip, sym_only);
+ seq_puts(m, " (");
+ seq_print_ip_sym(m, entry->fn.parent_ip, sym_only);
+ seq_puts(m, ")\n");
+ break;
+ }
}

static void notrace print_trace_fmt(struct seq_file *m,
- struct mctracer_iterator *iter)
+ struct tracing_iterator *iter)
{
unsigned long usec_rem;
unsigned long secs;
- int sym_only = !!(iter->tr->iter_flags & TRACE_ITER_SYM_ONLY);
+ int sym_only = !!(trace_flags & TRACE_ITER_SYM_ONLY);
unsigned long long t;

t = cycles_to_usecs(iter->ent->t);
@@ -486,18 +468,22 @@ static void notrace print_trace_fmt(stru
seq_printf(m, "CPU %d: ", iter->cpu);
seq_printf(m, "%s:%d ", iter->ent->comm,
iter->ent->pid);
- seq_print_ip_sym(m, iter->ent->ip, sym_only);
- if (iter->ent->parent_ip) {
- seq_printf(m, " <-- ");
- seq_print_ip_sym(m, iter->ent->parent_ip,
- sym_only);
+ switch (iter->ent->type) {
+ case TRACE_FN:
+ seq_print_ip_sym(m, iter->ent->fn.ip, sym_only);
+ if (iter->ent->fn.parent_ip) {
+ seq_printf(m, " <-- ");
+ seq_print_ip_sym(m, iter->ent->fn.parent_ip,
+ sym_only);
+ }
+ break;
}
seq_printf(m, "\n");
}

-static int trace_empty(struct mctracer_iterator *iter)
+static int trace_empty(struct tracing_iterator *iter)
{
- struct mctracer_trace_cpu *data;
+ struct tracing_trace_cpu *data;
int cpu;

for_each_possible_cpu(cpu) {
@@ -513,7 +499,7 @@ static int trace_empty(struct mctracer_i

static int s_show(struct seq_file *m, void *v)
{
- struct mctracer_iterator *iter = v;
+ struct tracing_iterator *iter = v;

if (iter->ent == NULL) {
if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
@@ -521,10 +507,10 @@ static int s_show(struct seq_file *m, vo
if (trace_empty(iter))
return 0;
print_trace_header(m, iter);
- if (!(iter->tr->iter_flags & TRACE_ITER_VERBOSE))
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
print_help_header(m);
} else
- seq_printf(m, "mctracer:\n");
+ seq_printf(m, "tracer:\n");
} else {
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
print_lat_fmt(m, iter, iter->idx, iter->cpu);
@@ -535,17 +521,17 @@ static int s_show(struct seq_file *m, vo
return 0;
}

-static struct seq_operations mctrace_seq_ops = {
+static struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};

-static struct mctracer_iterator *
-__mctrace_open(struct inode *inode, struct file *file, int *ret)
+static struct tracing_iterator notrace *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
{
- struct mctracer_iterator *iter;
+ struct tracing_iterator *iter;

iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
@@ -553,14 +539,21 @@ __mctrace_open(struct inode *inode, stru
goto out;
}

- iter->tr = &mctracer_trace;
+ iter->tr = inode->i_private;
iter->pos = -1;

/* TODO stop tracer */
- *ret = seq_open(file, &mctrace_seq_ops);
+ *ret = seq_open(file, &tracer_seq_ops);
if (!*ret) {
struct seq_file *m = file->private_data;
m->private = iter;
+
+ /*
+ * Most tracers want to disable the
+ * trace while printing a trace.
+ */
+ if (iter->tr->open)
+ iter->tr->open(iter);
} else {
kfree(iter);
iter = NULL;
@@ -570,21 +563,40 @@ __mctrace_open(struct inode *inode, stru
return iter;
}

-static int mctrace_open(struct inode *inode, struct file *file)
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+int tracing_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ struct tracing_iterator *iter = m->private;
+
+ if (iter->tr->close)
+ iter->tr->close(iter);
+
+ seq_release(inode, file);
+ kfree(iter);
+ return 0;
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
{
int ret;

- __mctrace_open(inode, file, &ret);
+ __tracing_open(inode, file, &ret);

return ret;
}

-static int mctrace_lt_open(struct inode *inode, struct file *file)
+static int tracing_lt_open(struct inode *inode, struct file *file)
{
- struct mctracer_iterator *iter;
+ struct tracing_iterator *iter;
int ret;

- iter = __mctrace_open(inode, file, &ret);
+ iter = __tracing_open(inode, file, &ret);

if (!ret)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
@@ -592,105 +604,23 @@ static int mctrace_lt_open(struct inode
return ret;
}

-int mctrace_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- struct mctracer_iterator *iter = m->private;
-
- seq_release(inode, file);
- kfree(iter);
- return 0;
-}
-
-static struct file_operations mctrace_fops = {
- .open = mctrace_open,
+struct file_operations tracing_fops = {
+ .open = tracing_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = mctrace_release,
+ .release = tracing_release,
};

-static struct file_operations mctrace_lt_fops = {
- .open = mctrace_lt_open,
+struct file_operations tracing_lt_fops = {
+ .open = tracing_lt_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = mctrace_release,
+ .release = tracing_release,
};

-static int mctracer_open_generic(struct inode *inode, struct file *filp)
+static ssize_t tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- filp->private_data = inode->i_private;
- return 0;
-}
-
-
-static ssize_t mctracer_ctrl_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct mctracer_trace *tr = filp->private_data;
- char buf[16];
- int r;
-
- r = sprintf(buf, "%ld\n", tr->ctrl);
- return simple_read_from_buffer(ubuf, cnt, ppos,
- buf, r);
-}
-
-static ssize_t mctracer_ctrl_write(struct file *filp,
- const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct mctracer_trace *tr = filp->private_data;
- long val;
- char buf[16];
-
- if (cnt > 15)
- cnt = 15;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- val = !!simple_strtoul(buf, NULL, 10);
-
- /* When starting a new trace, reset the buffers */
- if (val)
- mctracer_reset(tr);
- else {
- /* pretty meaningless for now */
- tr->time_end = now();
- tr->saved_latency = tr->time_end - tr->time_start;
- memcpy(tr->comm, current->comm, TASK_COMM_LEN);
- tr->pid = current->pid;
- tr->uid = current->uid;
- tr->nice = current->static_prio - 20 - MAX_RT_PRIO;
- tr->policy = current->policy;
- tr->rt_priority = current->rt_priority;
- }
-
- if (tr->ctrl ^ val) {
- if (val)
- register_mcount_function(trace_function);
- else
- clear_mcount_function();
- tr->ctrl = val;
- }
-
- filp->f_pos += cnt;
-
- return cnt;
-}
-
-static struct file_operations mctracer_ctrl_fops = {
- .open = mctracer_open_generic,
- .read = mctracer_ctrl_read,
- .write = mctracer_ctrl_write,
-};
-
-static ssize_t mctracer_iter_ctrl_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct mctracer_trace *tr = filp->private_data;
char *buf;
int r = 0;
int i;
@@ -708,7 +638,7 @@ static ssize_t mctracer_iter_ctrl_read(s
return -ENOMEM;

for (i = 0; trace_options[i]; i++) {
- if (tr->iter_flags & (1 << i))
+ if (trace_flags & (1 << i))
r += sprintf(buf + r, "%s ", trace_options[i]);
else
r += sprintf(buf + r, "no%s ", trace_options[i]);
@@ -725,11 +655,10 @@ static ssize_t mctracer_iter_ctrl_read(s
return r;
}

-static ssize_t mctracer_iter_ctrl_write(struct file *filp,
- const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static ssize_t tracing_iter_ctrl_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- struct mctracer_trace *tr = filp->private_data;
char buf[64];
char *cmp = buf;
int neg = 0;
@@ -753,9 +682,9 @@ static ssize_t mctracer_iter_ctrl_write(

if (strncmp(cmp, trace_options[i], len) == 0) {
if (neg)
- tr->iter_flags &= ~(1 << i);
+ trace_flags &= ~(1 << i);
else
- tr->iter_flags |= (1 << i);
+ trace_flags |= (1 << i);
break;
}
}
@@ -765,103 +694,49 @@ static ssize_t mctracer_iter_ctrl_write(
return cnt;
}

-static struct file_operations mctracer_iter_fops = {
- .open = mctracer_open_generic,
- .read = mctracer_iter_ctrl_read,
- .write = mctracer_iter_ctrl_write,
+static struct file_operations tracing_iter_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_iter_ctrl_read,
+ .write = tracing_iter_ctrl_write,
};

-static void mctrace_init_debugfs(void)
-{
- struct dentry *d_mctracer;
- struct dentry *entry;
+static struct dentry *d_tracer;

- d_mctracer = debugfs_create_dir("tracing", NULL);
- if (!d_mctracer) {
- pr_warning("Could not create debugfs directory mctracer\n");
- return;
- }
-
- entry = debugfs_create_file("ctrl", 0644, d_mctracer,
- &mctracer_trace, &mctracer_ctrl_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'ctrl' entry\n");
+struct dentry *tracing_init_dentry(void)
+{
+ static int once;

- entry = debugfs_create_file("iter_ctrl", 0644, d_mctracer,
- &mctracer_trace, &mctracer_iter_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+ if (d_tracer)
+ return d_tracer;

- entry = debugfs_create_file("function_trace", 0444, d_mctracer,
- &mctracer_trace, &mctrace_lt_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'function_trace' entry\n");
+ d_tracer = debugfs_create_dir("tracing", NULL);

- entry = debugfs_create_file("trace", 0444, d_mctracer,
- &mctracer_trace, &mctrace_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace' entry\n");
+ if (!d_tracer && !once) {
+ once = 1;
+ pr_warning("Could not create debugfs directory 'tracing'\n");
+ return NULL;
+ }

+ return d_tracer;
}
-#else /* CONFIG_DEBUG_FS */
-static void mctrace_init_debugfs(void)
-{
- /*
- * No way to turn on or off the trace function
- * without debugfs, so we just turn it on.
- */
- register_mcount_function(trace_function);
-}
-#endif /* CONFIG_DEBUG_FS */

-static notrace int page_order(const unsigned long size)
+static __init int trace_init_debugfs(void)
{
- const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
- return ilog2(roundup_pow_of_two(nr_pages));
-}
-
-static notrace int mctracer_alloc_buffers(void)
-{
- const int order = page_order(MCTRACER_NR_ENTRIES * MCTRACER_ENTRY_SIZE);
- const unsigned long size = (1UL << order) << PAGE_SHIFT;
- struct mctracer_entry *array;
- int i;
-
- for_each_possible_cpu(i) {
- mctracer_trace.data[i] = &per_cpu(mctracer_trace_cpu, i);
- array = (struct mctracer_entry *)
- __get_free_pages(GFP_KERNEL, order);
- if (array == NULL) {
- printk(KERN_ERR "mctracer: failed to allocate"
- " %ld bytes for trace buffer!\n", size);
- goto free_buffers;
- }
- mctracer_trace.data[i]->trace = array;
- }
-
- /*
- * Since we allocate by orders of pages, we may be able to
- * round up a bit.
- */
- mctracer_trace.entries = size / MCTRACER_ENTRY_SIZE;
+ struct dentry *d_tracer;
+ struct dentry *entry;

- pr_info("mctracer: %ld bytes allocated for %ld entries of %d bytes\n",
- size, MCTRACER_NR_ENTRIES, MCTRACER_ENTRY_SIZE);
- pr_info(" actual entries %ld\n", mctracer_trace.entries);
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;

- mctrace_init_debugfs();
+ entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+ NULL, &tracing_iter_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'iter_ctrl' entry\n");

return 0;
-
- free_buffers:
- for (i-- ; i >= 0; i--) {
- if (mctracer_trace.data[i] && mctracer_trace.data[i]->trace) {
- free_pages((unsigned long)mctracer_trace.data[i]->trace,
- order);
- mctracer_trace.data[i]->trace = NULL;
- }
- }
- return -ENOMEM;
}

-device_initcall(mctracer_alloc_buffers);
+device_initcall(trace_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:49:52.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:20.000000000 -0500
@@ -3,28 +3,36 @@

#include <asm/atomic.h>
#include <linux/sched.h>
+#include <linux/clocksource.h>

-struct mctracer_entry {
- unsigned long long t;
+struct tracing_function {
unsigned long ip;
unsigned long parent_ip;
- unsigned long preempt_count;
- unsigned long flags;
+};
+
+struct tracing_entry {
+ char type;
+ char cpu; /* who will want to trace more than 256 CPUS? */
+ char flags;
+ char preempt_count; /* assumes PREEMPT_MASK is 8 bits or less */
+ int pid;
+ cycle_t t;
char comm[TASK_COMM_LEN];
- pid_t pid;
+ struct tracing_function fn;
};

-struct mctracer_trace_cpu {
+struct tracing_trace_cpu {
void *trace;
unsigned long trace_idx;
atomic_t disabled;
atomic_t underrun;
};

-struct mctracer_trace {
+struct tracing_iterator;
+
+struct tracing_trace {
unsigned long entries;
long ctrl;
- unsigned long iter_flags;
char comm[TASK_COMM_LEN];
pid_t pid;
uid_t uid;
@@ -36,7 +44,36 @@ struct mctracer_trace {
unsigned long critical_end;
unsigned long long time_start;
unsigned long long time_end;
- struct mctracer_trace_cpu *data[NR_CPUS];
+ void (*open)(struct tracing_iterator *iter);
+ void (*close)(struct tracing_iterator *iter);
+ struct tracing_trace_cpu *data[NR_CPUS];
};

+struct tracing_iterator {
+ struct tracing_trace *tr;
+ struct tracing_entry *ent;
+ unsigned long iter_flags;
+ loff_t pos;
+ unsigned long next_idx[NR_CPUS];
+ int cpu;
+ int idx;
+};
+
+#define TRACING_ENTRY_SIZE sizeof(struct tracing_entry)
+#define TRACING_NR_ENTRIES (65536UL)
+
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void tracing_function_trace(struct tracing_trace *tr,
+ unsigned long ip,
+ unsigned long parent_ip);
+
+extern struct file_operations tracing_fops;
+extern struct file_operations tracing_lt_fops;
+
+static inline notrace cycle_t now(void)
+{
+ return get_monotonic_cycles();
+}
+
#endif /* _LINUX_MCOUNT_TRACER_H */
Index: linux-compile-i386.git/lib/tracing/Kconfig
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/Kconfig 2008-01-09 14:49:52.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/Kconfig 2008-01-09 15:17:20.000000000 -0500
@@ -6,12 +6,16 @@ config MCOUNT
depends on DEBUG_KERNEL
select FRAME_POINTER

+config TRACING
+ bool
+ depends on DEBUG_KERNEL

-config MCOUNT_TRACER
+config FUNCTION_TRACER
bool "Profiler instrumentation based tracer"
depends on DEBUG_KERNEL && ARCH_HAS_MCOUNT
default n
select MCOUNT
+ select TRACING
help
Use profiler instrumentation, adding -pg to CFLAGS. This will
insert a call to an architecture specific __mcount routine,
Index: linux-compile-i386.git/lib/tracing/Makefile
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/Makefile 2008-01-09 14:49:52.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/Makefile 2008-01-09 15:17:20.000000000 -0500
@@ -1,5 +1,6 @@
obj-$(CONFIG_MCOUNT) += libmcount.o

-obj-$(CONFIG_MCOUNT_TRACER) += tracer.o
+obj-$(CONFIG_TRACING) += tracer.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace_function.o

libmcount-y := mcount.o
Index: linux-compile-i386.git/lib/tracing/tracer_interface.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer_interface.h 2008-01-09 14:49:52.000000000 -0500
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,14 +0,0 @@
-#ifndef _LINUX_MCTRACER_INTERFACE_H
-#define _LINUX_MCTRACER_INTERFACE_H
-
-#include "tracer.h"
-
-/*
- * Will be at least sizeof(struct mctracer_entry), but callers can request more
- * space for private stuff, such as a timestamp, preempt_count, etc.
- */
-#define MCTRACER_ENTRY_SIZE sizeof(struct mctracer_entry)
-
-#define MCTRACER_NR_ENTRIES (65536UL)
-
-#endif /* _LINUX_MCTRACER_INTERFACE_H */
Steven Rostedt
2008-01-09 23:40:15 UTC
This patch adds a latency_trace file with the format used by RT, for
which others have created tools to dissect the output. This file adds
some useful recording for tracing, but still does not add actual
latency tracing.

Format like:

preemption latency trace v1.1.5 on 2.6.24-rc7-tst
--------------------------------------------------------------------
latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
-----------------
| task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
-----------------

_------=> CPU#
/ _-----=> irqs-off
| / _----=> need-resched
|| / _---=> hardirq/softirq
||| / _--=> preempt-depth
|||| /
||||| delay
cmd pid ||||| time | caller
\ / ||||| \ | /
swapper-0 0d.h. 1595128us+: set_normalized_timespec+0x8/0x2d <c043841d> (ktime_get_ts+0x4a/0x4e <c04499d4>)
swapper-0 0d.h. 1595131us+: _spin_lock+0x8/0x18 <c0630690> (hrtimer_interrupt+0x6e/0x1b0 <c0449c56>)

Or with verbose turned on:

preemption latency trace v1.1.5 on 2.6.24-rc7-tst
--------------------------------------------------------------------
latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
-----------------
| task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
-----------------

swapper 0 0 9 00000000 00000000 [f3675f41] 1595.128ms (+0.003ms): set_normalized_timespec+0x8/0x2d <c043841d> (ktime_get_ts+0x4a/0x4e <c04499d4>)
swapper 0 0 9 00000000 00000001 [f3675f45] 1595.131ms (+0.003ms): _spin_lock+0x8/0x18 <c0630690> (hrtimer_interrupt+0x6e/0x1b0 <c0449c56>)
swapper 0 0 9 00000000 00000002 [f3675f48] 1595.135ms (+0.003ms): _spin_lock+0x8/0x18 <c0630690> (hrtimer_interrupt+0x6e/0x1b0 <c0449c56>)
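
As a rough illustration of how such tools can dissect a line, here is a
hypothetical userspace sketch that parses one non-verbose line of the
format shown above. The struct and function names are made up, and the
exact field layout is an assumption taken from the example output, so
treat it as a sketch rather than a reference parser:

#include <stdio.h>

struct lat_line {
        char            comm[17];
        int             pid;
        int             cpu;
        char            irqs_off;       /* 'd'/'D' or '.' */
        char            resched;        /* 'N' or '.' */
        char            irq_ctx;        /* 'h', 's', 'H' or '.' */
        char            preempt;        /* preempt depth or '.' */
        unsigned long   usecs;          /* absolute time in usecs */
        char            caller[256];
};

static int parse_lat_line(const char *line, struct lat_line *r)
{
        /*
         * Layout (per the header legend above):
         *   <comm>-<pid> <cpu><irqs-off><need-resched><irq><preempt>
         *   <abs usecs>us<delay marker>: <caller>
         */
        int n = sscanf(line, " %16[^-]-%d %d%c%c%c%c %luus%*c: %255[^\n]",
                       r->comm, &r->pid, &r->cpu,
                       &r->irqs_off, &r->resched, &r->irq_ctx, &r->preempt,
                       &r->usecs, r->caller);

        return n == 9;
}

int main(void)
{
        const char *example =
                " swapper-0     0d.h. 1595128us+: "
                "set_normalized_timespec+0x8/0x2d <c043841d> "
                "(ktime_get_ts+0x4a/0x4e <c04499d4>)";
        struct lat_line r;

        if (parse_lat_line(example, &r))
                printf("%s[%d] cpu=%d flags=%c%c%c%c at %luus: %s\n",
                       r.comm, r.pid, r.cpu, r.irqs_off, r.resched,
                       r.irq_ctx, r.preempt, r.usecs, r.caller);
        return 0;
}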


Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 454 +++++++++++++++++++++++++++++++++++++++++++++------
lib/tracing/tracer.h | 13 +
2 files changed, 414 insertions(+), 53 deletions(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:38:55.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:22.000000000 -0500
@@ -20,7 +20,9 @@
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/clocksource.h>
+#include <linux/utsrelease.h>
#include <linux/uaccess.h>
+#include <linux/hardirq.h>
#include <linux/mcount.h>

#include "tracer.h"
@@ -34,16 +36,27 @@ static inline notrace cycle_t now(void)
static struct mctracer_trace mctracer_trace;
static DEFINE_PER_CPU(struct mctracer_trace_cpu, mctracer_trace_cpu);

+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_NEED_RESCHED = 0x02,
+ TRACE_FLAG_NEED_RESCHED_DELAYED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_IRQS_HARD_OFF = 0x20,
+};
+
static inline notrace void
mctracer_add_trace_entry(struct mctracer_trace *tr,
int cpu,
const unsigned long ip,
- const unsigned long parent_ip)
+ const unsigned long parent_ip,
+ unsigned long flags)
{
unsigned long idx, idx_next;
struct mctracer_entry *entry;
struct task_struct *tsk = current;
struct mctracer_trace_cpu *data = tr->data[cpu];
+ unsigned long pc;

idx = data->trace_idx;
idx_next = idx + 1;
@@ -58,11 +71,18 @@ mctracer_add_trace_entry(struct mctracer
if (unlikely(idx_next != 0 && atomic_read(&data->underrun)))
atomic_inc(&data->underrun);

+ pc = preempt_count();
+
entry = data->trace + idx * MCTRACER_ENTRY_SIZE;
+ entry->preempt_count = pc & 0xff;
entry->ip = ip;
entry->parent_ip = parent_ip;
entry->pid = tsk->pid;
entry->t = now();
+ entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+ ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+ ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
memcpy(entry->comm, tsk->comm, TASK_COMM_LEN);
}

@@ -80,7 +100,7 @@ static notrace void trace_function(const

atomic_inc(&tr->data[cpu]->disabled);
if (likely(atomic_read(&tr->data[cpu]->disabled) == 1))
- mctracer_add_trace_entry(tr, cpu, ip, parent_ip);
+ mctracer_add_trace_entry(tr, cpu, ip, parent_ip, flags);

atomic_dec(&tr->data[cpu]->disabled);

@@ -91,6 +111,11 @@ static notrace void mctracer_reset(struc
{
int cpu;

+ tr->time_start = now();
+ tr->saved_latency = 0;
+ tr->critical_start = 0;
+ tr->critical_end = 0;
+
for_each_online_cpu(cpu) {
tr->data[cpu]->trace_idx = 0;
atomic_set(&tr->data[cpu]->underrun, 0);
@@ -100,11 +125,24 @@ static notrace void mctracer_reset(struc
#ifdef CONFIG_DEBUG_FS
enum trace_iterator {
TRACE_ITER_SYM_ONLY = 1,
+ TRACE_ITER_VERBOSE = 2,
+};
+
+/* These must match the bit positions above */
+static const char *trace_options[] = {
+ "symonly",
+ "verbose",
+ NULL
+};
+
+enum trace_file_type {
+ TRACE_FILE_LAT_FMT = 1,
};

struct mctracer_iterator {
struct mctracer_trace *tr;
struct mctracer_entry *ent;
+ unsigned long iter_flags;
loff_t pos;
unsigned long next_idx[NR_CPUS];
int cpu;
@@ -130,37 +168,53 @@ static struct mctracer_entry *mctracer_e
return &array[idx];
}

-static void *find_next_entry(struct mctracer_iterator *iter)
+static struct notrace mctracer_entry *
+find_next_entry(struct mctracer_iterator *iter, int *ent_cpu)
{
struct mctracer_trace *tr = iter->tr;
- struct mctracer_entry *ent;
- struct mctracer_entry *next = NULL;
- int next_i = -1;
- int i;
+ struct mctracer_entry *ent, *next = NULL;
+ int next_cpu = -1;
+ int cpu;

- for_each_possible_cpu(i) {
- if (!tr->data[i]->trace)
+ for_each_possible_cpu(cpu) {
+ if (!tr->data[cpu]->trace)
continue;
- ent = mctracer_entry_idx(tr, iter->next_idx[i], i);
+ ent = mctracer_entry_idx(tr, iter->next_idx[cpu], cpu);
if (ent && (!next || next->t > ent->t)) {
next = ent;
- next_i = i;
+ next_cpu = cpu;
}
}
+
+ if (ent_cpu)
+ *ent_cpu = next_cpu;
+
+ return next;
+}
+
+static void *find_next_entry_inc(struct mctracer_iterator *iter)
+{
+ struct mctracer_entry *next;
+ int next_cpu = -1;
+
+ next = find_next_entry(iter, &next_cpu);
+
if (next) {
- iter->next_idx[next_i]++;
+ iter->next_idx[next_cpu]++;
iter->idx++;
}
iter->ent = next;
- iter->cpu = next_i;
+ iter->cpu = next_cpu;

return next ? iter : NULL;
}

-static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+static void notrace *
+s_next(struct seq_file *m, void *v, loff_t *pos)
{
struct mctracer_iterator *iter = m->private;
void *ent;
+ void *last_ent = iter->ent;
int i = (int)*pos;

(*pos)++;
@@ -170,15 +224,18 @@ static void *s_next(struct seq_file *m,
return NULL;

if (iter->idx < 0)
- ent = find_next_entry(iter);
+ ent = find_next_entry_inc(iter);
else
ent = iter;

while (ent && iter->idx < i)
- ent = find_next_entry(iter);
+ ent = find_next_entry_inc(iter);

iter->pos = *pos;

+ if (last_ent && !ent)
+ seq_puts(m, "\n\nvim:ft=help\n");
+
return ent;
}

@@ -239,40 +296,240 @@ static void seq_print_symbol(struct seq_
#endif

static void notrace seq_print_ip_sym(struct seq_file *m,
- unsigned long ip,
- int sym_only)
+ unsigned long ip, int sym_only)
{
+ if (!ip) {
+ seq_printf(m, "0");
+ return;
+ }
+
seq_print_symbol(m, "%s", ip);
if (!sym_only)
seq_printf(m, " <" IP_FMT ">", ip);
}

+static void notrace print_help_header(struct seq_file *m)
+{
+ seq_puts(m, " _------=> CPU# \n");
+ seq_puts(m, " / _-----=> irqs-off \n");
+ seq_puts(m, " | / _----=> need-resched \n");
+ seq_puts(m, " || / _---=> hardirq/softirq \n");
+ seq_puts(m, " ||| / _--=> preempt-depth \n");
+ seq_puts(m, " |||| / \n");
+ seq_puts(m, " ||||| delay \n");
+ seq_puts(m, " cmd pid ||||| time | caller \n");
+ seq_puts(m, " \\ / ||||| \\ | / \n");
+}
+
+static void notrace print_trace_header(struct seq_file *m,
+ struct mctracer_iterator *iter)
+{
+ struct mctracer_trace *tr = iter->tr;
+ unsigned long underruns = 0;
+ unsigned long underrun;
+ unsigned long entries = 0;
+ int sym_only = !!(tr->iter_flags & TRACE_ITER_SYM_ONLY);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (tr->data[cpu]->trace) {
+ underrun = atomic_read(&tr->data[cpu]->underrun);
+ if (underrun) {
+ underruns += underrun;
+ entries += tr->entries;
+ } else
+ entries += tr->data[cpu]->trace_idx;
+ }
+ }
+
+ seq_printf(m, "preemption latency trace v1.1.5 on %s\n",
+ UTS_RELEASE);
+ seq_puts(m, "-----------------------------------"
+ "---------------------------------\n");
+ seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+ " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+ cycles_to_usecs(tr->saved_latency),
+ entries,
+ (entries + underruns),
+ smp_processor_id(),
+#if defined(CONFIG_PREEMPT_NONE)
+ "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+ "desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+ "preempt",
+#else
+ "rt",
+#endif
+ /* These are reserved for later use */
+ 0, 0, 0, 0);
+#ifdef CONFIG_SMP
+ seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+ seq_puts(m, ")\n");
+#endif
+ seq_puts(m, " -----------------\n");
+ seq_printf(m, " | task: %.16s-%d "
+ "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+ tr->comm, tr->pid, tr->uid, tr->nice,
+ tr->policy, tr->rt_priority);
+ seq_puts(m, " -----------------\n");
+
+ if (tr->critical_start) {
+ seq_puts(m, " => started at: ");
+ seq_print_ip_sym(m, tr->critical_start, sym_only);
+ seq_puts(m, "\n => ended at: ");
+ seq_print_ip_sym(m, tr->critical_end, sym_only);
+ seq_puts(m, "\n");
+ }
+
+ seq_puts(m, "\n");
+}
+
+
+static void notrace
+lat_print_generic(struct seq_file *m, struct mctracer_entry *entry, int cpu)
+{
+ int hardirq, softirq;
+
+ seq_printf(m, "%8.8s-%-5d ", entry->comm, entry->pid);
+ seq_printf(m, "%d", cpu);
+ seq_printf(m, "%c%c",
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+ ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ if (hardirq && softirq)
+ seq_putc(m, 'H');
+ else {
+ if (hardirq)
+ seq_putc(m, 'h');
+ else {
+ if (softirq)
+ seq_putc(m, 's');
+ else
+ seq_putc(m, '.');
+ }
+ }
+
+ if (entry->preempt_count)
+ seq_printf(m, "%lx", entry->preempt_count);
+ else
+ seq_puts(m, ".");
+}
+
+unsigned long preempt_mark_thresh = 100;
+
+static void notrace
+lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs,
+ unsigned long rel_usecs)
+{
+ seq_printf(m, " %4lldus", abs_usecs);
+ if (rel_usecs > preempt_mark_thresh)
+ seq_puts(m, "!: ");
+ else if (rel_usecs > 1)
+ seq_puts(m, "+: ");
+ else
+ seq_puts(m, " : ");
+}
+
+static void notrace
+print_lat_fmt(struct seq_file *m, struct mctracer_iterator *iter,
+ unsigned int trace_idx, int cpu)
+{
+ struct mctracer_entry *entry = iter->ent;
+ struct mctracer_entry *next_entry = find_next_entry(iter, NULL);
+ unsigned long abs_usecs;
+ unsigned long rel_usecs;
+ int sym_only = !!(iter->tr->iter_flags & TRACE_ITER_SYM_ONLY);
+ int verbose = !!(iter->tr->iter_flags & TRACE_ITER_VERBOSE);
+
+ if (!next_entry)
+ next_entry = entry;
+ rel_usecs = cycles_to_usecs(next_entry->t - entry->t);
+ abs_usecs = cycles_to_usecs(entry->t - iter->tr->time_start);
+
+ if (verbose) {
+ seq_printf(m, "%16s %5d %d %ld %08lx %08x [%08lx]"
+ " %ld.%03ldms (+%ld.%03ldms): ",
+ entry->comm,
+ entry->pid, cpu, entry->flags,
+ entry->preempt_count, trace_idx,
+ cycles_to_usecs(entry->t),
+ abs_usecs/1000,
+ abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+ } else {
+ lat_print_generic(m, entry, cpu);
+ lat_print_timestamp(m, abs_usecs, rel_usecs);
+ }
+ seq_print_ip_sym(m, entry->ip, sym_only);
+ seq_puts(m, " (");
+ seq_print_ip_sym(m, entry->parent_ip, sym_only);
+ seq_puts(m, ")\n");
+}
+
+static void notrace print_trace_fmt(struct seq_file *m,
+ struct mctracer_iterator *iter)
+{
+ unsigned long usec_rem;
+ unsigned long secs;
+ int sym_only = !!(iter->tr->iter_flags & TRACE_ITER_SYM_ONLY);
+ unsigned long long t;
+
+ t = cycles_to_usecs(iter->ent->t);
+ usec_rem = do_div(t, 1000000ULL);
+ secs = (unsigned long)t;
+
+ seq_printf(m, "[%5lu.%06lu] ", secs, usec_rem);
+ seq_printf(m, "CPU %d: ", iter->cpu);
+ seq_printf(m, "%s:%d ", iter->ent->comm,
+ iter->ent->pid);
+ seq_print_ip_sym(m, iter->ent->ip, sym_only);
+ if (iter->ent->parent_ip) {
+ seq_printf(m, " <-- ");
+ seq_print_ip_sym(m, iter->ent->parent_ip,
+ sym_only);
+ }
+ seq_printf(m, "\n");
+}
+
+static int trace_empty(struct mctracer_iterator *iter)
+{
+ struct mctracer_trace_cpu *data;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ data = iter->tr->data[cpu];
+
+ if (data->trace &&
+ (data->trace_idx ||
+ atomic_read(&data->underrun)))
+ return 0;
+ }
+ return 1;
+}
+
static int s_show(struct seq_file *m, void *v)
{
struct mctracer_iterator *iter = v;
- int sym_only = !!(iter->tr->iter_flags & TRACE_ITER_SYM_ONLY);

if (iter->ent == NULL) {
- seq_printf(m, "mctracer:\n");
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return 0;
+ print_trace_header(m, iter);
+ if (!(iter->tr->iter_flags & TRACE_ITER_VERBOSE))
+ print_help_header(m);
+ } else
+ seq_printf(m, "mctracer:\n");
} else {
- unsigned long long t;
- unsigned long usec_rem;
- unsigned long secs;
-
- t = cycles_to_usecs(iter->ent->t);
- usec_rem = do_div(t, 1000000ULL);
- secs = (unsigned long)t;
-
- seq_printf(m, "[%5lu.%06lu] ", secs, usec_rem);
- seq_printf(m, "CPU %d: ", iter->cpu);
- seq_printf(m, "%s:%d ", iter->ent->comm, iter->ent->pid);
- seq_print_ip_sym(m, iter->ent->ip, sym_only);
- if (iter->ent->parent_ip) {
- seq_printf(m, " <-- ");
- seq_print_ip_sym(m, iter->ent->parent_ip,
- sym_only);
- }
- seq_printf(m, "\n");
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ print_lat_fmt(m, iter, iter->idx, iter->cpu);
+ else
+ print_trace_fmt(m, iter);
}

return 0;
@@ -285,25 +542,52 @@ static struct seq_operations mctrace_seq
.show = s_show,
};

-static int mctrace_open(struct inode *inode, struct file *file)
+static struct mctracer_iterator *
+__mctrace_open(struct inode *inode, struct file *file, int *ret)
{
struct mctracer_iterator *iter;
- int ret;

iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
+ if (!iter) {
+ *ret = -ENOMEM;
+ goto out;
+ }

iter->tr = &mctracer_trace;
iter->pos = -1;

/* TODO stop tracer */
- ret = seq_open(file, &mctrace_seq_ops);
- if (!ret) {
+ *ret = seq_open(file, &mctrace_seq_ops);
+ if (!*ret) {
struct seq_file *m = file->private_data;
m->private = iter;
- } else
+ } else {
kfree(iter);
+ iter = NULL;
+ }
+
+ out:
+ return iter;
+}
+
+static int mctrace_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ __mctrace_open(inode, file, &ret);
+
+ return ret;
+}
+
+static int mctrace_lt_open(struct inode *inode, struct file *file)
+{
+ struct mctracer_iterator *iter;
+ int ret;
+
+ iter = __mctrace_open(inode, file, &ret);
+
+ if (!ret)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;

return ret;
}
@@ -325,6 +609,13 @@ static struct file_operations mctrace_fo
.release = mctrace_release,
};

+static struct file_operations mctrace_lt_fops = {
+ .open = mctrace_lt_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = mctrace_release,
+};
+
static int mctracer_open_generic(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;
@@ -365,6 +656,17 @@ static ssize_t mctracer_ctrl_write(struc
/* When starting a new trace, reset the buffers */
if (val)
mctracer_reset(tr);
+ else {
+ /* pretty meaningless for now */
+ tr->time_end = now();
+ tr->saved_latency = tr->time_end - tr->time_start;
+ memcpy(tr->comm, current->comm, TASK_COMM_LEN);
+ tr->pid = current->pid;
+ tr->uid = current->uid;
+ tr->nice = current->static_prio - 20 - MAX_RT_PRIO;
+ tr->policy = current->policy;
+ tr->rt_priority = current->rt_priority;
+ }

if (tr->ctrl ^ val) {
if (val)
@@ -389,15 +691,38 @@ static ssize_t mctracer_iter_ctrl_read(s
size_t cnt, loff_t *ppos)
{
struct mctracer_trace *tr = filp->private_data;
- char buf[64];
+ char *buf;
int r = 0;
+ int i;
+ int len = 0;

- if (tr->iter_flags & TRACE_ITER_SYM_ONLY)
- r = sprintf(buf, "%s", "symonly ");
- r += sprintf(buf+r, "\n");
+	/* calculate max size */
+ for (i = 0; trace_options[i]; i++) {
+ len += strlen(trace_options[i]);
+ len += 3; /* "no" and space */
+ }

- return simple_read_from_buffer(ubuf, cnt, ppos,
- buf, r);
+ /* +2 for \n and \0 */
+ buf = kmalloc(len + 2, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ for (i = 0; trace_options[i]; i++) {
+ if (tr->iter_flags & (1 << i))
+ r += sprintf(buf + r, "%s ", trace_options[i]);
+ else
+ r += sprintf(buf + r, "no%s ", trace_options[i]);
+ }
+
+ r += sprintf(buf + r, "\n");
+ WARN_ON(r >= len + 2);
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, r);
+
+ kfree(buf);
+
+ return r;
}

static ssize_t mctracer_iter_ctrl_write(struct file *filp,
@@ -406,6 +731,9 @@ static ssize_t mctracer_iter_ctrl_write(
{
struct mctracer_trace *tr = filp->private_data;
char buf[64];
+ char *cmp = buf;
+ int neg = 0;
+ int i;

if (cnt > 63)
cnt = 63;
@@ -415,8 +743,22 @@ static ssize_t mctracer_iter_ctrl_write(

buf[cnt] = 0;

- if (strncmp(buf, "symonly", 7) == 0)
- tr->iter_flags |= TRACE_ITER_SYM_ONLY;
+ if (strncmp(buf, "no", 2) == 0) {
+ neg = 1;
+ cmp += 2;
+ }
+
+ for (i = 0; trace_options[i]; i++) {
+ int len = strlen(trace_options[i]);
+
+ if (strncmp(cmp, trace_options[i], len) == 0) {
+ if (neg)
+ tr->iter_flags &= ~(1 << i);
+ else
+ tr->iter_flags |= (1 << i);
+ break;
+ }
+ }

filp->f_pos += cnt;

@@ -449,6 +791,12 @@ static void mctrace_init_debugfs(void)
&mctracer_trace, &mctracer_iter_fops);
if (!entry)
pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+ entry = debugfs_create_file("function_trace", 0444, d_mctracer,
+ &mctracer_trace, &mctrace_lt_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'function_trace' entry\n");
+
entry = debugfs_create_file("trace", 0444, d_mctracer,
&mctracer_trace, &mctrace_fops);
if (!entry)
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:37:13.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:22.000000000 -0500
@@ -8,6 +8,8 @@ struct mctracer_entry {
unsigned long long t;
unsigned long ip;
unsigned long parent_ip;
+ unsigned long preempt_count;
+ unsigned long flags;
char comm[TASK_COMM_LEN];
pid_t pid;
};
@@ -23,6 +25,17 @@ struct mctracer_trace {
unsigned long entries;
long ctrl;
unsigned long iter_flags;
+ char comm[TASK_COMM_LEN];
+ pid_t pid;
+ uid_t uid;
+ unsigned long nice;
+ unsigned long policy;
+ unsigned long rt_priority;
+ unsigned long saved_latency;
+ unsigned long critical_start;
+ unsigned long critical_end;
+ unsigned long long time_start;
+ unsigned long long time_end;
struct mctracer_trace_cpu *data[NR_CPUS];
};
Daniel Walker
2008-01-10 03:50:15 UTC
Post by Steven Rostedt
+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_NEED_RESCHED = 0x02,
+ TRACE_FLAG_NEED_RESCHED_DELAYED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_IRQS_HARD_OFF = 0x20,
+};
You've got some errant flags here: TRACE_FLAG_NEED_RESCHED_DELAYED is
-rt only, and TRACE_FLAG_IRQS_HARD_OFF is unused (and sort of a
relic).

Daniel

Steven Rostedt
2008-01-09 23:40:16 UTC
The design is for mcount-based tracers to be added through the
lib/tracing/tracer_interface.h file, just as mcount users should add
themselves to lib/tracing/mcount.h. A Kconfig rule chooses the right
MCOUNT and MCOUNT_TRACER user.

This is to avoid function call costs for something that is supposed to
be used only in a debug kernel, and that has to reduce the
per-function-call overhead of mcount-based tracing to the bare minimum.
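
For readers new to the series, here is a minimal, hypothetical sketch of
what a client of this layer looks like, assuming the
register_mcount_function()/clear_mcount_function() interface added by
the earlier mcount patches; the my_* names are made up for illustration
and are not part of the patch:

#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/mcount.h>

/*
 * The callback is invoked on every traced function entry, so it must
 * itself be marked notrace to avoid recursing into the tracer.
 */
static notrace void my_trace(const unsigned long ip,
                             const unsigned long parent_ip)
{
        /* record ip/parent_ip, e.g. into a per-CPU ring buffer */
}

static int __init my_tracer_init(void)
{
        /* start receiving callbacks; clear_mcount_function() undoes this */
        register_mcount_function(my_trace);
        return 0;
}
device_initcall(my_tracer_init);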

Signed-off-by: Arnaldo Carvalho de Melo <***@ghostprotocols.net>
Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/Kconfig | 12 +++
lib/tracing/Makefile | 2
lib/tracing/tracer.c | 124 +++++++++++++++++++++++++++++++++++++++++
lib/tracing/tracer.h | 21 ++++++
lib/tracing/tracer_interface.h | 14 ++++
5 files changed, 173 insertions(+)
create mode 100644 lib/tracing/tracer.c
create mode 100644 lib/tracing/tracer.h
create mode 100644 lib/tracing/tracer_interface.h

Index: linux-compile-i386.git/lib/tracing/Kconfig
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/Kconfig 2008-01-09 14:10:07.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/Kconfig 2008-01-09 15:17:22.000000000 -0500
@@ -5,3 +5,15 @@ config MCOUNT
bool
depends on DEBUG_KERNEL
select FRAME_POINTER
+
+
+config MCOUNT_TRACER
+ bool "Profiler instrumentation based tracer"
+ depends on DEBUG_KERNEL && ARCH_HAS_MCOUNT
+ default n
+ select MCOUNT
+ help
+ Use profiler instrumentation, adding -pg to CFLAGS. This will
+ insert a call to an architecture specific __mcount routine,
+ that the debugging mechanism using this facility will hook by
+ providing a set of inline routines.
Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:45.000000000 -0500
@@ -0,0 +1,124 @@
+/*
+ * ring buffer based mcount tracer
+ *
+ * Copyright (C) 2007 Arnaldo Carvalho de Melo <***@redhat.com>
+ * Steven Rostedt <***@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/seq_file.h>
+#include <linux/mcount.h>
+
+#include "tracer.h"
+#include "tracer_interface.h"
+
+static struct mctracer_trace mctracer_trace;
+
+static inline notrace void
+mctracer_add_trace_entry(struct mctracer_trace *tr,
+ int cpu,
+ const unsigned long ip,
+ const unsigned long parent_ip)
+{
+ unsigned long idx, idx_next;
+ struct mctracer_entry *entry;
+
+ idx = tr->trace_idx[cpu];
+ idx_next = idx + 1;
+
+ if (unlikely(idx_next >= tr->entries)) {
+ atomic_inc(&tr->underrun[cpu]);
+ idx_next = 0;
+ }
+
+ tr->trace_idx[cpu] = idx_next;
+
+ if (unlikely(idx_next != 0 && atomic_read(&tr->underrun[cpu])))
+ atomic_inc(&tr->underrun[cpu]);
+
+ entry = tr->trace[cpu] + idx * MCTRACER_ENTRY_SIZE;
+ entry->idx = atomic_inc_return(&tr->cnt);
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+}
+
+static notrace void trace_function(const unsigned long ip,
+ const unsigned long parent_ip)
+{
+ unsigned long flags;
+ struct mctracer_trace *tr;
+ int cpu;
+
+ raw_local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+
+ tr = &mctracer_trace;
+
+ atomic_inc(&tr->disabled[cpu]);
+ if (likely(atomic_read(&tr->disabled[cpu]) == 1))
+ mctracer_add_trace_entry(tr, cpu, ip, parent_ip);
+
+ atomic_dec(&tr->disabled[cpu]);
+
+ raw_local_irq_restore(flags);
+}
+
+
+static notrace int page_order(const unsigned long size)
+{
+ const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+ return ilog2(roundup_pow_of_two(nr_pages));
+}
+
+static notrace int mctracer_alloc_buffers(void)
+{
+ const int order = page_order(MCTRACER_NR_ENTRIES * MCTRACER_ENTRY_SIZE);
+ const unsigned long size = (1UL << order) << PAGE_SHIFT;
+ struct mctracer_entry *array;
+ int i;
+
+ for_each_possible_cpu(i) {
+ array = (struct mctracer_entry *)
+ __get_free_pages(GFP_KERNEL, order);
+ if (array == NULL) {
+ printk(KERN_ERR "mctracer: failed to allocate"
+ " %ld bytes for trace buffer!\n", size);
+ goto free_buffers;
+ }
+ mctracer_trace.trace[i] = array;
+ }
+
+ /*
+ * Since we allocate by orders of pages, we may be able to
+ * round up a bit.
+ */
+ mctracer_trace.entries = size / MCTRACER_ENTRY_SIZE;
+
+ pr_info("mctracer: %ld bytes allocated for %ld entries of %d bytes\n",
+ size, MCTRACER_NR_ENTRIES, MCTRACER_ENTRY_SIZE);
+ pr_info(" actual entries %ld\n", mctracer_trace.entries);
+
+ register_mcount_function(trace_function);
+
+ return 0;
+
+ free_buffers:
+ for (i-- ; i >= 0; i--) {
+ if (mctracer_trace.trace[i]) {
+ free_pages((unsigned long)mctracer_trace.trace[i],
+ order);
+ mctracer_trace.trace[i] = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+device_initcall(mctracer_alloc_buffers);
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:45.000000000 -0500
@@ -0,0 +1,21 @@
+#ifndef _LINUX_MCOUNT_TRACER_H
+#define _LINUX_MCOUNT_TRACER_H
+
+#include <asm/atomic.h>
+
+struct mctracer_entry {
+ unsigned long idx;
+ unsigned long ip;
+ unsigned long parent_ip;
+};
+
+struct mctracer_trace {
+ void *trace[NR_CPUS];
+ unsigned long trace_idx[NR_CPUS];
+ unsigned long entries;
+ atomic_t cnt;
+ atomic_t disabled[NR_CPUS];
+ atomic_t underrun[NR_CPUS];
+};
+
+#endif /* _LINUX_MCOUNT_TRACER_H */
Index: linux-compile-i386.git/lib/tracing/tracer_interface.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile-i386.git/lib/tracing/tracer_interface.h 2008-01-09 15:17:22.000000000 -0500
@@ -0,0 +1,14 @@
+#ifndef _LINUX_MCTRACER_INTERFACE_H
+#define _LINUX_MCTRACER_INTERFACE_H
+
+#include "tracer.h"
+
+/*
+ * Will be at least sizeof(struct mctracer_entry), but callers can request more
+ * space for private stuff, such as a timestamp, preempt_count, etc.
+ */
+#define MCTRACER_ENTRY_SIZE sizeof(struct mctracer_entry)
+
+#define MCTRACER_NR_ENTRIES (65536UL)
+
+#endif /* _LINUX_MCTRACER_INTERFACE_H */
Index: linux-compile-i386.git/lib/tracing/Makefile
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/Makefile 2008-01-09 14:10:07.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/Makefile 2008-01-09 15:17:22.000000000 -0500
@@ -1,3 +1,5 @@
obj-$(CONFIG_MCOUNT) += libmcount.o

+obj-$(CONFIG_MCOUNT_TRACER) += tracer.o
+
libmcount-y := mcount.o
Steven Rostedt
2008-01-09 23:40:15 UTC
The trace output is very verbose, outputting both the IP address
(Instruction Pointer, not Internet Protocol!) and the kallsyms symbol.
So if kallsyms is configured into the kernel, another file is created
in the debugfs system. This is the trace_symonly file, which leaves
out the IP address.

Here's an example:

CPU 1: swapper:0 smp_apic_timer_interrupt+0xc/0x58 <-- apic_timer_interrupt+0x66/0x70
CPU 1: swapper:0 exit_idle+0x9/0x22 <-- smp_apic_timer_interrupt+0x35/0x58
CPU 0: sshd:2611 _spin_unlock+0x9/0x38 <-- __qdisc_run+0xb2/0x1a1
CPU 1: swapper:0 __exit_idle+0x9/0x2e <-- exit_idle+0x20/0x22
CPU 0: sshd:2611 _spin_lock+0xe/0x7a <-- __qdisc_run+0xba/0x1a1
CPU 1: swapper:0 atomic_notifier_call_chain+0x9/0x16 <-- __exit_idle+0x2c/0x2e
CPU 1: swapper:0 __atomic_notifier_call_chain+0xe/0x56 <-- atomic_notifier_call_chain+0x14/0x16
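
For completeness, a hypothetical userspace sketch of turning the option
on through the iter_ctrl file this patch adds. The "/debugfs" prefix
follows the path convention used elsewhere in this series and is an
assumption about where debugfs is mounted:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* path assumes debugfs is mounted on /debugfs */
        int fd = open("/debugfs/tracing/iter_ctrl", O_WRONLY);

        if (fd < 0) {
                perror("iter_ctrl");
                return 1;
        }
        /* the write handler matches the string "symonly" */
        if (write(fd, "symonly", 7) != 7)
                perror("write");
        close(fd);
        return 0;
}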


Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++----
lib/tracing/tracer.h | 1
2 files changed, 62 insertions(+), 4 deletions(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:13:46.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:37.000000000 -0500
@@ -79,6 +79,10 @@ static notrace void trace_function(const
}

#ifdef CONFIG_DEBUG_FS
+enum trace_iterator {
+ TRACE_ITER_SYM_ONLY = 1,
+};
+
struct mctracer_iterator {
struct mctracer_trace *tr;
struct mctracer_entry *ent;
@@ -207,25 +211,29 @@ static void seq_print_symbol(struct seq_
#endif

static void notrace seq_print_ip_sym(struct seq_file *m,
- unsigned long ip)
+ unsigned long ip,
+ int sym_only)
{
seq_print_symbol(m, "%s", ip);
- seq_printf(m, " <" IP_FMT ">", ip);
+ if (!sym_only)
+ seq_printf(m, " <" IP_FMT ">", ip);
}

static int s_show(struct seq_file *m, void *v)
{
struct mctracer_iterator *iter = v;
+ int sym_only = !!(iter->tr->iter_flags & TRACE_ITER_SYM_ONLY);

if (iter->ent == NULL) {
seq_printf(m, "mctracer:\n");
} else {
seq_printf(m, "CPU %d: ", iter->cpu);
seq_printf(m, "%s:%d ", iter->ent->comm, iter->ent->pid);
- seq_print_ip_sym(m, iter->ent->ip);
+ seq_print_ip_sym(m, iter->ent->ip, sym_only);
if (iter->ent->parent_ip) {
seq_printf(m, " <-- ");
- seq_print_ip_sym(m, iter->ent->parent_ip);
+ seq_print_ip_sym(m, iter->ent->parent_ip,
+ sym_only);
}
seq_printf(m, "\n");
}
@@ -335,6 +343,50 @@ static struct file_operations mctracer_c
.write = mctracer_ctrl_write,
};

+static ssize_t mctracer_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct mctracer_trace *tr = filp->private_data;
+ char buf[64];
+ int r = 0;
+
+ if (tr->iter_flags & TRACE_ITER_SYM_ONLY)
+ r = sprintf(buf, "%s", "symonly ");
+ r += sprintf(buf+r, "\n");
+
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, r);
+}
+
+static ssize_t mctracer_iter_ctrl_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct mctracer_trace *tr = filp->private_data;
+ char buf[64];
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (strncmp(buf, "symonly", 7) == 0)
+ tr->iter_flags |= TRACE_ITER_SYM_ONLY;
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static struct file_operations mctracer_iter_fops = {
+ .open = mctracer_open_generic,
+ .read = mctracer_iter_ctrl_read,
+ .write = mctracer_iter_ctrl_write,
+};
+
static void mctrace_init_debugfs(void)
{
struct dentry *d_mctracer;
@@ -351,10 +403,15 @@ static void mctrace_init_debugfs(void)
if (!entry)
pr_warning("Could not create debugfs 'ctrl' entry\n");

+ entry = debugfs_create_file("iter_ctrl", 0644, d_mctracer,
+ &mctracer_trace, &mctracer_iter_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
entry = debugfs_create_file("trace", 0444, d_mctracer,
&mctracer_trace, &mctrace_fops);
if (!entry)
pr_warning("Could not create debugfs 'trace' entry\n");
+
}
#else /* CONFIG_DEBUG_FS */
static void mctrace_init_debugfs(void)
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:13:46.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:36.000000000 -0500
@@ -17,6 +17,7 @@ struct mctracer_trace {
unsigned long trace_idx[NR_CPUS];
unsigned long entries;
long ctrl;
+ unsigned long iter_flags;
atomic_t cnt;
atomic_t disabled[NR_CPUS];
atomic_t underrun[NR_CPUS];
Steven Rostedt
2008-01-09 23:40:16 UTC
Add /debugfs/tracing/trace to output the trace.

Here's an example of the content.

CPU 0: [<ffffffff80494691>] notifier_call_chain+0x16/0x60 <-- [<ffffffff80494701>] __atomic_notifier_call_chain+0x26/0x56
CPU 0: [<ffffffff802161c8>] mce_idle_callback+0x9/0x2f <-- [<ffffffff804946b3>] notifier_call_chain+0x38/0x60
CPU 0: [<ffffffff8037fb7a>] acpi_processor_idle+0x16/0x518 <-- [<ffffffff8020aee8>] cpu_idle+0xa1/0xe7
CPU 0: [<ffffffff8037fa98>] acpi_safe_halt+0x9/0x43 <-- [<ffffffff8037fd3a>] acpi_processor_idle+0x1d6/0x518
CPU 1: [<ffffffff80221db8>] smp_apic_timer_interrupt+0xc/0x58 <-- [<ffffffff8020cf06>] apic_timer_interrupt+0x66/0x70
CPU 1: [<ffffffff8020ac22>] exit_idle+0x9/0x22 <-- [<ffffffff80221de1>] smp_apic_timer_interrupt+0x35/0x58
CPU 1: [<ffffffff8020ab97>] __exit_idle+0x9/0x2e <-- [<ffffffff8020ac39>] exit_idle+0x20/0x22
CPU 1: [<ffffffff8049473a>] atomic_notifier_call_chain+0x9/0x16 <-- [<ffffffff8020abba>] __exit_idle+0x2c/0x2e
CPU 1: [<ffffffff804946e9>] __atomic_notifier_call_chain+0xe/0x56 <-- [<ffffffff80494745>] atomic_notifier_call_chain+0x14/0x16
CPU 1: [<ffffffff80494691>] notifier_call_chain+0x16/0x60 <-- [<ffffffff80494701>] __atomic_notifier_call_chain+0x26/0x56
CPU 1: [<ffffffff802161c8>] mce_idle_callback+0x9/0x2f <-- [<ffffffff804946b3>] notifier_call_chain+0x38/0x60

This is the format of the output when KALLSYMS is defined.

CPU <CPU#>: [<IP>] <func> <-- [<Parent-IP>] <parent-func>
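
The file itself is plain seq_file plumbing hooked into debugfs. Below is
a self-contained, hypothetical sketch of that pattern; all my_* names
are illustrative and not from the patch, whose real code wires
s_start/s_next/s_stop/s_show into mctrace_seq_ops and registers "trace"
with debugfs_create_file():

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/seq_file.h>

static void *my_start(struct seq_file *m, loff_t *pos)
{
        /* a single-record file: only position 0 yields something to show */
        return *pos == 0 ? SEQ_START_TOKEN : NULL;
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return NULL;            /* nothing more to iterate over */
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
        seq_printf(m, "CPU 0: example line\n");
        return 0;
}

static struct seq_operations my_seq_ops = {
        .start  = my_start,
        .next   = my_next,
        .stop   = my_stop,
        .show   = my_show,
};

static int my_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &my_seq_ops);
}

static struct file_operations my_fops = {
        .open           = my_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static int __init my_init(void)
{
        /* NULL parent places the file at the debugfs root */
        debugfs_create_file("my_trace", 0444, NULL, NULL, &my_fops);
        return 0;
}
device_initcall(my_init);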

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 206 insertions(+)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:11:59.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:41.000000000 -0500
@@ -13,9 +13,11 @@
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
+#include <linux/module.h>
#include <linux/linkage.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/mcount.h>

@@ -74,6 +76,205 @@ static notrace void trace_function(const
}

#ifdef CONFIG_DEBUG_FS
+struct mctracer_iterator {
+ struct mctracer_trace *tr;
+ struct mctracer_entry *ent;
+ unsigned long next_idx[NR_CPUS];
+ int cpu;
+ int idx;
+};
+
+static struct mctracer_entry *mctracer_entry_idx(struct mctracer_trace *tr,
+ unsigned long idx,
+ int cpu)
+{
+ struct mctracer_entry *array = tr->trace[cpu];
+ unsigned long underrun;
+
+ if (idx >= tr->entries)
+ return NULL;
+
+ underrun = atomic_read(&tr->underrun[cpu]);
+ if (underrun)
+ idx = ((underrun - 1) + idx) % tr->entries;
+ else if (idx >= tr->trace_idx[cpu])
+ return NULL;
+
+ return &array[idx];
+}
+
+static void *find_next_entry(struct mctracer_iterator *iter)
+{
+ struct mctracer_trace *tr = iter->tr;
+ struct mctracer_entry *ent;
+ struct mctracer_entry *next = NULL;
+ int next_i = -1;
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (!tr->trace[i])
+ continue;
+ ent = mctracer_entry_idx(tr, iter->next_idx[i], i);
+ if (ent && (!next || next->idx > ent->idx)) {
+ next = ent;
+ next_i = i;
+ }
+ }
+ if (next) {
+ iter->next_idx[next_i]++;
+ iter->idx++;
+ }
+ iter->ent = next;
+ iter->cpu = next_i;
+
+ return next ? iter : NULL;
+}
+
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct mctracer_iterator *iter = m->private;
+ void *ent;
+ int i = (int)*pos;
+
+ (*pos)++;
+
+ /* can't go backwards */
+ if (iter->idx > i)
+ return NULL;
+
+ if (iter->idx < 0)
+ ent = find_next_entry(iter);
+ else
+ ent = iter;
+
+ while (ent && iter->idx < i)
+ ent = find_next_entry(iter);
+
+ return ent;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ struct mctracer_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l = 0;
+ int i;
+
+ iter->ent = NULL;
+ iter->cpu = 0;
+ iter->idx = -1;
+
+ for (i = 0; i < NR_CPUS; i++)
+ iter->next_idx[i] = 0;
+
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl)
+ clear_mcount_function();
+
+ for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+ ;
+
+ return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ struct mctracer_iterator *iter = m->private;
+ if (iter->tr->ctrl)
+ register_mcount_function(trace_function);
+}
+
+#ifdef CONFIG_KALLSYMS
+static void seq_print_symbol(struct seq_file *m,
+ const char *fmt, unsigned long address)
+{
+ char buffer[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(buffer, address);
+ seq_printf(m, fmt, buffer);
+}
+#else
+# define seq_print_symbol(m, fmt, address) do { } while (0)
+#endif
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+static void notrace seq_print_ip_sym(struct seq_file *m,
+ unsigned long ip)
+{
+ seq_print_symbol(m, "%s", ip);
+ seq_printf(m, " <" IP_FMT ">", ip);
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+ struct mctracer_iterator *iter = v;
+
+ if (iter->ent == NULL) {
+ seq_printf(m, "mctracer:\n");
+ } else {
+ seq_printf(m, " CPU %d: ", iter->cpu);
+ seq_print_ip_sym(m, iter->ent->ip);
+ if (iter->ent->parent_ip) {
+ seq_printf(m, " <-- ");
+ seq_print_ip_sym(m, iter->ent->parent_ip);
+ }
+ seq_printf(m, "\n");
+ }
+
+ return 0;
+}
+
+static struct seq_operations mctrace_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int mctrace_open(struct inode *inode, struct file *file)
+{
+ struct mctracer_iterator *iter;
+ int ret;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ iter->tr = &mctracer_trace;
+
+ /* TODO stop tracer */
+ ret = seq_open(file, &mctrace_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = iter;
+ } else
+ kfree(iter);
+
+ return ret;
+}
+
+int mctrace_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ struct mctracer_iterator *iter = m->private;
+
+ seq_release(inode, file);
+ kfree(iter);
+ return 0;
+}
+
+static struct file_operations mctrace_fops = {
+ .open = mctrace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = mctrace_release,
+};
+
static int mctracer_open_generic(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;
@@ -145,6 +346,11 @@ static void mctrace_init_debugfs(void)
&mctracer_trace, &mctracer_ctrl_fops);
if (!entry)
pr_warning("Could not create debugfs 'ctrl' entry\n");
+
+ entry = debugfs_create_file("trace", 0444, d_mctracer,
+ &mctracer_trace, &mctrace_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'trace' entry\n");
}
#else /* CONFIG_DEBUG_FS */
static void mctrace_init_debugfs(void)
--
Steven Rostedt
2008-01-09 23:40:16 UTC
Permalink
Now that each entry has a reliable timestamp, we can
use the timestamp to sort the trace entries and
remove the atomic increment.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 3 +--
lib/tracing/tracer.h | 2 --
2 files changed, 1 insertion(+), 4 deletions(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:35:58.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:25.000000000 -0500
@@ -59,7 +59,6 @@ mctracer_add_trace_entry(struct mctracer
atomic_inc(&data->underrun);

entry = data->trace + idx * MCTRACER_ENTRY_SIZE;
- entry->idx = atomic_inc_return(&tr->cnt);
entry->ip = ip;
entry->parent_ip = parent_ip;
entry->pid = tsk->pid;
@@ -142,7 +141,7 @@ static void *find_next_entry(struct mctr
if (!tr->data[i]->trace)
continue;
ent = mctracer_entry_idx(tr, iter->next_idx[i], i);
- if (ent && (!next || next->idx > ent->idx)) {
+ if (ent && (!next || next->t > ent->t)) {
next = ent;
next_i = i;
}
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:35:11.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:24.000000000 -0500
@@ -6,7 +6,6 @@

struct mctracer_entry {
unsigned long long t;
- unsigned long idx;
unsigned long ip;
unsigned long parent_ip;
char comm[TASK_COMM_LEN];
@@ -24,7 +23,6 @@ struct mctracer_trace {
unsigned long entries;
long ctrl;
unsigned long iter_flags;
- atomic_t cnt;
struct mctracer_trace_cpu *data[NR_CPUS];
};
--
Steven Rostedt
2008-01-09 23:40:17 UTC
Permalink
This patch adds a control interface in debugfs:

/debugfs/tracing/ctrl

Echoing 1 into the ctrl file turns the tracer on,
and echoing 0 turns it off.
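
For illustration only (not part of the patch), here is a minimal
user-space sketch of driving the ctrl file from C. The /debugfs mount
point and the mctracer_set() helper name are just assumptions for the
example:

#include <fcntl.h>
#include <unistd.h>

static int mctracer_set(int on)
{
        int fd = open("/debugfs/tracing/ctrl", O_WRONLY);

        if (fd < 0)
                return -1;
        /* the kernel side parses the buffer with simple_strtoul() */
        if (write(fd, on ? "1" : "0", 1) != 1) {
                close(fd);
                return -1;
        }
        close(fd);
        return 0;
}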

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++-
lib/tracing/tracer.h | 1
2 files changed, 87 insertions(+), 1 deletion(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:10:46.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:43.000000000 -0500
@@ -15,6 +15,8 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
#include <linux/mcount.h>

#include "tracer.h"
@@ -71,6 +73,89 @@ static notrace void trace_function(const
raw_local_irq_restore(flags);
}

+#ifdef CONFIG_DEBUG_FS
+static int mctracer_open_generic(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+
+static ssize_t mctracer_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct mctracer_trace *tr = filp->private_data;
+ char buf[16];
+ int r;
+
+ r = sprintf(buf, "%ld\n", tr->ctrl);
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, r);
+}
+
+static ssize_t mctracer_ctrl_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct mctracer_trace *tr = filp->private_data;
+ long val;
+ char buf[16];
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ val = !!simple_strtoul(buf, NULL, 10);
+
+ if (tr->ctrl ^ val) {
+ if (val)
+ register_mcount_function(trace_function);
+ else
+ clear_mcount_function();
+ tr->ctrl = val;
+ }
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static struct file_operations mctracer_ctrl_fops = {
+ .open = mctracer_open_generic,
+ .read = mctracer_ctrl_read,
+ .write = mctracer_ctrl_write,
+};
+
+static void mctrace_init_debugfs(void)
+{
+ struct dentry *d_mctracer;
+ struct dentry *entry;
+
+ d_mctracer = debugfs_create_dir("tracing", NULL);
+ if (!d_mctracer) {
+ pr_warning("Could not create debugfs directory mctracer\n");
+ return;
+ }
+
+ entry = debugfs_create_file("ctrl", 0644, d_mctracer,
+ &mctracer_trace, &mctracer_ctrl_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'ctrl' entry\n");
+}
+#else /* CONFIG_DEBUG_FS */
+static void mctrace_init_debugfs(void)
+{
+ /*
+ * No way to turn on or off the trace function
+ * without debugfs, so we just turn it on.
+ */
+ register_mcount_function(trace_function);
+}
+#endif /* CONFIG_DEBUG_FS */

static notrace int page_order(const unsigned long size)
{
@@ -106,7 +191,7 @@ static notrace int mctracer_alloc_buffer
size, MCTRACER_NR_ENTRIES, MCTRACER_ENTRY_SIZE);
pr_info(" actual entries %ld\n", mctracer_trace.entries);

- register_mcount_function(trace_function);
+ mctrace_init_debugfs();

return 0;

Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:10:46.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:41.000000000 -0500
@@ -13,6 +13,7 @@ struct mctracer_trace {
void *trace[NR_CPUS];
unsigned long trace_idx[NR_CPUS];
unsigned long entries;
+ long ctrl;
atomic_t cnt;
atomic_t disabled[NR_CPUS];
atomic_t underrun[NR_CPUS];
--
Steven Rostedt
2008-01-09 23:40:17 UTC
Permalink
The latency tracer needs a way to get an accurate time
without grabbing any locks. Locks themselves might call
the latency tracer and cause, at best, a slowdown.

This patch adds get_monotonic_cycles that returns cycles
from a reliable clock source in a monotonic fashion.
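
As an illustrative sketch only (not part of the patch), a caller could
combine get_monotonic_cycles() with cycles_to_usecs() to time a region;
do_something() here is a hypothetical stand-in for whatever is being
measured:

static void notrace example_measure(void)
{
        cycle_t start, end;

        start = get_monotonic_cycles();
        do_something();         /* hypothetical region being timed */
        end = get_monotonic_cycles();

        printk(KERN_INFO "region took %lu us\n",
               cycles_to_usecs(end - start));
}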

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/clocksource.h | 3 ++
kernel/time/timekeeping.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 51 insertions(+)

Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:27:26.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 14:34:40.000000000 -0500
@@ -103,6 +103,54 @@ static inline void __get_realtime_clock_
timespec_add_ns(ts, nsecs);
}

+cycle_t notrace get_monotonic_cycles(void)
+{
+ cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
+
+ do {
+ /*
+ * cycle_raw and cycle_last can change on
+ * another CPU and we need the delta calculation
+ * of cycle_now and cycle_last to happen atomically, as well
+ * as the adding to cycle_raw. We don't need to grab
+ * any locks, we just keep trying until we get all the
+ * calculations together in one state.
+ *
+ * In fact, we __cant__ grab any locks. This
+ * function is called from the latency_tracer which can
+ * be called anywhere. To grab any locks (including
+ * seq_locks) we risk putting ourselves into a deadlock.
+ */
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
+
+unsigned long notrace cycles_to_usecs(cycle_t cycles)
+{
+ u64 ret = cyc2ns(clock, cycles);
+
+ ret += NSEC_PER_USEC/2; /* For rounding in do_div() */
+ do_div(ret, NSEC_PER_USEC);
+
+ return ret;
+}
+
+cycle_t notrace usecs_to_cycles(unsigned long usecs)
+{
+ return ns2cyc(clock, (u64)usecs * 1000);
+}
+
/**
* getnstimeofday - Returns the time of day in a timespec
* @ts: pointer to the timespec to be set
Index: linux-compile-i386.git/include/linux/clocksource.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/clocksource.h 2008-01-09 14:27:51.000000000 -0500
+++ linux-compile-i386.git/include/linux/clocksource.h 2008-01-09 14:29:44.000000000 -0500
@@ -273,6 +273,9 @@ extern int clocksource_register(struct c
extern struct clocksource* clocksource_get_next(void);
extern void clocksource_change_rating(struct clocksource *cs, int rating);
extern void clocksource_resume(void);
+extern cycle_t get_monotonic_cycles(void);
+extern unsigned long cycles_to_usecs(cycle_t cycles);
+extern cycle_t usecs_to_cycles(unsigned long usecs);

/* used to initialize clock */
extern struct clocksource clocksource_jiffies;
--
Daniel Walker
2008-01-10 03:40:07 UTC
Permalink
Post by Steven Rostedt
+cycle_t notrace get_monotonic_cycles(void)
+{
+ cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
+
+ do {
+ /*
+ * cycle_raw and cycle_last can change on
+ * another CPU and we need the delta calculation
+ * of cycle_now and cycle_last happen atomic, as well
+ * as the adding to cycle_raw. We don't need to grab
+ * any locks, we just keep trying until get all the
+ * calculations together in one state.
+ *
+ * In fact, we __cant__ grab any locks. This
+ * function is called from the latency_tracer which can
+ * be called anywhere. To grab any locks (including
+ * seq_locks) we risk putting ourselves into a deadlock.
+ */
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
The last I checked, this change caused problems for me with the -rt
latency tracer. I haven't tested this tree, but all things being equal
I would imagine the problem exists here also.

Daniel

Mathieu Desnoyers
2008-01-15 21:50:10 UTC
Permalink
Post by Steven Rostedt
The latency tracer needs a way to get an accurate time
without grabbing any locks. Locks themselves might call
the latency tracer and cause at best a slow down.
This patch adds get_monotonic_cycles that returns cycles
from a reliable clock source in a monotonic fashion.
---
include/linux/clocksource.h | 3 ++
kernel/time/timekeeping.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 51 insertions(+)
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:27:26.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 14:34:40.000000000 -0500
@@ -103,6 +103,54 @@ static inline void __get_realtime_clock_
timespec_add_ns(ts, nsecs);
}
+cycle_t notrace get_monotonic_cycles(void)
+{
+ cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
+
+ do {
+ /*
+ * cycle_raw and cycle_last can change on
+ * another CPU and we need the delta calculation
+ * of cycle_now and cycle_last happen atomic, as well
+ * as the adding to cycle_raw. We don't need to grab
+ * any locks, we just keep trying until get all the
+ * calculations together in one state.
+ *
+ * In fact, we __cant__ grab any locks. This
+ * function is called from the latency_tracer which can
+ * be called anywhere. To grab any locks (including
+ * seq_locks) we risk putting ourselves into a deadlock.
+ */
I wonder what makes the compiler read the clock->cycle_raw and
clock->cycle_last variables twice ? I guess some memory barriers could
be welcome here ?
Post by Steven Rostedt
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
+
+unsigned long notrace cycles_to_usecs(cycle_t cycles)
+{
+ u64 ret = cyc2ns(clock, cycles);
+
+ ret += NSEC_PER_USEC/2; /* For rounding in do_div() */
+ do_div(ret, NSEC_PER_USEC);
+
+ return ret;
+}
+
+cycle_t notrace usecs_to_cycles(unsigned long usecs)
+{
+ return ns2cyc(clock, (u64)usecs * 1000);
+}
+
/**
* getnstimeofday - Returns the time of day in a timespec
Index: linux-compile-i386.git/include/linux/clocksource.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/clocksource.h 2008-01-09 14:27:51.000000000 -0500
+++ linux-compile-i386.git/include/linux/clocksource.h 2008-01-09 14:29:44.000000000 -0500
@@ -273,6 +273,9 @@ extern int clocksource_register(struct c
extern struct clocksource* clocksource_get_next(void);
extern void clocksource_change_rating(struct clocksource *cs, int rating);
extern void clocksource_resume(void);
+extern cycle_t get_monotonic_cycles(void);
+extern unsigned long cycles_to_usecs(cycle_t cycles);
+extern cycle_t usecs_to_cycles(unsigned long usecs);
/* used to initialize clock */
extern struct clocksource clocksource_jiffies;
--
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-15 22:10:14 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
---
include/linux/clocksource.h | 3 ++
kernel/time/timekeeping.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 51 insertions(+)
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:27:26.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 14:34:40.000000000 -0500
@@ -103,6 +103,54 @@ static inline void __get_realtime_clock_
timespec_add_ns(ts, nsecs);
}
+cycle_t notrace get_monotonic_cycles(void)
+{
+ cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
+
+ do {
+ /*
+ * cycle_raw and cycle_last can change on
+ * another CPU and we need the delta calculation
+ * of cycle_now and cycle_last happen atomic, as well
+ * as the adding to cycle_raw. We don't need to grab
+ * any locks, we just keep trying until get all the
+ * calculations together in one state.
+ *
+ * In fact, we __cant__ grab any locks. This
+ * function is called from the latency_tracer which can
+ * be called anywhere. To grab any locks (including
+ * seq_locks) we risk putting ourselves into a deadlock.
+ */
I wonder what makes the compiler read the clock->cycle_raw and
clock->cycle_last variables twice ? I guess some memory barriers could
be welcome here ?
We need both cycle_raw and cycle_last to be the same from the time we read
the clock source to the time we calculate cycle_delta. If either one
changes then delta is bogus.

Also, it just occurred to me that this is an old patch. I thought I
renamed cycle_raw to cycle_monotonic. But I must have lost that patch :-/
Post by Mathieu Desnoyers
Post by Steven Rostedt
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
-- Steve

Mathieu Desnoyers
2008-01-15 22:10:13 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
---
include/linux/clocksource.h | 3 ++
kernel/time/timekeeping.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 51 insertions(+)
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:27:26.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 14:34:40.000000000 -0500
@@ -103,6 +103,54 @@ static inline void __get_realtime_clock_
timespec_add_ns(ts, nsecs);
}
+cycle_t notrace get_monotonic_cycles(void)
+{
+ cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
+
+ do {
+ /*
+ * cycle_raw and cycle_last can change on
+ * another CPU and we need the delta calculation
+ * of cycle_now and cycle_last happen atomic, as well
+ * as the adding to cycle_raw. We don't need to grab
+ * any locks, we just keep trying until get all the
+ * calculations together in one state.
+ *
+ * In fact, we __cant__ grab any locks. This
+ * function is called from the latency_tracer which can
+ * be called anywhere. To grab any locks (including
+ * seq_locks) we risk putting ourselves into a deadlock.
+ */
I wonder what makes the compiler read the clock->cycle_raw and
clock->cycle_last variables twice ? I guess some memory barriers could
be welcome here ?
We need both cycle_raw and cycle_last to be the same from the time we read
the clock source to the time we calculate cycle_delta. If either one
changes then delta is bogus.
Ok, but what actually ensures that the clock->cycle_* reads won't be
reordered across the clocksource_read()?
Post by Steven Rostedt
Also, it just occurred to me that this is an old patch. I thought I
renamed cycle_raw to cycle_monotonic. But I must have lost that patch :-/
Post by Mathieu Desnoyers
Post by Steven Rostedt
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
-- Steve
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-16 01:40:09 UTC
Permalink
Post by Mathieu Desnoyers
Ok, but what actually insures that the clock->cycle_* reads won't be
reordered across the clocksource_read() ?
<looks at code>

Hmm, interesting. I didn't notice that clocksource_read() is a static
inline. I was thinking that since it was passing a pointer to a function,
gcc could not assume that it could move that code across it. But now,
looking closer, clocksource_read() is simply a static inline that does:

        cs->read();

But still, can gcc assume that it can push loads of unknown-origin
variables across function calls? So something like:

static int *glob;

void foo(void) {
        int x;

        x = *glob;

        bar();

        if (x != *glob)
                /* ... */
}

I can't see how any compiler could honestly move the loading of the first
x to after the call to bar(). glob points to some unknown
variable that bar() may perfectly well modify.
Post by Mathieu Desnoyers
Post by Steven Rostedt
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
So the question here is, can cycle_raw and cycle_last be loaded from
the unknown source that clock points to after the call to
clocksource_read()?

I'm thinking not.
Post by Mathieu Desnoyers
Post by Steven Rostedt
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
-- Steve

Mathieu Desnoyers
2008-01-16 03:30:21 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
Ok, but what actually insures that the clock->cycle_* reads won't be
reordered across the clocksource_read() ?
<looks at code>
Hmm, interesting.I didn't notice that clocksource_read() is a static
inline. I was thinking that since it was passing a pointer to a function,
gcc could not assume that it could move that code across it. But now
cs->read();
But still, can gcc assume that it can push loads of unknown origin
static int *glob;
void foo(void) {
int x;
x = *glob;
bar();
if (x != *glob)
/* ... */
}
I can't see how any compiler could honestly move the loading of the first
x after the calling of bar(). With glob pointing to some unknown
variable, that may be perfectly fine for bar to modify.
Post by Mathieu Desnoyers
Post by Steven Rostedt
+ cycle_raw = clock->cycle_raw;
+ cycle_last = clock->cycle_last;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
So the question here is,can cycle_raw and cycle_last be loaded from
the unknown source that clock points to after the call to
clocksource_read()?
I'm thinking not.
I agree with you that I don't see how the compiler could reorder this.
So we forget about compiler barriers. Also, the clock source used is a
synchronized clock source (get_cycles_sync on x86_64), so it should make
sure the TSC is read at the right moment.

However, what happens if the clock source is, say, the jiffies ?

In this case, we have:

static cycle_t jiffies_read(void)
{
        return (cycle_t) jiffies;
}

Which is nothing more than a memory read of

extern unsigned long volatile __jiffy_data jiffies;

I think it is wrong to assume that reads from clock->cycle_raw and from
jiffies will be ordered correctly in SMP. I am tempted to think that
ordering memory writes to clock->cycle_raw vs jiffies is also needed in this
case (where clock->cycle_raw is updated, or where jiffies is updated).

We can fall into the same kind of issue if we read the HPET, which is
memory-I/O based. It does not seem correct to assume that MMIO vs
normal memory reads are ordered (pointing back to this article:
http://lwn.net/Articles/198988/).
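
To illustrate the kind of explicit ordering I have in mind (just a
sketch, not something this thread has settled on, and the exact barrier
choice is debatable), the read side could pin the clock->cycle_* loads
around the possibly-MMIO clocksource read:

cycle_t notrace get_monotonic_cycles(void)
{
        cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;

        do {
                cycle_raw = clock->cycle_raw;
                cycle_last = clock->cycle_last;
                rmb();  /* order the loads above before the clocksource read */
                cycle_now = clocksource_read(clock);
                cycle_delta = (cycle_now - cycle_last) & clock->mask;
                rmb();  /* ... and before the re-check below */
        } while (cycle_raw != clock->cycle_raw ||
                 cycle_last != clock->cycle_last);

        return cycle_raw + cycle_delta;
}

The write side in clocksource_accumulate would of course need matching
write barriers for this to mean anything.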

Mathieu
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+ } while (cycle_raw != clock->cycle_raw ||
+ cycle_last != clock->cycle_last);
+
+ return cycle_raw + cycle_delta;
+}
-- Steve
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-16 13:20:08 UTC
Permalink
[ CC'd Daniel Walker, since he had problems with this code ]
Post by Mathieu Desnoyers
I agree with you that I don't see how the compiler could reorder this.
So we forget about compiler barriers. Also, the clock source used is a
synchronized clock source (get_cycles_sync on x86_64), so it should make
sure the TSC is read at the right moment.
However, what happens if the clock source is, say, the jiffies ?
static cycle_t jiffies_read(void)
{
return (cycle_t) jiffies;
}
Which is nothing more than a memory read of
extern unsigned long volatile __jiffy_data jiffies;
Yep, and that's not my concern.
Post by Mathieu Desnoyers
I think it is wrong to assume that reads from clock->cycle_raw and from
jiffies will be ordered correctly in SMP. I am tempted to think that
ordering memory writes to clock->cycle_raw vs jiffies is also needed in this
case (where clock->cycle_raw is updated, or where jiffies is updated).
We can fall in the same kind of issue if we read the HPET, which is
memory I/O based. It does not seems correct to assume that MMIO vs
http://lwn.net/Articles/198988/)
That, and the dreaded memory barrier thread that my head is still
spinning from.

Ok, let's take a close look at the code in question. I may be wrong, and if
so, great, we can fix it.

We have this in get_monotonic_cycles:

{
        cycle_t cycle_now, cycle_delta, cycle_monotonic, cycle_last;

        do {
                cycle_monotonic = clock->cycle_monotonic;
                cycle_last = clock->cycle_last;
                cycle_now = clocksource_read(clock);
                cycle_delta = (cycle_now - cycle_last) & clock->mask;
        } while (cycle_monotonic != clock->cycle_monotonic ||
                 cycle_last != clock->cycle_last);

        return cycle_monotonic + cycle_delta;
}

and this in clocksource.h

static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
        cycle_t offset = (now - cs->cycle_last) & cs->mask;

        cs->cycle_last = now;
        cs->cycle_accumulated += offset;
        cs->cycle_monotonic += offset;
}

now is usually just a clocksource_read() passed in.

The goal is to have clock_monotonic always return something that is
greater than what was read the last time.

Let's make a few assumptions now (for others to shoot them down). One
thing is that we don't need to worry too much about MMIO, because we are
doing a read. This means we need the data right now to continue. So the
read being a function call should keep gcc from moving stuff around, and
since we are doing an IO read, the order of events should be pretty much
synchronized in:

1. load cycle_last and cycle_monotonic (we don't care which order)*
2. read clock source
3. calculate delta and while() compare (order doesn't matter)

* we might care (see below)

If the above is incorrect, then we need to fix get_monotonic_cycles.

In clocksource_accumulate, we have:

offset = ((now = cs->read()) - cycle_last) & cs->mask;
cycle_last = now;
cycle_accumulate += offset;
cycle_monotonic += offset;

The order of events here is as follows. Using the same reasoning as above,
the read must happen first and complete, because for gcc it's a function
call, and for IO, it needs to return data.

1. cs->read
2. update cycle_last, cycle_accumulate, cycle_monotonic.

Can we assume, if the above for get_monotonic_cycles is correct, that
since we read and compare cycle_last and cycle_monotonic, neither of
them has changed over the read? If so, we have a snapshot of the
clocksource_accumulate update.

So the worst thing that I can think of is that cycle_monotonic is updated
*before* cycle_last:

cycle_monotonic += offset;
<get_monotonic_cycles run on other CPU>
cycle_last = now;


cycle_last = 5
cycle_monotonic = 0


CPU 0                                  CPU 1
----------                             -------------
cs->read() = 10
offset = 10 - 5 = 5
cycle_monotonic = 5
                                       cycle_monotonic = 5
                                       cycle_last = 5
                                       cs->read() = 11
                                       delta = 11 - 5 = 6
                                       cycle_monotonic and cycle_last still same
                                       return 5 + 6 = 11

cycle_last = 10

                                       cycle_monotonic = 5
                                       cycle_last = 10
                                       cs->read() = 12
                                       delta = 12 - 10 = 2
                                       cycle_monotonic and cycle_last still same
                                       return 5 + 2 = 7

**** ERROR *****

So, we *do* need memory barriers. Looks like cycle_last and
cycle_monotonic need to be synchronized.

OK, will this do?

cycle_t notrace get_monotonic_cycles(void)
{
        cycle_t cycle_now, cycle_delta, cycle_monotonic, cycle_last;

        do {
                cycle_monotonic = clock->cycle_monotonic;
                smp_rmb();
                cycle_last = clock->cycle_last;
                cycle_now = clocksource_read(clock);
                cycle_delta = (cycle_now - cycle_last) & clock->mask;
        } while (cycle_monotonic != clock->cycle_monotonic ||
                 cycle_last != clock->cycle_last);

        return cycle_monotonic + cycle_delta;
}

and this in clocksource.h

static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
        cycle_t offset = (now - cs->cycle_last) & cs->mask;

        cs->cycle_last = now;
        smp_wmb();
        cs->cycle_accumulated += offset;
        cs->cycle_monotonic += offset;
}

We may still get to a situation where cycle_monotonic is of the old value
and cycle_last is of the new value. That would give us a smaller delta
than we want.

Let's look at this with a slightly different situation.

cycle_last = 5
cycle_monotonic = 0


CPU 0                                  CPU 1
----------                             -------------
cs->read() = 10
offset = 10 - 5 = 5
cycle_last = 10
cycle_monotonic = 5

                                       cycle_monotonic = 5
                                       cycle_last = 10
                                       cs->read() = 12
                                       delta = 12 - 10 = 2
                                       cycle_monotonic and cycle_last still same
                                       return 5 + 2 = 7


cs->read() = 13
offset = 13 - 10 = 3
cycle_last = 13

                                       cycle_monotonic = 5
                                       cycle_last = 13
                                       cs->read() = 14
                                       delta = 14 - 13 = 1
                                       cycle_monotonic and cycle_last still same
                                       return 5 + 1 = 6

**** ERROR ****

Crap, looks like we do need stronger locking here :-(

Hmm, I might as well just use seq_locks, and make sure that tracing
does not hit them.
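
Something along these lines is what I have in mind (a rough sketch
only: I just use a file-scope seqlock for illustration instead of
folding it into the clock struct, and I gloss over irq-safety of the
write side and keeping the tracer itself out of the lock):

static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;

cycle_t notrace get_monotonic_cycles(void)
{
        cycle_t cycle_now, cycle_delta, cycle_monotonic, cycle_last;
        unsigned long seq;

        do {
                seq = read_seqbegin(&monotonic_lock);
                cycle_monotonic = clock->cycle_monotonic;
                cycle_last = clock->cycle_last;
                cycle_now = clocksource_read(clock);
                cycle_delta = (cycle_now - cycle_last) & clock->mask;
        } while (read_seqretry(&monotonic_lock, seq));

        return cycle_monotonic + cycle_delta;
}

static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
        cycle_t offset = (now - cs->cycle_last) & cs->mask;

        write_seqlock(&monotonic_lock);
        cs->cycle_last = now;
        cs->cycle_accumulated += offset;
        cs->cycle_monotonic += offset;
        write_sequnlock(&monotonic_lock);
}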

Thanks!

-- Steve

Mathieu Desnoyers
2008-01-16 15:00:20 UTC
Permalink
Post by Steven Rostedt
[ CC'd Daniel Walker, since he had problems with this code ]
Post by Mathieu Desnoyers
I agree with you that I don't see how the compiler could reorder this.
So we forget about compiler barriers. Also, the clock source used is a
synchronized clock source (get_cycles_sync on x86_64), so it should make
sure the TSC is read at the right moment.
However, what happens if the clock source is, say, the jiffies ?
static cycle_t jiffies_read(void)
{
return (cycle_t) jiffies;
}
Which is nothing more than a memory read of
extern unsigned long volatile __jiffy_data jiffies;
Yep, and that's not my concern.
Hrm, I will reply to the rest of this email in a separate mail, but
there is another concern, simpler than memory ordering, that just hit
me :

Suppose CPU A is calling clocksource_accumulate while CPU B is calling
get_monotonic_cycles, but events happen in the following order (because
of preemption or interrupts). Here, to make things worse, we would be on
x86 where a cycle_t write (64 bits) is not atomic:


CPU A                                  CPU B

clocksource read
update cycle_mono (1st 32 bits)
                                       read cycle_mono
                                       read cycle_last
                                       clocksource read
                                       read cycle_mono
                                       read cycle_last
update cycle_mono (2nd 32 bits)
update cycle_last
update cycle_acc

Therefore, we have :
- an inconsistent cycle_monotonic value
- inconsistent cycle_monotonic and cycle_last values.

Or is there something I have missed ?

If you really want a seqlock-free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bit lockless timestamp counter extension, which
could be turned into the clocksource updater).

Mathieu
Post by Steven Rostedt
Post by Mathieu Desnoyers
I think it is wrong to assume that reads from clock->cycle_raw and from
jiffies will be ordered correctly in SMP. I am tempted to think that
ordering memory writes to clock->cycle_raw vs jiffies is also needed in this
case (where clock->cycle_raw is updated, or where jiffies is updated).
We can fall in the same kind of issue if we read the HPET, which is
memory I/O based. It does not seems correct to assume that MMIO vs
http://lwn.net/Articles/198988/)
That and the dread memory barrier thread that my head is still spinning
on.
Ok, lets take a close look at the code in question. I may be wrong, and if
so, great, we can fix it.
{
cycle_t cycle_now, cycle_delta, cycle_monotonic, cycle_last;
do {
cycle_monotonic = clock->cycle_monotonic;
cycle_last = clock->cycle_last;
cycle_now = clocksource_read(clock);
cycle_delta = (cycle_now - cycle_last) & clock->mask;
} while (cycle_monotonic != clock->cycle_monotonic ||
cycle_last != clock->cycle_last);
return cycle_monotonic + cycle_delta;
}
and this in clocksource.h
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
cycle_t offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
cs->cycle_accumulated += offset;
cs->cycle_monotonic += offset;
}
now is usually just a clocksource_read() passed in.
The goal is to have clock_monotonic always return something that is
greater than what was read the last time.
Let's make a few assumptions now (for others to shoot them down). One
thing is that we don't need to worry too much about MMIO, because we are
doing a read. This means we need the data right now to contiune. So the
read being a function call should keep gcc from moving stuff around, and
since we are doing an IO read, the order of events should be pretty much
synchronized. in
1. load cycle_last and cycle_monotonic (we don't care which order)*
2. read clock source
3. calculate delta and while() compare (order doesn't matter)
* we might care (see below)
If the above is incorrect, then we need to fix get_monotonic_cycles.
offset = ((now = cs->read()) - cycle_last) & cs->mask;
cycle_last = now;
cycle_accumulate += offset;
cycle_monotonic += offset;
The order of events here are. Using the same reasoning as above, the read
must be first and completed because for gcc it's a function, and for IO,
it needs to return data.
1. cs->read
2. update cycle_last, cycle_accumulate, cycle_monotonic.
Can we assume, if the above for get_monotonic_cycles is correct, that
since we read and compare cycle_last and cycle_monotonic, that neither of
them have changed over the read? So we have a snapshot of the
clocksource_accumulate.
So the worst thing that I can think of, is that cycle_monotonic is update
cycle_monotonic += offest;
<get_monotonic_cycles run on other CPU>
cycle_last = now;
cycle_last = 5
cycle_monotonic = 0
CPU 0 CPU 1
---------- -------------
cs->read() = 10
offset = 10 - 5 = 5
cycle_monotonic = 5
cycle_monotonic = 5
cycle_last = 5
cs->read() = 11
delta = 11 - 5 = 6
cycle_monotonic and cycle_last still same
return 5 + 6 = 11
cycle_last = 10
cycle_monotonic = 5
cycle_last = 10
cs->read() = 12
delta = 12 - 10 = 2
cycle_monotonic and cycle_last still same
return 5 + 2 = 7
**** ERROR *****
So, we *do* need memory barriers. Looks like cycle_last and
cycle_monotonic need to be synchronized.
OK, will this do?
cycle_t notrace get_monotonic_cycles(void)
{
cycle_t cycle_now, cycle_delta, cycle_monotonic, cycle_last;
do {
cycle_monotonic = clock->cycle_monotonic;
smp_rmb();
cycle_last = clock->cycle_last;
cycle_now = clocksource_read(clock);
cycle_delta = (cycle_now - cycle_last) & clock->mask;
} while (cycle_monotonic != clock->cycle_monotonic ||
cycle_last != clock->cycle_last);
return cycle_monotonic + cycle_delta;
}
and this in clocksource.h
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
cycle_t offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
smp_wmb();
cs->cycle_accumulated += offset;
cs->cycle_monotonic += offset;
}
We may still get to a situation where cycle_monotonic is of the old value
and cycle_last is of the new value. That would give us a smaller delta
than we want.
Lets look at this, with a slightly different situation.
cycle_last = 5
cycle_monotonic = 0
CPU 0 CPU 1
---------- -------------
cs->read() = 10
offset = 10 - 5 = 5
cycle_last = 10
cycle_monotonic = 5
cycle_monotonic = 5
cycle_last = 10
cs->read() = 12
delta = 12 - 10 = 2
cycle_monotonic and cycle_last still same
return 5 + 2 = 7
cs->read() = 13
offset = 13 - 10 = 2
cycle_last = 13
cycle_monotonic = 5
cycle_last = 13
cs->read() = 14
delta = 14 - 13 = 1
cycle_monotonic and cycle_last still same
return 5 + 1 = 6
**** ERROR ****
Crap, looks like we do need a stronger locking here :-(
Hmm, I might as well just use seq_locks, and make sure that tracing
does not hit them.
Thanks!
-- Steve
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-16 15:10:10 UTC
Permalink
Post by Mathieu Desnoyers
Hrm, I will reply to the rest of this email in a separate mail, but
there is another concern, simpler than memory ordering, that just hit
If we have CPU A calling clocksource_accumulate while CPU B is calling
get_monotonic_cycles, but events happens in the following order (because
of preemption or interrupts). Here, to make things worse, we would be on
CPU A CPU B
clocksource read
update cycle_mono (1st 32 bits)
read cycle_mono
read cycle_last
clocksource read
read cycle_mono
read cycle_last
update cycle_mono (2nd 32 bits)
update cycle_last
update cycle_acc
- an inconsistant cycle_monotonic value
- inconsistant cycle_monotonic and cycle_last values.
Or is there something I have missed ?
No, there's probably issues there too, but no need to worry about it,
since I already showed that allowing for clocksource_accumulate to happen
inside the get_monotonic_cycles loop is already flawed.
Post by Mathieu Desnoyers
If you really want an seqlock free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
I know you pointed me to the code, but let's assume that I'm still ignorant
;-)

Do you actually use the RCU internals? Or do you just reimplement an RCU
algorithm?

-- Steve
Mathieu Desnoyers
2008-01-16 15:30:17 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
Hrm, I will reply to the rest of this email in a separate mail, but
there is another concern, simpler than memory ordering, that just hit
If we have CPU A calling clocksource_accumulate while CPU B is calling
get_monotonic_cycles, but events happens in the following order (because
of preemption or interrupts). Here, to make things worse, we would be on
CPU A CPU B
clocksource read
update cycle_mono (1st 32 bits)
read cycle_mono
read cycle_last
clocksource read
read cycle_mono
read cycle_last
update cycle_mono (2nd 32 bits)
update cycle_last
update cycle_acc
- an inconsistant cycle_monotonic value
- inconsistant cycle_monotonic and cycle_last values.
Or is there something I have missed ?
No, there's probably issues there too, but no need to worry about it,
since I already showed that allowing for clocksource_accumulate to happen
inside the get_monotonic_cycles loop is already flawed.
Yep, I just re-read through your previous email, and totally agree that
the algorithm is flawed in the way you pointed out.
Post by Steven Rostedt
Post by Mathieu Desnoyers
If you really want an seqlock free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
I know you pointed me the code, but lets assume that I'm still ignorant
;-)
do you actually use the RCU internals? or do you just reimplement an RCU
algorithm?
Nope, I don't use RCU internals in this code. Preempt disable seemed
like the best way to handle this utterly short code path and I wanted
the write side to be fast enough to be called periodically. What I do is:

- Disable preemption at the read-side :
it makes sure the pointer I get will point to a data structure that
will never change while I am in the preempt disabled code. (see *)
- I use per-cpu data to allow the read-side to be as fast as possible
(only need to disable preemption, does not race against other CPUs and
won't generate cache line bouncing). It also allows dealing with
unsynchronized TSCs if needed.
- Periodical write side : it's called from an IPI running on each CPU.

(*) We expect the read-side (preempt-off region) to be shorter than
the interval between IPI updates so we can guarantee the data structure
it uses won't be modified underneath it. Since the IPI update is
launched every second or so (it depends on the frequency of the counter
we are trying to extend), it's more than ok.
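
Roughly, a stripped-down sketch of the idea (this is not my actual
LTTng code; read_hw_counter32() stands in for the 32-bit clock read and
all the names here are made up):

struct cycles_64 {
        u32 msb;        /* software-extended high 32 bits */
        u32 last_lsb;   /* low 32 bits at the last IPI update */
};

/* two banks, so the updater never touches the bank a reader is using */
static DEFINE_PER_CPU(struct cycles_64, cycles_bank[2]);
static DEFINE_PER_CPU(unsigned int, cycles_idx);

/* runs from the periodic IPI, on each CPU */
static void cycles_64_update(void *unused)
{
        unsigned int idx = !__get_cpu_var(cycles_idx);
        struct cycles_64 *next = &__get_cpu_var(cycles_bank)[idx];
        struct cycles_64 *prev = &__get_cpu_var(cycles_bank)[!idx];
        u32 lsb = read_hw_counter32();

        /* carry into the high word if the counter wrapped since last time */
        next->msb = prev->msb + (lsb < prev->last_lsb);
        next->last_lsb = lsb;
        barrier();      /* publish the new bank before flipping the index */
        __get_cpu_var(cycles_idx) = idx;
}

static u64 notrace cycles_64_read(void)
{
        struct cycles_64 *cur;
        u32 lsb;
        u64 ret;

        preempt_disable();
        cur = &__get_cpu_var(cycles_bank)[__get_cpu_var(cycles_idx)];
        lsb = read_hw_counter32();
        /* account for a wrap that happened after the last update */
        ret = ((u64)(cur->msb + (lsb < cur->last_lsb)) << 32) | lsb;
        preempt_enable();
        return ret;
}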

Mathieu
Post by Steven Rostedt
-- Steve
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-16 16:00:31 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
No, there's probably issues there too, but no need to worry about it,
since I already showed that allowing for clocksource_accumulate to happen
inside the get_monotonic_cycles loop is already flawed.
Yep, I just re-read through your previous email, and totally agree that
the algorithm is flawed in the way you pointed out.
Yeah, but if we replace the loop with a seqlock, then it would work,
albeit with more cacheline bouncing (caused by writes) (maybe not, see below).
Post by Mathieu Desnoyers
Post by Steven Rostedt
do you actually use the RCU internals? or do you just reimplement an RCU
algorithm?
Nope, I don't use RCU internals in this code. Preempt disable seemed
like the best way to handle this utterly short code path and I wanted
Grmble. Then how do you trace preempt_disable? My tracer does that
(see the last patch in the series).
Post by Mathieu Desnoyers
it makes sure the pointer I get will point to a data structure that
will never change while I am in the preempt disabled code. (see *)
- I use per-cpu data to allow the read-side to be as fast as possible
(only need to disable preemption, does not race against other CPUs and
won't generate cache line bouncing). It also allows dealing with
unsynchronized TSCs if needed.
- Periodical write side : it's called from an IPI running on each CPU.
(*) We expect the read-side (preempt off region) to last shorter than
the interval between IPI updates so we can guarantee the data structure
it uses won't be modified underneath it. Since the IPI update is
launched each seconds or so (depends on the frequency of the counter we
are trying to extend), it's more than ok.
One thing I want to clear up. The major difference between this
latency_tracer and LTTng is what we consider fast paths. The latency
tracer is recording things like enabling and disabling interrupts, preempt
count changes, or simply profiling all function calls. Those are what I
consider fast paths. The slow paths WRT the latency_tracer are things like
context switches. This is why I don't have a problem with copying the
comm at context switch time. Because that _is_ a slow path for the latency
tracer.

Placing a read_seqlock in get_monotonic_cycles would not be that bad,
since the only slowdown would be the rmb. read_seqlocks don't modify
global data. Only the write_seqlock does. So the cache line bouncing would
only happen on updates in clocksource_accumulate. But then after the
caches are all balanced again, the reads will continue fine.

Question: Is a cache-miss a greater cost than a read to a clocksource
(besides the TSC)?

Also note how I arrange these variables in the clock struct:

struct {
        cycle_t cycle_last, cycle_accumulated, cycle_monotonic;
        cycle_t cycle_interval;
} ____cacheline_aligned_in_smp;

I could do the following:

struct {
        seqlock_t cycle_lock;
        cycle_t cycle_last, cycle_accumulated, cycle_monotonic;
        cycle_t cycle_interval;
} ____cacheline_aligned_in_smp;

Which would help to keep all these in the same cache line. These are all
updated at the same time, and hopefully this will keep the cache line
bouncing limited to a single cacheline.

-- Steve

Mathieu Desnoyers
2008-01-16 17:10:07 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
No, there's probably issues there too, but no need to worry about it,
since I already showed that allowing for clocksource_accumulate to happen
inside the get_monotonic_cycles loop is already flawed.
Yep, I just re-read through your previous email, and totally agree that
the algorithm is flawed in the way you pointed out.
Yeah, but if we replace the loop with a seq lock, then it would work.
albeit, more cacheline bouncing (caused by writes). (maybe not, see below)
Yes, but then you would trigger a deadlock if you instrument code called
from NMI, SMI, MCE contexts :(

grep -ri NMI drivers/* arch/* |grep -vi PNMI

is quite interesting: actually, it shows that a few spots need to handle
those "so special interrupts": watchdogs, oprofile, virtualization and
much more in architecture-specific code.

I just would not like to add a tracer in the kernel that is _so_
intrusive that module writers and architecture maintainers would have to
audit their code and think about the tracer for each implementation that
would deal with these kinds of interrupts.
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
do you actually use the RCU internals? or do you just reimplement an RCU
algorithm?
Nope, I don't use RCU internals in this code. Preempt disable seemed
like the best way to handle this utterly short code path and I wanted
grmble. Then how do you trace preempt_disable? As my tracer does that
(see the last patch in the series).
I think using a kind of preempt_disable_notrace() would make sense here.
I mean.. even if you use the seqlock, eventually, you will want to trace
the seqlock behavior. Then you have to find the lightest way to do some
sort of synchronization that will have a predictable execution. seqlock
has the following disadvantage: if the seqlock read has to wait for the
write seqlock to end, we add some time to the execution of the
code we are trying to profile, which will mix up the results. On the
other hand, if the read-side executes in a constant number of cycles
which does not depend on the write-side activity, then we get a clearer
picture of where the time should be accounted. We can even create a
module that will figure out how many nanoseconds are spent reading
the clock so we can subtract this time from our analysis if required.

That's why, having to choose between a read seqlock and preemption
disabling for the read-side, I would strongly prefer the preemption
disabling (constant execution time, and it's deadlock-free).
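
For reference, what I mean by preempt_disable_notrace() is nothing more
than untraced variants of the existing macros, along these lines
(hypothetical, they do not exist in this tree; note the enable side
skips the resched check, which is only acceptable for very short
sections):

/* same counter math as preempt_disable()/enable(), but without
 * calling the (traced) add/sub_preempt_count helpers */
#define preempt_disable_notrace()               \
do {                                            \
        preempt_count() += 1;                   \
        barrier();                              \
} while (0)

#define preempt_enable_notrace()                \
do {                                            \
        barrier();                              \
        preempt_count() -= 1;                   \
} while (0)
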
Post by Steven Rostedt
Post by Mathieu Desnoyers
it makes sure the pointer I get will point to a data structure that
will never change while I am in the preempt disabled code. (see *)
- I use per-cpu data to allow the read-side to be as fast as possible
(only need to disable preemption, does not race against other CPUs and
won't generate cache line bouncing). It also allows dealing with
unsynchronized TSCs if needed.
- Periodical write side : it's called from an IPI running on each CPU.
(*) We expect the read-side (preempt off region) to last shorter than
the interval between IPI updates so we can guarantee the data structure
it uses won't be modified underneath it. Since the IPI update is
launched each seconds or so (depends on the frequency of the counter we
are trying to extend), it's more than ok.
One thing I want to clear up. The major difference between this
latency_tracer and LTTng is what we consider fast paths. The latency
tracer is recording things like enabling and disabling interrupts, preempt
count changes, or simply profiling all function calls. Those are what I
consider fast paths. The slow path WRT the latency_tracer are things like
context switches. This is why I don't have a problem with copying the
comm at context switch time. Because that _is_ a slow path for the latency
tracer.
LTTng hooks into the lockdep tracer to trace irq on/off, spinlocks, etc.
In flight recorder mode, we have nothing to write to disk and therefore
we can handle very frequent events. We then do the analysis off-line
using the last MB written in the buffers. The advantage is that the
kernel dumbly writes data to a buffer: we therefore move the complexity
to user-space.

I agree that some kind of tracing, like the one you are doing, might be
done more efficiently if you do a first clever analysis phase directly
in the kernel without writing the raw high event rate data in memory
buffers. However, I believe that it would be more powerful if we combine
the two approaches rather than trying to do everything in or out of the
kernel. LTTng could provide the comm names, priorities, etc, and your
tracer could provide the top X list of processes that had a bad
behavior. It would mean that the complete overall information would be
made available after a post-processing phase done in an analysis tool
like LTTV, but I don't see any problem with it.
Post by Steven Rostedt
Placing a read_seqlock in get_monotonic_cycles would not be that bad,
since the only slow down would be the rmb. read_seqlocks don't modify
global data. Only the write_seqlock does. So the cache line bouncing would
only happen on updates in clocksource_accumulate. But then after the
caches are all balanced again, the reads will continue fine.
Yep, cache-line bouncing for rare updates in not much of an issue.
Post by Steven Rostedt
Question: Is a cache-miss a greater cost than a read to a clocksource
(besides the TSC)?
If HPET reads are as slow as I expect, then no. Even then, a
synchronized TSC read will take about 100 cycles. If we have to hit main
memory, some tests I have done on a P4 showed that it could take about
600 cycles. However, cacheline bouncing, in my understanding, has more
effects than merely burning cycles: the wasted memory I/O becomes
increasingly bad as we increase the number of CPUs.
Post by Steven Rostedt
struct {
cycle_t cycle_last, cycle_accumulated, cycle_monotonic;
cycle_t cycle_interval;
} ____cacheline_aligned_in_smp;
struct {
seqlock_t cycle_lock;
cycle_t cycle_last, cycle_accumulated, cycle_monotonic;
cycle_t cycle_interval;
} ____cacheline_aligned_in_smp;
Which would help to keep all these in the same cache line. These are all
updated at the same time, and hopefully this will keep the cache line
bouncing limited to a single cacheline.
And if the cache line only bounces when the write seqlock is taken, it's
not really an issue. I am more concerned about deadlocks ;)

Mathieu
Post by Steven Rostedt
-- Steve
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Mathieu Desnoyers
2008-01-16 17:50:30 UTC
Permalink
....
Post by Mathieu Desnoyers
Post by Steven Rostedt
One thing I want to clear up. The major difference between this
latency_tracer and LTTng is what we consider fast paths. The latency
tracer is recording things like enabling and disabling interrupts, preempt
count changes, or simply profiling all function calls. Those are what I
consider fast paths. The slow path WRT the latency_tracer are things like
context switches. This is why I don't have a problem with copying the
comm at context switch time. Because that _is_ a slow path for the latency
tracer.
LTTng hooks in the lockdep tracer to trace irq on/off, spinlocks, etc..
in flight recorder mode, we have nothing to write to disk and therefore
we can handle very frequent events. We then do the analysis off-line
using the last MB written in the buffers. The advantage is that the
kernel dumbly writes data to a buffer : we therefore move the complexity
to user-space.
I agree that some kind of tracing, like the one you are doing, might be
done more efficiently if you do a first clever analysis phase directly
in the kernel without writing the raw high event rate data in memory
buffers. However, I believe that it would be more powerful if we combine
the two approaches rather than trying to do everything in or out of the
kernel. LTTng could provide the comm names, priorities, etc, and your
tracer could provide the top X list of processes that had a bad
behavior. It would mean that the complete overall information would be
made available after a post-processing phase done in an analysis tool
like LTTV, but I don't see any problem with it.
Just to expand a bit on the design-side of my proposal :

Your module would create "profiles" based on the hooks called. If we
take the interrupt on/off for example, it would be called by lockdep and
could keep a table of the top X instructions that disable interrupts
for a long time. (It's just an example; you might want to save the pid
instead...)

Then, whenever a "profile dump" is triggered, you would simply have to
send the current state of your profile to lttng with something like :


struct irq_latency_table {
        void *ip;
        cycles_t time;
};

/*
 * Make sure only one profile at a time is written to the trace for the
 * whole system.
 */
static DEFINE_MUTEX(latency_profile_mutex);
static struct irq_latency_table latency_table[NR_ENTRIES];

void irq_latency_dump_profile(void)
{
        int i;
        char namebuf[KSYM_NAME_LEN];

        mutex_lock(&latency_profile_mutex);
        trace_mark(irq_latency_dump_begin, MARK_NOARGS);
        for (i = 0; i < NR_ENTRIES; i++) {
                sprint_symbol(namebuf, (unsigned long)latency_table[i].ip);
                trace_mark(irq_latency_entry, "ip %p symbol %s time %llu",
                           latency_table[i].ip, namebuf,
                           (unsigned long long)latency_table[i].time);
        }
        trace_mark(irq_latency_dump_end, MARK_NOARGS);
        mutex_unlock(&latency_profile_mutex);
}

You can then create an LTTV module that will format your nice output each
time a profile dump is encountered.

By doing this, your specialized profile generator would only have to
hook into the irq on/off events to gather the information it needs,
nothing more. I think that would trim the code size and the complexity
of your module by an interesting factor.
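
For concreteness, here is a minimal sketch of what that profile generator
side could look like (the hook names and the per-cpu bookkeeping are
invented here, and the table update is not SMP-safe as written) :

static DEFINE_PER_CPU(cycles_t, irqsoff_start);
static DEFINE_PER_CPU(void *, irqsoff_ip);

/* called when interrupts get disabled, e.g. from the lockdep hook */
static void notrace profile_hardirqs_off(void *ip)
{
        __get_cpu_var(irqsoff_start) = get_cycles();
        __get_cpu_var(irqsoff_ip) = ip;
}

/* called when interrupts get re-enabled : keep the worst offenders */
static void notrace profile_hardirqs_on(void)
{
        cycles_t delta = get_cycles() - __get_cpu_var(irqsoff_start);
        int i, min = 0;

        for (i = 1; i < NR_ENTRIES; i++)
                if (latency_table[i].time < latency_table[min].time)
                        min = i;
        if (delta > latency_table[min].time) {
                latency_table[min].time = delta;
                latency_table[min].ip = __get_cpu_var(irqsoff_ip);
        }
}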

Note that I could optimize the way I currently deal with symbols by not
having to dump them in the trace, but since it's only for low rate
events, this optimization has a low priority on my todo list.

Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-16 19:50:18 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
Yeah, but if we replace the loop with a seq lock, then it would work.
albeit, more cacheline bouncing (caused by writes). (maybe not, see below)
Yes, but then you would trigger a deadlock if you instrument code called
from NMI, SMI, MCE contexts :(
grep -ri NMI drivers/* arch/* |grep -vi PNMI
is quite interesting : actually, it shows that a few spots need to handle
those "so special interrupts" : watchdogs, oprofile, virtualization and
much more in architecture specific code.
I just would not like to add a tracer in the kernel that is _so_
intrusive that module writers and architecture maintainers would have to
audit their code and think about the tracer for each implementation that
would deal with these kind of interrupts.
I don't want driver writers to worry about that either.
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
do you actually use the RCU internals? or do you just reimplement an RCU
algorithm?
Nope, I don't use RCU internals in this code. Preempt disable seemed
like the best way to handle this utterly short code path and I wanted
grmble. Then how do you trace preempt_disable? As my tracer does that
(see the last patch in the series).
I think using a kind of preempt_disable_notrace() would make sense here.
Actually after hitting the send button, I thought the same.
Post by Mathieu Desnoyers
I mean.. even if you use the seqlock, eventually, you will want to trace
the seqlock behavior. Then you have to find the lightest way to do some
sort of synchronization that will have a predictable execution. seqlock
has the following disadvantage : if the seqlock read has to wait for the
write seqlock to end, we add up some time to the execution of the
code we are trying to profile, which will mix up the results. On the
other hand, if the read-side executes in a constant amount of cycles
which does not depend on the write-side activity, then we get a clearer
picture of what the time should be accounted for. We can even create a
module that will figure out how many nanoseconds are spent for reading
the clock so we can subtract this time from our analysis if required.
That's why having to choose between read seqlock and preemption disable
for the read-side, I would strongly prefer the preemption disable.
(constant execution time and it's deadlock-free)
You may convince me yet ;-)
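
(For reference, here is roughly what the two read-sides being compared look
like. This is only a sketch with invented names and an invented per-cpu
snapshot structure, not code from either tracer.)

/* seqlock read-side : may loop if it races with the writer */
static DEFINE_SEQLOCK(clock_seqlock);
static cycle_t cycle_base, cycle_base_last;

static cycle_t read_cycles_seqlock(struct clocksource *cs)
{
        unsigned seq;
        cycle_t ret;

        do {
                seq = read_seqbegin(&clock_seqlock);
                ret = cycle_base +
                      ((clocksource_read(cs) - cycle_base_last) & cs->mask);
        } while (read_seqretry(&clock_seqlock, seq));
        return ret;
}

/* preempt-disable read-side : constant time, no retry loop to wait on */
struct time_snapshot {
        cycle_t base, last;     /* refreshed by the periodic per-CPU IPI */
};
static DEFINE_PER_CPU(struct time_snapshot, time_snapshot);

static cycle_t read_cycles_preempt(struct clocksource *cs)
{
        struct time_snapshot *snap;
        cycle_t ret;

        preempt_disable();
        snap = &__get_cpu_var(time_snapshot);
        ret = snap->base + ((clocksource_read(cs) - snap->last) & cs->mask);
        preempt_enable();
        return ret;
}
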
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Mathieu Desnoyers
it makes sure the pointer I get will point to a data structure that
will never change while I am in the preempt disabled code. (see *)
- I use per-cpu data to allow the read-side to be as fast as possible
(only need to disable preemption, does not race against other CPUs and
won't generate cache line bouncing). It also allows dealing with
unsynchronized TSCs if needed.
- Periodical write side : it's called from an IPI running on each CPU.
(*) We expect the read-side (preempt off region) to last shorter than
the interval between IPI updates so we can guarantee the data structure
it uses won't be modified underneath it. Since the IPI update is
launched each seconds or so (depends on the frequency of the counter we
are trying to extend), it's more than ok.
One thing I want to clear up. The major difference between this
latency_tracer and LTTng is what we consider fast paths. The latency
tracer is recording things like enabling and disabling interrupts, preempt
count changes, or simply profiling all function calls. Those are what I
consider fast paths. The slow path WRT the latency_tracer are things like
context switches. This is why I don't have a problem with copying the
comm at context switch time. Because that _is_ a slow path for the latency
tracer.
LTTng hooks in the lockdep tracer to trace irq on/off, spinlocks, etc..
in flight recorder mode, we have nothing to write to disk and therefore
we can handle very frequent events. We then do the analysis off-line
using the last MB written in the buffers. The advantage is that the
kernel dumbly writes data to a buffer : we therefore move the complexity
to user-space.
But you would still need to do something in case you want this information
dumped to console on a kernel crash. Of course you can rely on kexec, but
if the kexec fails (which is possible) then you lose all the information.
Having the ability to dump the output to console on a crash is one of the
benefits of latency_tracer that I want to keep.
Post by Mathieu Desnoyers
I agree that some kind of tracing, like the one you are doing, might be
done more efficiently if you do a first clever analysis phase directly
in the kernel without writing the raw high event rate data in memory
buffers. However, I believe that it would be more powerful if we combine
the two approaches rather than trying to do everything in or out of the
kernel. LTTng could provide the comm names, priorities, etc, and your
tracer could provide the top X list of processes that had a bad
behavior. It would mean that the complete overall information would be
made available after a post-processing phase done in an analysis tool
like LTTV, but I don't see any problem with it.
Of course you don't see any problem with it, because you know LTTV and
LTTng very well ;-)

latency_tracer has been very instrumental in solving -rt patch latencies by
telling the customer to run with latency trace on, and then having them
simply set a few sysctl variables and run their app. By combining this
with LTTng, I wouldn't know how to start with telling a customer how to
analyze the problem.

Simply put, latency_tracer has a much smaller learning curve than LTTng.
Not to mention, a smaller footprint. The tracer here is very focused on
what to do, and is not meant to be a general profiling tool as LTTng is.

In-other-words, latency_tracer is LTTng-lite ;-)
Post by Mathieu Desnoyers
Post by Steven Rostedt
Placing a read_seqlock in get_monotonic_cycles would not be that bad,
since the only slow down would be the rmb. read_seqlocks don't modify
global data. Only the write_seqlock does. So the cache line bouncing would
only happen on updates in clocksource_accumulate. But then after the
caches are all balanced again, the reads will continue fine.
Yep, cache-line bouncing for rare updates in not much of an issue.
Post by Steven Rostedt
Question: Is a cache-miss a greater cost than a read to a clocksource
(besides the TSC)?
If HPET reads are as slow as I expect, then no. Even then, a
synchronized TSC read will take about 100 cycles. If we have to hit main
memory, some tests I have done on a P4 showed that it could take about
600 cycles. However, cacheline bouncing, in my understanding, has more
effects that barely burning cycles : wasting memory I/O, when we
increase the number of CPUs, becomes increasingly bad.
I haven't seen too much of an effect on a 64 CPU box. For the rare update
that is. But I'm sure it will get much worse when we run a 1024 CPU box.
Post by Mathieu Desnoyers
Post by Steven Rostedt
struct {
cycle_t cycle_last, cycle_accumulated, cycle_monotonic;
cycle_t cycle_interval;
} ____cacheline_aligned_in_smp;
struct {
seqlock_t cycle_lock;
cycle_t cycle_last, cycle_accumulated, cycle_monotonic;
cycle_t cycle_interval;
} ____cacheline_aligned_in_smp;
Which would help to keep all these in the same cache line. These are all
updated at the same time, and hopefully this will keep the cache line
bouncing limited to a single cacheline.
And if the cache line only bounces when the write seqlock is taken, it's
not really an issue. I am more concerned about deadlocks ;)
The write_seqlock is updated at the same time as the other variables are
written (all on the same cacheline, if the cache line is big enough).

But I do share you concern with deadlocks.

-- Steve
Mathieu Desnoyers
2008-01-16 20:30:12 UTC
Permalink
* Steven Rostedt (***@goodmis.org) wrote:
...
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Mathieu Desnoyers
it makes sure the pointer I get will point to a data structure that
will never change while I am in the preempt disabled code. (see *)
- I use per-cpu data to allow the read-side to be as fast as possible
(only need to disable preemption, does not race against other CPUs and
won't generate cache line bouncing). It also allows dealing with
unsynchronized TSCs if needed.
- Periodical write side : it's called from an IPI running on each CPU.
(*) We expect the read-side (preempt off region) to last shorter than
the interval between IPI updates so we can guarantee the data structure
it uses won't be modified underneath it. Since the IPI update is
launched each seconds or so (depends on the frequency of the counter we
are trying to extend), it's more than ok.
One thing I want to clear up. The major difference between this
latency_tracer and LTTng is what we consider fast paths. The latency
tracer is recording things like enabling and disabling interrupts, preempt
count changes, or simply profiling all function calls. Those are what I
consider fast paths. The slow path WRT the latency_tracer are things like
context switches. This is why I don't have a problem with copying the
comm at context switch time. Because that _is_ a slow path for the latency
tracer.
LTTng hooks in the lockdep tracer to trace irq on/off, spinlocks, etc..
in flight recorder mode, we have nothing to write to disk and therefore
we can handle very frequent events. We then do the analysis off-line
using the last MB written in the buffers. The advantage is that the
kernel dumbly writes data to a buffer : we therefore move the complexity
to user-space.
But you would still need to do something in case you want this information
dumped to console on a kernel crash. Of course you can rely on kexec, but
if the kexec fails (which is possible) then you lose all the information.
Having the ability to dump the output to console on a crash is one of the
benefits of latency_tracer that I want to keep.
There has been some integration done between LTTng and the "crash" tool
to extract the buffers from a crashed kernel. I am not an expert in
crash buffer extraction, but I guess all the available mechanisms
depend on kexec and could show the limits you are referring to.

If you really want to pretty-print the information to the console, I
would propose that you leave that part of the problem to a different
output module. The core of the latency tracer could keep the minimum
information. Then, when a dump is triggered, it either sends the
information to LTTng or to your console pretty-printer.

However, I would not call the pretty-printer a "tracer" module per-se.
We would have to accept that it is a bit more tied to the kernel
internals than the latency tracer. My goal is to separate the core
"profiling" module from the optional "pretty-printing" module as much as
possible so the latency tracer core could be reused by other output
modules.
Post by Steven Rostedt
Post by Mathieu Desnoyers
I agree that some kind of tracing, like the one you are doing, might be
done more efficiently if you do a first clever analysis phase directly
in the kernel without writing the raw high event rate data in memory
buffers. However, I believe that it would be more powerful if we combine
the two approaches rather than trying to do everything in or out of the
kernel. LTTng could provide the comm names, priorities, etc, and your
tracer could provide the top X list of processes that had a bad
behavior. It would mean that the complete overall information would be
made available after a post-processing phase done in an analysis tool
like LTTV, but I don't see any problem with it.
Of course you don't see any problem with it, because you know LTTV and
LTTng very well ;-)
latency_tracer has been very detrimental in solving -rt patch latencies by
telling the customer to run with latency trace on, and then having them
simply set a few sysctl variables and run their app. By combining this
with LTTng, I wouldn't know how to start with telling a customer how to
analyze the problem.
Simply put, latency_tracer has a much smaller learning curve than LTTng.
Not to mention, a smaller footprint. The tracer here is very focused on
what to do, and is not meant to be a general profiling tool as LTTng is.
In-other-words, latency_tracer is LTTng-lite ;-)
If LTTng is already ported to your specific kernel, the learning-curve
is not big at all. Here is what the latency_tracer over LTTng guide
could look like :

Well, once you have LTTng in your kernel and have compiled and installed
the ltt-control and lttv packages (configure, make, make install), all
that would be needed is :

(there may be some bits in the QUICKSTART GUIDE on
http://ltt.polymtl.ca, like adding the debugfs mount to fstab and making sure
the LTTng modules are loaded)

#arm all the markers
ltt-armall
#start lttng tracing
lttctl -n test -t /tmp/trace1 -d -l /mnt/debugfs/ltt

-> start latency tracer
-> stop latency tracer
-> trigger latency tracer dump

While the tracing is active, trigger the condition...

(rinse, repeat; it can handle multiple latency tracer dumps)

#stop lttng tracing
lttctl -n test -R
#disarm all markers
ltt-disarmall

You can easily test the trace with :
lttv -m textDump -t /tmp/trace1

Your users would issue something like :

lttv -m latencytracerDump -t /tmp/trace1

that's it. LatencytracerDump would be a new specialized plugin, inspired
by the generic textDump.c plugin and by the state.c module (for
hooking on specific events rather than on _all_ events). It would
generate a text output from the trace collected at each latency tracer
dump.

Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Tim Bird
2008-01-16 20:50:10 UTC
Permalink
Post by Mathieu Desnoyers
If LTTng is already ported to your specific kernel, the learning-curve
is not big at all. Here is what the latency_tracer over LTTng guide
Well, once you have LTTng in your kernel and have compiled and installed
the ltt-control and lttv packages (configure, make, make install), all
(there may be some bits in the QUICKSTART GUIDE on
http://ltt.polymtl.ca, like adding the debugfs mount to fstab and make sure
the LTTng modules are loaded)
#arm all the markers
ltt-armall
#start lttng tracing
lttctl -n test -t /tmp/trace1 -d -l /mnt/debugfs/ltt
-> start latency tracer
-> stop latency tracer
-> trigger latency tracer dump
While the tracing is active, trigger the condition...
(rince, repeat, can handle multiple latency tracer dumps)
#stop lttng tracing
lttctl -n test -R
#disarm all markers
ltt-disarmall
lttv -m textDump -t /tmp/trace1
lttv -m latencytracerDump -t /tmp/trace1
No offense, but this is still quite a bit harder than:

echo 1 >/proc/something
... wait a bit ...
cat /proc/something

(substitute /sys or /debugfs where appropriate)

Having to compile something (besides the kernel) for the target
is sometimes a major hassle. I avoided it completely with KFT.
-- Tim

=============================
Tim Bird
Architecture Group Chair, CE Linux Forum
Senior Staff Engineer, Sony Corporation of America
=============================

Steven Rostedt
2008-01-16 20:50:15 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
In-other-words, latency_tracer is LTTng-lite ;-)
If LTTng is already ported to your specific kernel, the learning-curve
is not big at all. Here is what the latency_tracer over LTTng guide
Well, once you have LTTng in your kernel and have compiled and installed
the ltt-control and lttv packages (configure, make, make install), all
(there may be some bits in the QUICKSTART GUIDE on
http://ltt.polymtl.ca, like adding the debugfs mount to fstab and make sure
the LTTng modules are loaded)
#arm all the markers
ltt-armall
#start lttng tracing
lttctl -n test -t /tmp/trace1 -d -l /mnt/debugfs/ltt
-> start latency tracer
-> stop latency tracer
-> trigger latency tracer dump
While the tracing is active, trigger the condition...
(rince, repeat, can handle multiple latency tracer dumps)
#stop lttng tracing
lttctl -n test -R
#disarm all markers
ltt-disarmall
lttv -m textDump -t /tmp/trace1
lttv -m latencytracerDump -t /tmp/trace1
that's it. LatencytracerDump would be a new specialized plugin, inspired
from the generic textDump.c plugin and from the state.c module (for
hooking on specific events rather that on _all_ events). It would
generate a text output from the trace collected at each latency tracer
dump.
Mathieu,

That's quite a bit. Currently we have tools that already hook into
latency_tracer, e.g. Thomas Gleixner's cyclictest. It turns on the latency
tracer, runs a quick test, and turns it off. If the latency is ok, the trace is
discarded and it runs the test again. When the latency is not acceptable,
it stops the test. Then all one needs to do is look at the given trace.

Hooking something as simple as this to LTTng is not going to fly.

Don't get me wrong, I'm a huge supporter of LTTng, and even recommend it.
But there are things for which using LTTng is like using a sledgehammer on
a tack. I like to use the tools that are best for the job (which also
means easiest). I don't buy the one-tracer-fits-all mentality. And by
pushing that too much, I'm thinking we'll never get a tracer into the
kernel.

My goal with these patches is to get the infrastructure for tracing
into the kernel. The latency_tracer is to tracing what Rusty Russell's lguest
is to pvops. LTTng is the Xen equivalent: latency_tracer is nothing more
than the quick and dirty tracer. For administrators that want to analyze
their systems, LTTng is much more appropriate. latency_tracer is more of
the first aid in finding latency troubles, and if that doesn't work, then
we can do the LTTng surgery.

Let's go back and focus on the infrastructure again, namely the mcount and
notrace parts of the kernel, as well as the tracer timer. By the
way, we are looking more into what you have done ;-)

-- Steve

Steven Rostedt
2008-01-17 20:10:24 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
One thing I want to clear up. The major difference between this
latency_tracer and LTTng is what we consider fast paths. The latency
tracer is recording things like enabling and disabling interrupts, preempt
count changes, or simply profiling all function calls. Those are what I
consider fast paths. The slow path WRT the latency_tracer are things like
context switches. This is why I don't have a problem with copying the
comm at context switch time. Because that _is_ a slow path for the latency
tracer.
LTTng hooks in the lockdep tracer to trace irq on/off, spinlocks, etc..
in flight recorder mode, we have nothing to write to disk and therefore
we can handle very frequent events. We then do the analysis off-line
using the last MB written in the buffers. The advantage is that the
kernel dumbly writes data to a buffer : we therefore move the complexity
to user-space.
But you would still need to do something in case you want this information
dumped to console on a kernel crash. Of course you can rely on kexec, but
if the kexec fails (which is possible) then you lose all the information.
Having the ability to dump the output to console on a crash is one of the
benefits of latency_tracer that I want to keep.
There has been some integration done between LTTng and the "crash" tool
to extract the buffers from a crashed kernel. I am not an expert in
crash buffer extraction though, but I guess all the available mechanisms
depend on kexec and could show the limits you are referring to.
If you really want to pretty-print the information to the console, I
would propose that you leave that part of the problem to a different
output module. The core of the latency tracer could keep the minimum
information. Then, when a dump is triggered, it either sends the
information to LTTng or to your console pretty-printer.
However, I would not call the pretty-printer a "tracer" module per-se.
We would have to accept that it is a bit more tied to the kernel
internals than the latency tracer. My goal is to separate the core
"profiling" module from the optional "pretty-printing" module as much as
possible so the latency tracer core could be reused by other output
modules.
Mathieu,

I've been thinking about the context switch marker, and the printf format
field you have:

prepare_task_switch(rq, prev, next);
+ trace_mark(kernel_sched_schedule,
+ "prev_pid %d next_pid %d prev_state %ld",
+ prev->pid, next->pid, prev->state);
mm = next->mm;


Now I see that this is great for your tracer, since all your hook would
need to do is:

static notrace void simple_trace(const struct marker *mdata,
                                 void *private_data,
                                 const char *format, ...)
{
        va_list ap;

        va_start(ap, format);
        simple_trace_record(ap, format);
        va_end(ap);
}

And you could hook this up to all your traces. Wonderful!

But...

Tracers that want to do a bit more work, like recording timings and seeing
if we hit some max somewhere, can't do much with that pretty print data.
For example, I like to record the priority of a task that is being swapped
out as well as the one being swapped in. But with this, all I can get is
the priority of the prev task (since it is still current).

You told me that I should put hooks into where the priority gets modified,
so that I can trace it there in a non-hot path. Well, I have some issues
with this.

1) to me it's a management nightmare. Yes, I could hook into lttng, but
that too is too much. That takes me back to the days when COTS became the
new buzzword, and we ended up having to incorporate COTS products into
things better left done in house. The end result was more glue code than
what would happen if it was simply done in house, and a product that was
totally inefficient.

2) this requires we put a marker at all the places that might change the
data we want the snapshot of at the marker (here being prio at context
switch time). I could imagine some new code coming into the kernel that
modifies priority but the author has no idea about having to update the
trace marker, and the trace output ends up showing stale data. And this
would cause the poor bastard that needs to maintain this code to debug the
tracer on top of the code they are maintaining, which would end with
that poor bastard losing all confidence in the tracer and simply
giving up on it.

I know that if we did something like:

        trace_mark(kernel_sched_schedule,
                   "prev_pid %d next_pid %d prev_state %ld",
                   prev->pid, next->pid, prev->state);

It would be useless for the simple recording, because what the user would
see is just two meaningless pointer numbers.

So, at a minimum, I'd like to have meta data attached:

        trace_mark(kernel_sched_schedule,
                   "prev_pid %d next_pid %d prev_state %ld\0"
                   "prev %p next %p",
                   prev->pid, next->pid, prev->state,
                   prev, next);

This would allow both the nice pretty print of your trace and
let other tracers get to better meta data.
The '\0' would keep your tracer from recording the extra data, and we
could add some way to ignore the parameters in the printf to let other
tracers get straight to the meta data.

Actually here, since prev == current, we could omit that.

Just a thought.

-- Steve

Frank Ch. Eigler
2008-01-17 20:50:17 UTC
Permalink
Hi -
Post by Steven Rostedt
[...]
+ trace_mark(kernel_sched_schedule,
+ "prev_pid %d next_pid %d prev_state %ld",
+ prev->pid, next->pid, prev->state);
[...]
But...
Tracers that want to do a bit more work, like recording timings and seeing
if we hit some max somewhere, can't do much with that pretty print data.
If you find yourself wanting to perform computations like finding
maxima, or responding right there as opposed to later during userspace
trace data extraction, then you're trending toward a tool like
systemtap.
Post by Steven Rostedt
[...]
trace_mark(kernel_sched_schedule,
"prev_pid %d next_pid %d prev_state %ld\0"
"prev %p next %p",
prev->pid, next->pid, prev->state,
prev, next);
This would allow for both the nice pretty print of your trace, as well as
allowing other tracers to get to better meta data.
Yes, more self-contained marker events are necessary for meaningful
in-situ processing. That needs to be balanced by the increased cost
for computing and passing the extra parameters, multiplied by the event
occurrence rate.

In this case, the prev/next pointers are sufficient to compute the
other values. For particularly performance-critical markers, it may
not be unreasonable to expect the callback functions to dereference
such pointers for pretty-printing or other processing.
Post by Steven Rostedt
The '\0' would keep your tracer from recording the extra data, and we
could add some way to ignore the parameters in the printf to let other
traces get straight to the meta data.
This \0 hack is perhaps too clever. Much of the cost of the extra
parameters is already paid by the time that a simpleminded tracing
callback function starts going through the string. Also, I believe
the systemtap marker interface would break if the format strings were
not singly terminated ordinary strings.

- FChE
Steven Rostedt
2008-01-17 21:10:15 UTC
Permalink
Post by Frank Ch. Eigler
Hi -
Post by Steven Rostedt
[...]
+ trace_mark(kernel_sched_schedule,
+ "prev_pid %d next_pid %d prev_state %ld",
+ prev->pid, next->pid, prev->state);
[...]
But...
Tracers that want to do a bit more work, like recording timings and seeing
if we hit some max somewhere, can't do much with that pretty print data.
If you find yourself wanting to perform computations like finding
maxima, or responding right there as opposed to later during userspace
trace data extraction, then you're trending toward a tool like
systemtap.
Yes, very much so. I'm working on getting the latency_tracer from the -rt
patch into something suitable for mainline. We need to calculate the max
latencies on the fly. If we hit a max, then we save it off, otherwise, we
blow away the trace and start again.
Post by Frank Ch. Eigler
Post by Steven Rostedt
[...]
trace_mark(kernel_sched_schedule,
"prev_pid %d next_pid %d prev_state %ld\0"
"prev %p next %p",
prev->pid, next->pid, prev->state,
prev, next);
This would allow for both the nice pretty print of your trace, as well as
allowing other tracers to get to better meta data.
Yes, more self-contained marker events are necessary for meaningful
in-situ processing. That needs to be balanced by the increased cost
for computing and passing the extra parameters, multiplied the event
occurrence rate.
The cost is only paid when the marker is armed, since the marker check is
behind an unlikely() and the call will be placed at the end of the function.
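
(For readers who have not looked at the marker code: the dormant fast path
boils down to something like the following. This is a greatly simplified
sketch of the idea, not the actual include/linux/marker.h implementation.)

struct simple_marker {
        int enabled;                            /* set when a probe registers */
        void (*call)(const char *fmt, ...);     /* the registered probe       */
};

#define simple_trace_mark(m, fmt, args...)              \
        do {                                            \
                /* disarmed case costs one branch */    \
                if (unlikely((m)->enabled))             \
                        (m)->call(fmt, ## args);        \
        } while (0)
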
Post by Frank Ch. Eigler
In this case, the prev/next pointers are sufficient to compute the
other values. For particularly performance-critical markers, it may
not be unreasonable to expect the callback functions to dereference
such pointers for pretty-printing or other processing.
This was exactly my point to Mathieu, but I think he has LTTng very much
coupled with the markers. I haven't played with LTTng (yet), but from what
I've read (Mathieu, correct me if I'm wrong), it seems that all the
markers become visible to userspace, and the user can simple turn them on
or off. LTTng doesn't need any knowledge of the marker since the marker
contains how to print the information.

So* by placing a "prev %p next %p" as the only information, we lose out on
this automated way LTTng works. Because the two pointers are just
meaningless numbers to the user.
Post by Frank Ch. Eigler
Post by Steven Rostedt
The '\0' would keep your tracer from recording the extra data, and we
could add some way to ignore the parameters in the printf to let other
traces get straight to the meta data.
This \0 hack is perhaps too clever. Much of the cost of the extra
parameters is already paid by the time that a simpleminded tracing
callback function starts going through the string. Also, I believe
the systemtap marker interface would break if the format strings were
not singly terminated ordinary strings.
Well, actually when I first wrote this letter, I used "--" as a delimiter
to allow a tool to hide the pretty stuff. But then I thought about the
"clever hack" with the '\0', The "--" may be better since it wont break
systemtap.

-- Steve

* dvhart - bah!

Mathieu Desnoyers
2008-01-18 22:30:25 UTC
Permalink
Post by Steven Rostedt
Post by Frank Ch. Eigler
Hi -
Post by Steven Rostedt
[...]
+ trace_mark(kernel_sched_schedule,
+ "prev_pid %d next_pid %d prev_state %ld",
+ prev->pid, next->pid, prev->state);
[...]
But...
Tracers that want to do a bit more work, like recording timings and seeing
if we hit some max somewhere, can't do much with that pretty print data.
If you find yourself wanting to perform computations like finding
maxima, or responding right there as opposed to later during userspace
trace data extraction, then you're trending toward a tool like
systemtap.
Yes, very much so. I'm working on getting the latency_tracer from the -rt
patch into something suitable for mainline. We need to calculate the max
latencies on the fly. If we hit a max, then we save it off, otherwise, we
blow away the trace and start again.
Post by Frank Ch. Eigler
Post by Steven Rostedt
[...]
trace_mark(kernel_sched_schedule,
"prev_pid %d next_pid %d prev_state %ld\0"
"prev %p next %p",
prev->pid, next->pid, prev->state,
prev, next);
This would allow for both the nice pretty print of your trace, as well as
allowing other tracers to get to better meta data.
Yes, more self-contained marker events are necessary for meaningful
in-situ processing. That needs to be balanced by the increased cost
for computing and passing the extra parameters, multiplied the event
occurrence rate.
The cost is only done when the marker is armed. Since the marker is an
unlikely, and will be placed at the end of the function.
Post by Frank Ch. Eigler
In this case, the prev/next pointers are sufficient to compute the
other values. For particularly performance-critical markers, it may
not be unreasonable to expect the callback functions to dereference
such pointers for pretty-printing or other processing.
This was exactly my point to Mathieu, but I think he has LTTng very much
coupled with the markers. I haven't played with LTTng (yet), but from what
I've read (Mathieu, correct me if I'm wrong), it seems that all the
markers become visible to userspace, and the user can simple turn them on
or off. LTTng doesn't need any knowledge of the marker since the marker
contains how to print the information.
So* by placing a "prev %p next %p" as the only information, we lose out on
this automated way LTTng works. Because the two pointers are just
meaningless numbers to the user.
Exactly. We have, at the marker site :

- a marker identifier
- format string containing field names and types
- arguments

I would like to keep that as closely in line as possible with
what ends up in the trace.

However, I see that it limits what can be done by in-kernel tracers. And
by the way, I also suffer from the same kind of limitation in LTTng. Here
is an example :

I would like to replace blktrace (actually, I already have a quite
complete implementation). However, there is some code ran in the kernel
to "prepare" the information for the trace which is blktrace specific.
Since this code is not required to run when tracing is disabled, it can
be seen as "glue-code" between the kernel tracing point and the
extraction of data to trace.

What looked like the least intrusive solution was to create inline
functions that consist of branches over code considered unlikely (could
be a function call) where the glue-code is executed to prepare the data.
It's a bit like what the markers are doing, except that there is no
marker name associated and no format string : the subsystem being traced
must enable its tracing features by itself (could be a /proc file). It
makes sense, since this type of code has to be subsystem-specific
anyway.
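
For example (all names are invented here; this is just the shape of it) :

/* toggled by the subsystem itself, e.g. through a /proc or debugfs file */
extern int blk_glue_enabled;

void notrace __trace_blk_request(struct request *rq);

static inline void trace_blk_request(struct request *rq)
{
        /* dormant case : a single unlikely() branch in the block layer */
        if (unlikely(blk_glue_enabled))
                __trace_blk_request(rq);
}

/* runs only when enabled : prepare the blktrace-specific data and hand
 * it to the tracer (a marker, a direct call into LTTng, ...) */
void notrace __trace_blk_request(struct request *rq)
{
        trace_mark(blk_request, "rq %p", rq);
}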

But I have not seen a lot of situations where that kind of glue-code was
needed, so I think it makes sense to keep markers simple to use and
efficient for the common case.

Then, in this glue-code, we can put trace_mark() and calls to in-kernel
tracers.

Since the markers are eventually meant to become an API visible from
user-space, I think it makes sense to keep it clean. If an in-kernel
tracer needs extra information, I think it would make sense for it to
get it from a mechanism that does not make the exported information
visible to user-space.

What do you think ?
Post by Steven Rostedt
Post by Frank Ch. Eigler
Post by Steven Rostedt
The '\0' would keep your tracer from recording the extra data, and we
could add some way to ignore the parameters in the printf to let other
traces get straight to the meta data.
This \0 hack is perhaps too clever. Much of the cost of the extra
parameters is already paid by the time that a simpleminded tracing
callback function starts going through the string. Also, I believe
the systemtap marker interface would break if the format strings were
not singly terminated ordinary strings.
Well, actually when I first wrote this letter, I used "--" as a delimiter
to allow a tool to hide the pretty stuff. But then I thought about the
"clever hack" with the '\0', The "--" may be better since it wont break
systemtap.
It could be done, I guess, but it looks a bit ugly. :) I would rather
export the "pretty stuff" through an interface not involving
markers. Or if there is a way to separate the "callback" mechanism from
the "export to user-space" API parts of the markers, I am open to
proposals.

Mathieu
Post by Steven Rostedt
-- Steve
* dvhart - bah!
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-18 22:50:33 UTC
Permalink
Post by Mathieu Desnoyers
But I have not seen a lot of situations where that kind of glue-code was
needed, so I think it makes sense to keep markers simple to use and
efficient for the common case.
Then, in this glue-code, we can put trace_mark() and calls to in-kernel
tracers.
I'm almost done with the latency tracer work, and there are only a total
of 6 hooks that I needed.

- schedule context switch
- try_to_wake_up
- hard_irqs_off (which is already there for lockdep)
- hard irqs on (also for lockdep)
- lock_contention (already in for the lock contention code)
- lock acquire (also in there for contention code)

With the above, we could have this (if this is what I think you are
recommending). For example in the context_switch case:

trace_switch_to(prev, next);
switch_to(prev, next, prev);

and in sched.h I could have:

static inline void trace_switch_to(struct task_struct *prev,
                                   struct task_struct *next)
{
        trace_mark(kernel_schedudule,
                   "prev_pid %d next_pid %d prev_state %ld",
                   prev->pid, next->pid, prev->state);

        trace_context_switch(prev, next);
}

and have the trace_context_switch code be something that is turned on with
the latency tracing utility (config option). That way production code can
keep it off.
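
A minimal sketch of that config option part (the config symbol name is made
up here):

/* in a header, so the hook compiles away entirely on production kernels */
#ifdef CONFIG_LATENCY_TRACER
extern void trace_context_switch(struct task_struct *prev,
                                 struct task_struct *next);
#else
static inline void trace_context_switch(struct task_struct *prev,
                                        struct task_struct *next)
{
}
#endif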

-- Steve

Mathieu Desnoyers
2008-01-18 23:20:20 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
But I have not seen a lot of situations where that kind of glue-code was
needed, so I think it makes sense to keep markers simple to use and
efficient for the common case.
Then, in this glue-code, we can put trace_mark() and calls to in-kernel
tracers.
I'm almost done with the latency tracer work, and there are only a total
of 6 hooks that I needed.
- schedule context switch
- try_to_wake_up
- hard_irqs_off (which is already there for lockdep)
- hard irqs on (also for lockdep)
- lock_contention (already in for the lock contention code)
- lock acquire (also in there for contention code)
With the above, we could have this (if this is what I think you are
trace_switch_to(prev, next);
switch_to(prev, next, prev);
Almost.. I would add :

static int trace_switch_to_enabled;
Post by Steven Rostedt
static inline trace_switch_to(struct task_struct *prev,
struct task_struct *next)
{
if (likely(!trace_switch_to_enabled))
return;
Post by Steven Rostedt
trace_mark(kernel_schedudule,
"prev_pid %d next_pid %d prev_state %ld",
prev->pid, next->pid, prev->pid);
trace_context_switch(prev, next);
}
And some code to activate the trace_switch_to_enabled variable (ideally
keeping a refcount).

By doing this, we would have the minimum impact on the scheduler when
disabled.

But remember that this trace_switch_to_enabled could be enabled for both
markers and your tracer, so you might need to put a branch at the
beginning of trace_context_switch() too.
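
The activation code mentioned above could look roughly like this (a sketch
only, with invented names; it assumes it lives next to the
trace_switch_to_enabled flag) :

static atomic_t trace_switch_to_users = ATOMIC_INIT(0);

void trace_switch_to_ref(void)
{
        /* first user arms the scheduler hook */
        if (atomic_inc_return(&trace_switch_to_users) == 1)
                trace_switch_to_enabled = 1;
}

void trace_switch_to_unref(void)
{
        /* last user disarms it */
        if (atomic_dec_and_test(&trace_switch_to_users))
                trace_switch_to_enabled = 0;
}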

Mathieu
Post by Steven Rostedt
and have the trace_context_switch code be something that is turned on with
the latency tracing utility (config option). That way production code can
keep it off.
-- Steve
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Frank Ch. Eigler
2008-01-19 03:40:06 UTC
Permalink
Hi -
Post by Mathieu Desnoyers
[...]
static int trace_switch_to_enabled;
Post by Steven Rostedt
static inline trace_switch_to(struct task_struct *prev,
struct task_struct *next)
{
if (likely(!trace_switch_to_enabled))
return;
Post by Steven Rostedt
trace_mark(kernel_schedudule,
"prev_pid %d next_pid %d prev_state %ld",
prev->pid, next->pid, prev->pid);
trace_context_switch(prev, next);
}
And some code to activate the trace_switch_to_enabled variable (ideally
keeping a refcount). [...]
All this complexity is to be justified by keeping the raw prev/next
pointers from being sent to a naive tracer? It seems to me way out of
proportion.

- FChE
Steven Rostedt
2008-01-19 04:00:13 UTC
Permalink
Post by Frank Ch. Eigler
All this complexity is to be justified by keeping the raw prev/next
pointers from being sent to a naive tracer? It seems to me way out of
proportion.
Damn, and I just blew away all my marker code for something like this ;-)

Actually, you just gave me a great idea that I think can help all of us.
OK, Mathieu may not be in total agreement, but I think this is the
ultimate compromise.

We have in sched.c the following marker:

trace_mark(kernel_sched_scheduler, "prev %p next %p", prev, next);


Then Mathieu can add in some code somewhere (or a module, or something)

        ret = marker_probe_register("kernel_sched_scheduler",
                                    "prev %p next %p",
                                    pretty_print_sched_switch, NULL);

static void pretty_print_sched_switch(const struct marker *mdata,
                                      void *private_data,
                                      const char *format, ...)
{
        va_list ap;
        struct task_struct *prev;
        struct task_struct *next;

        va_start(ap, format);
        prev = va_arg(ap, typeof(prev));
        next = va_arg(ap, typeof(next));
        va_end(ap);

        trace_mark(kernel_pretty_print_sched_switch,
                   "prev_pid %d next_pid %d prev_state %ld",
                   prev->pid, next->pid, prev->state);
}


Then LTTng on startup could arm the normal kernel_sched_switch code and
have the user see the nice one. All without adding any more goo or
overhead to the non-tracing case, and keeping a few critical markers with
enough information to be useful to other tracers!

Thoughts?

-- Steve

Frank Ch. Eigler
2008-01-19 04:30:13 UTC
Permalink
Hi -
Post by Steven Rostedt
[...]
Post by Frank Ch. Eigler
All this complexity is to be justified by keeping the raw prev/next
pointers from being sent to a naive tracer? It seems to me way out of
proportion.
Damn, and I just blew away all my marker code for something like this ;-)
Sorry! :-)
Post by Steven Rostedt
[...]
trace_mark(kernel_sched_scheduler, "prev %p next %p", prev, next);
Fine so far!
Post by Steven Rostedt
Then Mathieu can add in some code somewhere (or a module, or something)
ret = marker_probe_register("kernel_sched_scheduler",
"prev %p next %p",
pretty_print_sched_switch, NULL);
static void pretty_print_sched_switch(const struct marker *mdata,
void *private_data,
const char *format, ...)
{
[...]
trace_mark(kernel_pretty_print_sched_switch,
"prev_pid %d next_pid %d prev_state %ld",
prev->pid, next->pid, prev->state);
}
That marker_probe_register call would need to be done only when the
embedded (k_p_p_s_s) marker is actually being used. Otherwise we'd
lose all the savings of a dormant sched.c marker by always calling
into pretty_print_sched_switch(), whether or not the k_p_p_s_s marker
was active.

In any case, if the naive tracer agrees to become educated about some
of these markers in the form of intermediary functions like that, they
need not insist on a second hop through marker territory anyway:

static void pretty_print_sched_switch(const struct marker *mdata,
                                      void *private_data,
                                      const char *format, ...)
{
        [...]
        lttng_backend_trace(kernel_pretty_print_sched_switch,
                            "prev_pid %d next_pid %d prev_state %ld",
                            prev->pid, next->pid, prev->state);
}


- FChE
Mathieu Desnoyers
2008-01-19 15:30:10 UTC
Permalink
Post by Frank Ch. Eigler
Hi -
Post by Steven Rostedt
[...]
Post by Frank Ch. Eigler
All this complexity is to be justified by keeping the raw prev/next
pointers from being sent to a naive tracer? It seems to me way out of
proportion.
Damn, and I just blew away all my marker code for something like this ;-)
Sorry! :-)
Post by Steven Rostedt
[...]
trace_mark(kernel_sched_scheduler, "prev %p next %p", prev, next);
Fine so far!
Post by Steven Rostedt
Then Mathieu can add in some code somewhere (or a module, or something)
ret = marker_probe_register("kernel_sched_scheduler",
"prev %p next %p",
pretty_print_sched_switch, NULL);
static void pretty_print_sched_switch(const struct marker *mdata,
void *private_data,
const char *format, ...)
{
[...]
trace_mark(kernel_pretty_print_sched_switch,
"prev_pid %d next_pid %d prev_state %ld",
prev->pid, next->pid, prev->state);
}
That marker_probe_register call would need to be done only when the
embedded (k_p_p_s_s) marker is actually being used. Otherwise we'd
lose all the savings of a dormant sched.c marker by always calling
into pretty_print_sched_switch(), whether or not the k_p_p_s_s marker
was active.
In any case, if the naive tracer agrees to become educated about some
of these markers in the form of intermediary functions like that, they
static void pretty_print_sched_switch(const struct marker *mdata,
void *private_data,
const char *format, ...)
{
[...]
lttng_backend_trace(kernel_pretty_print_sched_switch,
"prev_pid %d next_pid %d prev_state %ld",
prev->pid, next->pid, prev->state);
}
Oh! perfect then :) Since I already planned my ltt-marker-control kernel
module to connect specialized callbacks instead of the dumb one, it
shouldn't be so hard to do.

I would just have to find another way to declare the trace events (they're
currently embedded in the markers), but it's not a showstopper. I'll try
this.

Thanks to you both for the good proposals,

Mathieu
Post by Frank Ch. Eigler
- FChE
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Frank Ch. Eigler
2008-01-19 03:40:09 UTC
Permalink
Hi -
Post by Steven Rostedt
[...]
Post by Mathieu Desnoyers
But I have not seen a lot of situations where that kind of glue-code was
needed, so I think it makes sense to keep markers simple to use and
efficient for the common case.
Then, in this glue-code, we can put trace_mark() and calls to in-kernel
tracers.
I'm almost done with the latency tracer work, and there are only a total
of 6 hooks that I needed.
[...]
With the above, we could have this (if this is what I think you are
recommending). [...]
static inline trace_switch_to(struct task_struct *prev,
struct task_struct *next)
{
trace_mark(kernel_schedudule,
"prev_pid %d next_pid %d prev_state %ld",
prev->pid, next->pid, prev->pid);
trace_context_switch(prev, next);
}
I'm afraid I don't see the point in this. You could use one marker
for all that data (and force the more naive tracer callbacks to ignore
some of them). You could even use two markers (and force the more
naive tracer to attach only to its favorite subset). But to use a
second, different, less efficient, no more configurable tracing hook
mechanism in the same logical spot makes no sense to me.

- FChE
Tim Bird
2008-01-16 18:00:21 UTC
Permalink
Post by Steven Rostedt
grmble. Then how do you trace preempt_disable? As my tracer does that
(see the last patch in the series).
One way is to make a tracer_preempt_disable() and tracer_preempt_enable(),
both of which would be 'notrace'. You could probably optimize them
as well. The standard preempt_disable and preempt_enable don't look
very efficient (e.g. what's up with converting an increment operation into
an addition? - gak!)

Any lock you take is going to have a pretty bad effect.

In order to be able to trace as much as possible, for KFT, I implemented
my own synchronization mechanism using cmpxchg, to avoid using any of the
existing kernel locks (which change more often than you'd think, and have
weird side effects).
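
(Not the actual KFT code, but the general shape of such a cmpxchg-based
scheme is to claim the next buffer slot without taking any lock. In this
sketch, struct trace_entry stands for whatever the tracer records.)

static atomic_t trace_next_slot = ATOMIC_INIT(0);

/* returns the reserved entry, or NULL when the buffer is full ;
 * wrap-around / flight-recorder behaviour is left out of the sketch */
static struct trace_entry *reserve_slot(struct trace_entry *buf, int nr_slots)
{
        int old, new;

        do {
                old = atomic_read(&trace_next_slot);
                if (old >= nr_slots)
                        return NULL;
                new = old + 1;
        } while (atomic_cmpxchg(&trace_next_slot, old, new) != old);

        return &buf[old];
}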

=============================
Tim Bird
Architecture Group Chair, CE Linux Forum
Senior Staff Engineer, Sony Corporation of America
=============================

john stultz
2008-01-16 22:40:14 UTC
Permalink
Post by Mathieu Desnoyers
If you really want a seqlock-free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
Yea. After our earlier discussion and talking w/ Steven, I'm taking a
swing at this now. The lock-free method still doesn't apply to the
update_wall_time function, but does work fine for the monotonic cycle
uses. I'll send a patch for review as soon as I get things building.

thanks
-john
john stultz
2008-01-16 23:00:21 UTC
Permalink
Post by john stultz
Post by Mathieu Desnoyers
If you really want an seqlock free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
Yea. After our earlier discussion and talking w/ Steven, I'm taking a
swing at this now. The lock-free method still doesn't apply to the
update_wall_time function, but does work fine for the monotonic cycle
uses. I'll send a patch for review as soon as I get things building.
So here's my first attempt at adding Mathieu's lock-free method to
Steven's get_monotonic_cycles() interface.

Completely un-tested, but it builds, so I figured I'd send it out for
review.

I'm not super sure the update or the read doesn't need something
additional to force a memory access, but as I didn't see anything
special in Mathieu's implementation, I'm going to guess this is ok.

Mathieu, Let me know if this isn't what you're suggesting.

Signed-off-by: John Stultz <***@us.ibm.com>

Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 14:41:31.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
+ cycle_t cycle_last, cycle_accumulated;

+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
+ cycle_t cycle_base_last, cycle_base;
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;

@@ -175,19 +183,21 @@
}

/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
* @cs: pointer to clocksource being read
* @now: current cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
+ int num = cs->base_num;
+ cycle_t offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
return offset;
}

@@ -197,14 +207,25 @@
* @now: current cycle value
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
+ cs->base_num = num;
+
+ /* Now update the cycle_accumulated portion */
+ offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
cs->cycle_accumulated += offset;
- cs->cycle_raw += offset;
}

/**
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 14:15:31.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;

- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
ns_offset = cyc2ns(clock, cycle_delta);

return ns_offset;
@@ -105,35 +107,7 @@

cycle_t notrace get_monotonic_cycles(void)
{
- cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
-
- do {
- /*
- * cycle_raw and cycle_last can change on
- * another CPU and we need the delta calculation
- * of cycle_now and cycle_last happen atomic, as well
- * as the adding to cycle_raw. We don't need to grab
- * any locks, we just keep trying until get all the
- * calculations together in one state.
- *
- * In fact, we __cant__ grab any locks. This
- * function is called from the latency_tracer which can
- * be called anywhere. To grab any locks (including
- * seq_locks) we risk putting ourselves into a deadlock.
- */
- cycle_raw = clock->cycle_raw;
- cycle_last = clock->cycle_last;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - cycle_last) & clock->mask;
-
- } while (cycle_raw != clock->cycle_raw ||
- cycle_last != clock->cycle_last);
-
- return cycle_raw + cycle_delta;
+ return clocksource_get_basecycles(clock, clocksource_read(clock));
}

unsigned long notrace cycles_to_usecs(cycle_t cycles)


Mathieu Desnoyers
2008-01-16 23:40:08 UTC
Permalink
Post by john stultz
Post by john stultz
Post by Mathieu Desnoyers
If you really want a seqlock-free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
Yea. After our earlier discussion and talking w/ Steven, I'm taking a
swing at this now. The lock-free method still doesn't apply to the
update_wall_time function, but does work fine for the monotonic cycle
uses. I'll send a patch for review as soon as I get things building.
So here's my first attempt at adding Mathieu's lock-free method to
Steven's get_monotonic_cycles() interface.
Completely un-tested, but it builds, so I figured I'd send it out for
review.
I'm not super sure the update or the read doesn't need something
additional to force a memory access, but as I didn't see anything
special in Mathieu's implementation, I'm going to guess this is ok.
Mathieu, Let me know if this isn't what you're suggesting.
Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 14:41:31.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
Shouldn't the cycle_last and cycle_accumulated be in the array too ?
Post by john stultz
+ cycle_t cycle_last, cycle_accumulated;
+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
We had cycle_raw before, why do we need the following two ?
Post by john stultz
+ cycle_t cycle_base_last, cycle_base;
I'm not quite sure why you need both cycle_base_last and cycle_base...

I think I'll need a bit of an explanation of what you are trying to
achieve here to see what to expect from the clock source. Are you trying
to deal with non-synchronized TSCs across CPUs in a way that will
generate a monotonic (sometimes stalling) clock ?

What I am trying to say is : I know you are trying to make a virtual
clock source where time cannot go backward, but what are your
assumptions about the "real" clock source ?

Is the intent to deal with an HPET suddenly reset to 0 or something
like this ?

Basically, I wonder why you have to calculate the current cycle count
from the previous update_wall_time event. Is it because you need to be
consistent when a clocksource change occurs ?
Post by john stultz
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;
@@ -175,19 +183,21 @@
}
/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
I would disable preemption in clocksource_get_basecycles. We would not
want to be scheduled out while we hold a pointer to the old array
element.
Post by john stultz
+ int num = cs->base_num;
Since you deal with base_num in a shared manner (not per cpu), you will
need a smp_read_barrier_depend() here after the cs->base_num read.

You should think about reading the cs->base_num first, and _after_ that
read the real clocksource. Here, the clocksource value is passed as
parameter. It means that the read clocksource may have been read in the
previous RCU window.
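Roughly, the read side being suggested here would look like this (a
sketch only, reusing the field names from the patch under review; not
the final code):

static cycle_t sketch_get_basecycles(struct clocksource *cs)
{
	int num;
	cycle_t now, offset;

	preempt_disable();		/* keep a stable view of base[num] */
	num = cs->base_num;		/* pick the published element first... */
	smp_read_barrier_depends();	/* ...pairing with the writer's smp_wmb() */
	now = clocksource_read(cs);	/* ...and only then sample the hardware */
	offset = (now - cs->base[num].cycle_base_last) & cs->mask;
	offset += cs->base[num].cycle_base;
	preempt_enable();

	return offset;
}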
Post by john stultz
+ cycle_t offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
return offset;
}
@@ -197,14 +207,25 @@
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
Why do we still require xtime_lock here ? Can you tell exactly which
contexts this function will be called from (periodical timer interrupt?)
I guess it is called from one and only one CPU periodically.
Post by john stultz
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
!0 is not necessarily 1. This is why I use cpu_synth->index ? 0 : 1 in
my code. The two previous lines seem buggy. (I made the same mistake in
my first implementation) ;)
Post by john stultz
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
Here too.
Post by john stultz
+ cs->base[num].cycle_base_last = now;
Since you deal with shared data (in my algo, I use per-cpu data), you
have to add a wmb() before the base_num value update. Only then will you
ensure that other CPUs will see consistent values.
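In other words, the update side would order its stores roughly like
this (a sketch, not the final patch; smp_wmb() is the barrier in
question):

static inline void sketch_accumulate(struct clocksource *cs, cycle_t now)
{
	int cur = cs->base_num;
	int num = cur ? 0 : 1;		/* the currently unused element */
	cycle_t offset;

	offset = (now - cs->base[cur].cycle_base_last) & cs->mask;
	cs->base[num].cycle_base = cs->base[cur].cycle_base + offset;
	cs->base[num].cycle_base_last = now;
	smp_wmb();		/* element writes must be visible before the flip */
	cs->base_num = num;	/* publish: readers switch to the new element */
}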
Post by john stultz
+ cs->base_num = num;
+
+ /* Now update the cycle_accumulated portion */
+ offset = (now - cs->cycle_last) & cs->mask;
The following two updates are racy. I think they should be in the array
too. We want consistent cycle_raw, cycle_last and cycle_accumulated
values; they should therefore be presented to the reader atomically with
a pointer change.
Post by john stultz
cs->cycle_last = now;
cs->cycle_accumulated += offset;
- cs->cycle_raw += offset;
}
/**
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 14:15:31.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;
- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
Why is there a second level of cycle_last and cycle_accumulated here ?
Post by john stultz
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
ns_offset = cyc2ns(clock, cycle_delta);
return ns_offset;
@@ -105,35 +107,7 @@
cycle_t notrace get_monotonic_cycles(void)
{
- cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
-
- do {
- /*
- * cycle_raw and cycle_last can change on
- * another CPU and we need the delta calculation
- * of cycle_now and cycle_last happen atomic, as well
- * as the adding to cycle_raw. We don't need to grab
- * any locks, we just keep trying until get all the
- * calculations together in one state.
- *
- * In fact, we __cant__ grab any locks. This
- * function is called from the latency_tracer which can
- * be called anywhere. To grab any locks (including
- * seq_locks) we risk putting ourselves into a deadlock.
- */
- cycle_raw = clock->cycle_raw;
- cycle_last = clock->cycle_last;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - cycle_last) & clock->mask;
-
- } while (cycle_raw != clock->cycle_raw ||
- cycle_last != clock->cycle_last);
-
- return cycle_raw + cycle_delta;
+ return clocksource_get_basecycles(clock, clocksource_read(clock));
}
unsigned long notrace cycles_to_usecs(cycle_t cycles)
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-17 00:00:14 UTC
Permalink
Post by Mathieu Desnoyers
Post by john stultz
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
!0 is not necessarily 1. This is why I use cpu_synth->index ? 0 : 1 in
How about simply "cpu_synth->index ^ 1"? Seems the best choice if you ask
me, if all you are doing is changing it from 1 to zero and back to 1.

-- Steve
Post by Mathieu Desnoyers
my code. The two previous lines seem buggy. (I made the same mistake in
my first implementation) ;)
Post by john stultz
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
Here too.
Steven Rostedt
2008-01-17 00:40:07 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
!0 is not necessarily 1. This is why I use cpu_synth->index ? 0 : 1 in
How about simply "cpu_synth->index ^ 1"? Seems the best choice if you ask
me, if all you are doing is changing it from 1 to zero and back to 1.
FYI:

***@bilbo:~/c$ cat flipme.c
int flip1 (int x)
{
return !x;
}

int flip2 (int x)
{
return x ? 0 : 1;
}

int flip3(int x)
{
return x ^ 1;
}
***@bilbo:~/c$ gcc -O2 -c flipme.c
***@bilbo:~/c$ objdump -d flipme.o

flipme.o: file format elf32-i386

Disassembly of section .text:

00000000 <flip1>:
0: 55 push %ebp
1: 31 c0 xor %eax,%eax
3: 89 e5 mov %esp,%ebp
5: 83 7d 08 00 cmpl $0x0,0x8(%ebp)
9: 5d pop %ebp
a: 0f 94 c0 sete %al
d: c3 ret
e: 66 90 xchg %ax,%ax

00000010 <flip2>:
10: 55 push %ebp
11: 31 c0 xor %eax,%eax
13: 89 e5 mov %esp,%ebp
15: 83 7d 08 00 cmpl $0x0,0x8(%ebp)
19: 5d pop %ebp
1a: 0f 94 c0 sete %al
1d: c3 ret
1e: 66 90 xchg %ax,%ax

00000020 <flip3>:
20: 55 push %ebp
21: 89 e5 mov %esp,%ebp
23: 8b 45 08 mov 0x8(%ebp),%eax
26: 5d pop %ebp
27: 83 f0 01 xor $0x1,%eax
2a: c3 ret


So, if you know for sure that x is only 1 or 0, then using x ^ 1 to invert
it seems the most efficient.

-- Steve

john stultz
2008-01-17 00:40:06 UTC
Permalink
Post by Mathieu Desnoyers
Post by john stultz
Post by john stultz
Post by Mathieu Desnoyers
If you really want a seqlock-free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
Yea. After our earlier discussion and talking w/ Steven, I'm taking a
swing at this now. The lock-free method still doesn't apply to the
update_wall_time function, but does work fine for the monotonic cycle
uses. I'll send a patch for review as soon as I get things building.
So here's my first attempt at adding Mathieu's lock-free method to
Steven's get_monotonic_cycles() interface.
Completely un-tested, but it builds, so I figured I'd send it out for
review.
I'm not super sure the update or the read doesn't need something
additional to force a memory access, but as I didn't see anything
special in Mathieu's implementation, I'm going to guess this is ok.
Mathieu, Let me know if this isn't what you're suggesting.
Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 14:41:31.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
Shouldn't the cycle_last and cycle_accumulated be in the array too ?
No, we're leaving cycle_last and cycle_accumulated alone. They relate
to the update_wall_time conversion of cycles to xtime.
Post by Mathieu Desnoyers
Post by john stultz
+ cycle_t cycle_last, cycle_accumulated;
+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
We had cycle_raw before, why do we need the following two ?
Post by john stultz
+ cycle_t cycle_base_last, cycle_base;
I'm not quite sure why you need both cycle_base_last and cycle_base...
So on my first shot at this, I tried to layer the concepts. Using the
lock-free method to create an abstracted 64bit counter, as provided by
get_monotonic_cycles(). Then I tried to use that abstraction directly in
the update_wall_time() code, reading the abstracted 64bit counter and
using it to update time.

However, then we start keeping cycle_last in 64bit cycles, rather than
an actual counter read. This then caused changes to be needed in the
arch vsyscall implementations, and that started to get ugly, as we had
to also re-implement the abstracted 64bit counter w/ the lock free
method as well.

So I just backed off and tried to make it simple: We have two sets of
data that count cycles from the clocksource: one for timekeeping and
one for get_monotonic_cycles(). It is a little redundant, but I don't
think you can escape that (the layering method above also has
redundancy, but it's just hidden until you implement the vsyscall gtod
methods).
Post by Mathieu Desnoyers
I think I'll need a bit of an explanation of what you are trying to
achieve here to see what to expect from the clock source. Are you trying
to deal with non-synchronized TSCs across CPUs in a way that will
generate a monotonic (sometimes stalling) clock ?
No no no.. I'm not touching the non-synced TSC issue. I'm just trying to
take clocksource counters, which may be of different bit-widths (ACPI PM
is 24bits, for instance), and create a lock-free method to translate that
into a virtual 64bit wide counter (using an accumulation bucket,
basically).
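For what it's worth, that accumulation-bucket idea can be modelled in
plain user space like this (a toy sketch, not kernel code; a 24-bit
counter standing in for something like the ACPI PM timer, and all names
here are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define MASK ((1u << 24) - 1)		/* 24-bit wrapping hardware counter */

static uint64_t base;			/* widened 64-bit value */
static uint32_t base_last;		/* raw reading at the last accumulation */

/* Called more often than the counter's wrap period. */
static void accumulate(uint32_t now)
{
	base += (now - base_last) & MASK;	/* masked delta survives one wrap */
	base_last = now & MASK;
}

static uint64_t wide_read(uint32_t now)
{
	return base + ((now - base_last) & MASK);
}

int main(void)
{
	accumulate(0x00fff000);
	accumulate(0x00000200);		/* hardware wrapped in between */
	printf("%llu\n", (unsigned long long)wide_read(0x00000300));
	return 0;
}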
Post by Mathieu Desnoyers
What I am trying to say is : I know you are trying to make a virtual
clock source where time cannot go backward, but what are your
assumptions about the "real" clock source ?
The assumptions of the real clocksource is the same we keep in the
timekeeping core. It counts forward, at a constant rate and only wraps
after the mask value has been reached.
Post by Mathieu Desnoyers
Is the intent to deal with an HPET suddenly reset to 0 or something
like this ?
Well, dealing with clocksources wrapping short of 64bits.
Post by Mathieu Desnoyers
Basically, I wonder why you have to calculate the current cycle count
from the previous update_wall_time event. Is it because you need to be
consistent when a clocksource change occurs ?
Actually, we try to do it from the last clocksource_accumulate() call
(which is called from update_wall_time).
Post by Mathieu Desnoyers
Post by john stultz
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;
@@ -175,19 +183,21 @@
}
/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
I would disable preemption in clocksource_get_basecycles. We would not
want to be scheduled out while we hold a pointer to the old array
element.
Ok. This is the part I wasn't so sure about. But yes, that sounds
reasonable.
Post by Mathieu Desnoyers
Post by john stultz
+ int num = cs->base_num;
Since you deal with base_num in a shared manner (not per cpu), you will
need a smp_read_barrier_depend() here after the cs->base_num read.
Ah, thanks. I'll add that in.
Post by Mathieu Desnoyers
You should think about reading the cs->base_num first, and _after_ that
read the real clocksource. Here, the clocksource value is passed as
parameter. It means that the read clocksource may have been read in the
previous RCU window.
Hmm. Ok, still need to wrap my head around that one, but I think it
makes sense.
Post by Mathieu Desnoyers
Post by john stultz
+ cycle_t offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
return offset;
}
@@ -197,14 +207,25 @@
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
Why do we still require xtime_lock here ? Can you tell exactly which
contexts this function will be called from (periodical timer interrupt?)
I guess it is called from one and only one CPU periodically.
Well, the main reason we need the xtime_lock is that the xtime_lock
still protects the cycle_last and cycle_accumulated values (which are
not lock-free). This is part of the redundancy issue above. We're
updating similar structures, that store different data from the same
source. One of the two can be handled lock-free, the other cannot.

In addition however, doing the update under the lock makes sure we don't
do the update in parallel (serializes writers, basically) if
clocksource_accumulate is called on different cpus (it shouldn't happen
right now, but in the past it has been possible).
Post by Mathieu Desnoyers
Post by john stultz
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
!0 is not necessarily 1. This is why I use cpu_synth->index ? 0 : 1 in
my code. The two previous lines seem buggy. (I made the same mistake in
my first implementation) ;)
Heh. My first thought to this was just disbelief("WHAAAH? NOOOO!"). But
Steven made clear the logical issue on irc. Thanks for pointing it out.
I've been using that assumption (as well as the !! trick) for so long it
will be a hard habit to break. :)

I'll add in Steven's method to the code.
Post by Mathieu Desnoyers
Post by john stultz
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
Here too.
Post by john stultz
+ cs->base[num].cycle_base_last = now;
Since you deal with shared data (in my algo, I use per-cpu data), you
have to add a wmb() before the base_num value update. Only then will you
ensure that other CPUs will see consistent values.
Ok. Thanks I was worried about that as well.


Thanks so much for the review! I'll go through and make the update
changes you suggested. Do let me know if my explanations above to your
questions make sense.

thanks
-john


Mathieu Desnoyers
2008-01-17 02:30:14 UTC
Permalink
Post by john stultz
Post by Mathieu Desnoyers
Post by john stultz
Post by john stultz
Post by Mathieu Desnoyers
If you really want a seqlock-free algorithm (I _do_ want this for
tracing!) :) maybe going in the RCU direction could help (I refer to my
RCU-based 32-to-64 bits lockless timestamp counter extension, which
could be turned into the clocksource updater).
Yea. After our earlier discussion and talking w/ Steven, I'm taking a
swing at this now. The lock-free method still doesn't apply to the
update_wall_time function, but does work fine for the monotonic cycle
uses. I'll send a patch for review as soon as I get things building.
So here's my first attempt at adding Mathieu's lock-free method to
Steven's get_monotonic_cycles() interface.
Completely un-tested, but it builds, so I figured I'd send it out for
review.
I'm not super sure the update or the read doesn't need something
additional to force a memory access, but as I didn't see anything
special in Mathieu's implementation, I'm going to guess this is ok.
Mathieu, Let me know if this isn't what you're suggesting.
Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 14:41:31.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
Shouldn't the cycle_last and cycle_accumulated be in the array too ?
No, we're leaving cycle_last and cycle_accumulated alone. They relate
to the update_wall_time conversion of cycles to xtime.
Post by Mathieu Desnoyers
Post by john stultz
+ cycle_t cycle_last, cycle_accumulated;
+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
We had cycle_raw before, why do we need the following two ?
Post by john stultz
+ cycle_t cycle_base_last, cycle_base;
I'm not quite sure why you need both cycle_base_last and cycle_base...
So on my first shot at this, I tried to layer the concepts. Using the
lock-free method to create an abstracted 64bit counter, as provided by
get_monotonic_cycles(). Then I tried to use that abstraction directly in
the update_wall_time() code, reading the abstracted 64bit counter and
using it to update time.
However, then we start keeping cycle_last in 64bit cycles, rather than
an actual counter read. This then caused changes to be needed in the
arch vsyscall implementations, and that started to get ugly, as we had
to also re-implement the abstracted 64bit counter w/ the lock free
method as well.
So I just backed off and tried to make it simple: We have two sets of
data that count cycles from the clocksource: one for timekeeping and
one for get_monotonic_cycles(). It is a little redundant, but I don't
think you can escape that (the layering method above also has
redundancy, but it's just hidden until you implement the vsyscall gtod
methods).
Post by Mathieu Desnoyers
I think I'll need a bit of an explanation of what you are trying to
achieve here to see what to expect from the clock source. Are you trying
to deal with non-synchronized TSCs across CPUs in a way that will
generate a monotonic (sometimes stalling) clock ?
No no no.. I'm not touching the non-synced TSC issue. I'm just trying to
take clocksource counters, which may be of different bit-widths (ACPI PM
is 24bits, for instance), and create a lock-free method to translate that
into a virtual 64bit wide counter (using an accumulation bucket,
basically).
Post by Mathieu Desnoyers
What I am trying to say is : I know you are trying to make a virtual
clock source where time cannot go backward, but what are your
assumptions about the "real" clock source ?
The assumptions of the real clocksource is the same we keep in the
timekeeping core. It counts forward, at a constant rate and only wraps
after the mask value has been reached.
Post by Mathieu Desnoyers
Is the intent to deal with an HPET suddenly reset to 0 or something
like this ?
Well, dealing with clocksources wrapping short of 64bits.
Ah ok, then the problem is clearer :) The main difference between the
approach I use and yours is that if, say, your clocksource starts a
while after the system has started running, it will start the
accumulator at 0. You therefore have to keep track of the total time
accumulated and the current lower order bits of the last accumulation
upon clocksource_accumulate().

I used a different method to do this. I just need to keep the 64-bit
"synthetic counter" value at each update. The main difference between my
synthetic counter and your accumulated cycles is that my LSBs will
always be exactly the same as the real clocksource itself. In your case,
you always have to read two 64-bit fields and perform an addition and a
subtraction, while I only need to do comparisons and bit operations.
Only when I detect a wrap around of the LSB (in the read side) do I have
to add 1 to the higher order bits of the value I return.

So by using my algorithm, you could replace the cycle_base_last and
cycle_base by a single "synthetic_tsc" variable.

So the periodical calls to clocksource_accumulate() would only be there
to update the synthetic TSC to get the MSBs right and to make sure the
following reads would be able to detect LSB overflows.
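A rough sketch of that synthetic-counter read, assuming a 32-bit
hardware counter and that the stored value is refreshed more often than
the counter wraps (so at most one wrap can occur in between); this is
only an illustration, not the actual LTTng code, and the names are
hypothetical:

#include <stdint.h>

/*
 * last_synth is the 64-bit synthetic value saved at the last periodic
 * update; by construction its low 32 bits equal the hardware counter
 * as it was read at that update.
 */
static uint64_t synthetic_read(uint64_t last_synth, uint32_t hw_now)
{
	uint64_t msb = last_synth & ~(uint64_t)0xffffffff;
	uint32_t last_lsb = (uint32_t)last_synth;

	if (hw_now < last_lsb)			/* the low-order bits wrapped */
		msb += (uint64_t)1 << 32;	/* carry into the high bits */

	return msb | hw_now;
}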
Post by john stultz
Post by Mathieu Desnoyers
Basically, I wonder why you have to calculate the current cycle count
from the previous update_wall_time event. Is it because you need to be
consistent when a clocksource change occurs ?
Actually, we try to do it from the last clocksource_accumulate() call
(which is called from update_wall_time).
Post by Mathieu Desnoyers
Post by john stultz
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;
@@ -175,19 +183,21 @@
}
/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
I would disable preemption in clocksource_get_basecycles. We would not
want to be scheduled out while we hold a pointer to the old array
element.
Ok. This is the part I wasn't so sure about. But yes, that sounds
reasonable.
Post by Mathieu Desnoyers
Post by john stultz
+ int num = cs->base_num;
Since you deal with base_num in a shared manner (not per cpu), you will
need a smp_read_barrier_depend() here after the cs->base_num read.
Ah, thanks. I'll add that in.
Post by Mathieu Desnoyers
You should think about reading the cs->base_num first, and _after_ that
read the real clocksource. Here, the clocksource value is passed as
parameter. It means that the read clocksource may have been read in the
should read the previous line "real clocksource" : (might help
understanding)
Post by john stultz
Post by Mathieu Desnoyers
parameter. It means that the reaL clocksource may have been read in the
previous RCU window.
Hmm. Ok, still need to wrap my head around that one, but I think it
makes sense.
Post by Mathieu Desnoyers
Post by john stultz
+ cycle_t offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
return offset;
}
@@ -197,14 +207,25 @@
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
Why do we still require xtime_lock here ? Can you tell exactly which
contexts this function will be called from (periodical timer interrupt?)
I guess it is called from one and only one CPU periodically.
Well, the main reason we need the xtime_lock is that the xtime_lock
still protects the cycle_last and cycle_accumulated values (which are
not lock-free). This is part of the redundancy issue above. We're
updating similar structures, that store different data from the same
source. One of the two can be handled lock-free, the other cannot.
In addition however, doing the update under the lock makes sure we don't
do the update in parallel (serializes writers, basically) if
clocksource_accumulate is called on different cpus (it shouldn't happen
right now, but in the past it has been possible).
Note that if two writers have to be serialized, then you do not respect
the delay between updates that is required to make sure a
reader won't see its data overwritten while it still holds a reference to
its old data.
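To make the hazard concrete, one possible (hypothetical) interleaving
with two back-to-back writers:

/*
 * reader:   num = cs->base_num;               num == 0
 * writer A: fills base[1]; cs->base_num = 1;
 * writer B: fills base[0]; cs->base_num = 0;  <- reuses the reader's slot
 * reader:   reads base[0].cycle_base_last / cycle_base  <- mixed old/new
 *
 * The reader must be done with base[num] before that slot is written
 * again, which is what the delay between updates guarantees.
 */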
Post by john stultz
Post by Mathieu Desnoyers
Post by john stultz
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
!0 is not necessarily 1. This is why I use cpu_synth->index ? 0 : 1 in
my code. The two previous lines seem buggy. (I made the same mistake in
my first implementation) ;)
Heh. My first thought to this was just disbelief("WHAAAH? NOOOO!"). But
Steven made clear the logical issue on irc. Thanks for pointing it out.
I've been using that assumption (as well as the !! trick) for so long it
will be a hard habit to break. :)
I'll add in Steven's method to the code.
To be continued in the other thread I guess..
Post by john stultz
Post by Mathieu Desnoyers
Post by john stultz
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
Here too.
Post by john stultz
+ cs->base[num].cycle_base_last = now;
Since you deal with shared data (in my algo, I use per-cpu data), you
have to add a wmb() before the base_num value update. Only then will you
ensure that other CPUs will see consistent values.
Ok. Thanks I was worried about that as well.
I think a smp_wmb() would be enough though.
Post by john stultz
Thanks so much for the review! I'll go through and make the update
changes you suggested. Do let me know if my explanations above to your
questions make sense.
We would have to see which method (synthetic tsc vs accumulated cycles)
makes more sense/is the fastest/etc...

Mathieu
Post by john stultz
thanks
-john
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Linus Torvalds
2008-01-17 01:10:07 UTC
Permalink
Post by Mathieu Desnoyers
Post by john stultz
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
!0 is not necessarily 1.
Incorrect.

!0 _is_ necessarily 1. It's how all C logical operators work. If you find
a compiler that turns !x into anything but 0/1, you found a compiler for
another language than C.

It's true that any non-zero value counts as "true", but that does not
mean that a logical operator can return any non-zero value for true. As a
return value of the logical operations in C, true is *always* 1.

So !, ||, &&, when used as values, will *always* return either 0 or 1 (but
when used as part of a conditional, the compiler will often optimize out
unnecessary stuff, so the CPU may not actually ever see a 0/1 value, if
the value itself was never used, only branched upon).

So doing "!cs->base_num" to turn 0->1 and 1->0 is perfectly fine.

That's not to say it's necessarily the *best* way.

If you *know* that you started with 0/1 in the first place, the best way
to flip it tends to be to do (1-x) (or possibly (x^1)).

And if you can't guarantee that, !x is probably better than x ? 0 : 1,
but you might also decide to use ((x+1)&1) for example.

And obviously, the compiler may sometimes surprise you, and if *it* also
knows it's always 0/1 (for something like the source being a single-bit
bitfield for example), it may end up doing something else than you coded
that is equivalent. And the particular choice of operation the compiler
chooses may well depend on the code _around_ that sequence.

(One reason to potentially prefer (1-x) over (x^1) is that it's often
easier to combine a subtraction with other operations, while an xor seldom
combines with anything around it)

Linus
Mathieu Desnoyers
2008-01-17 01:50:11 UTC
Permalink
Post by Linus Torvalds
Post by Mathieu Desnoyers
Post by john stultz
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
!0 is not necessarily 1.
Incorrect.
Hrm, *digging in my mailbox*, ah, here it is :

http://listserv.shafik.org/pipermail/ltt-dev/2006-June/001548.html

Richard Purdie reviewed my code back in 2006 and made this modification.
Maybe will he have something to add.
Post by Linus Torvalds
!0 _is_ necessarily 1. It's how all C logical operators work. If you find
a compiler that turns !x into anything but 0/1, you found a compiler for
another language than C.
It's true that any non-zero value counts as "true", but the that does not
mean that a logical operator can return any non-zero value for true. As a
return value of the logical operations in C, true is *always* 1.
So !, ||, &&, when used as values, will *always* return either 0 or 1 (but
when used as part of a conditional, the compiler will often optimize out
unnecessary stuff, so the CPU may not actually ever see a 0/1 value, if
the value itself was never used, only branched upon).
So doing "!cs->base_num" to turn 0->1 and 1->0 is perfectly fine.
That's not to say it's necessarily the *best* way.
If you *know* that you started with 0/1 in the first place, the best way
to flip it tends to be to do (1-x) (or possibly (x^1)).
And if you can't guarantee that, !x is probably better than x ? 0 : 1,
but you might also decide to use ((x+1)&1) for example.
And obviously, the compiler may sometimes surprise you, and if *it* also
knows it's always 0/1 (for something like the source being a single-bit
bitfield for example), it may end up doing something else than you coded
that is equivalent. And the particular choice of operation the compiler
chooses may well depend on the code _around_ that sequence.
(One reason to potentially prefer (1-x) over (x^1) is that it's often
easier to combine a subtraction with other operations, while an xor seldom
combines with anything around it)
Ok, I'll adopt (1-x) then. Thanks!

Mathieu
Post by Linus Torvalds
Linus
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
john stultz
2008-01-17 02:30:14 UTC
Permalink
Post by Mathieu Desnoyers
I would disable preemption in clocksource_get_basecycles. We would not
want to be scheduled out while we hold a pointer to the old array
element.
Post by john stultz
+ int num = cs->base_num;
Since you deal with base_num in a shared manner (not per cpu), you will
need a smp_read_barrier_depend() here after the cs->base_num read.
You should think about reading the cs->base_num first, and _after_ that
read the real clocksource. Here, the clocksource value is passed as
parameter. It means that the read clocksource may have been read in the
previous RCU window.
Here's an updated version of the patch w/ the suggested memory barrier
changes and favored (1-x) inversion change. ;) Let me know if you see
any other holes, or have any other suggestions or ideas.

Still un-tested (my test box will free up soon, I promise!), but builds.

Signed-off-by: John Stultz <***@us.ibm.com>

Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 18:12:53.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
+ cycle_t cycle_last, cycle_accumulated;

+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
+ cycle_t cycle_base_last, cycle_base;
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;

@@ -175,19 +183,29 @@
}

/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
* @cs: pointer to clocksource being read
* @now: current cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
+ int num;
+ cycle_t now, offset;
+
+ preempt_disable();
+ num = cs->base_num;
+ smp_read_barrier_depends();
+ now = clocksource_read(cs);
+ offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
+ preempt_enable();
+
return offset;
}

@@ -197,14 +215,26 @@
* @now: current cycle value
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = 1 - cs->base_num;
+ cycle_t offset = (now - cs->base[1-num].cycle_base_last);
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[1-num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
+ wmb();
+ cs->base_num = num;
+
+ /* Now update the cycle_accumulated portion */
+ offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
cs->cycle_accumulated += offset;
- cs->cycle_raw += offset;
}

/**
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 17:51:50.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;

- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
ns_offset = cyc2ns(clock, cycle_delta);

return ns_offset;
@@ -105,35 +107,7 @@

cycle_t notrace get_monotonic_cycles(void)
{
- cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
-
- do {
- /*
- * cycle_raw and cycle_last can change on
- * another CPU and we need the delta calculation
- * of cycle_now and cycle_last happen atomic, as well
- * as the adding to cycle_raw. We don't need to grab
- * any locks, we just keep trying until get all the
- * calculations together in one state.
- *
- * In fact, we __cant__ grab any locks. This
- * function is called from the latency_tracer which can
- * be called anywhere. To grab any locks (including
- * seq_locks) we risk putting ourselves into a deadlock.
- */
- cycle_raw = clock->cycle_raw;
- cycle_last = clock->cycle_last;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - cycle_last) & clock->mask;
-
- } while (cycle_raw != clock->cycle_raw ||
- cycle_last != clock->cycle_last);
-
- return cycle_raw + cycle_delta;
+ return clocksource_get_basecycles(clock);
}

unsigned long notrace cycles_to_usecs(cycle_t cycles)


Mathieu Desnoyers
2008-01-17 02:40:10 UTC
Permalink
Post by john stultz
Post by Mathieu Desnoyers
I would disable preemption in clocksource_get_basecycles. We would not
want to be scheduled out while we hold a pointer to the old array
element.
Post by john stultz
+ int num = cs->base_num;
Since you deal with base_num in a shared manner (not per cpu), you will
need a smp_read_barrier_depend() here after the cs->base_num read.
You should think about reading the cs->base_num first, and _after_ that
read the real clocksource. Here, the clocksource value is passed as
parameter. It means that the read clocksource may have been read in the
previous RCU window.
Here's an updated version of the patch w/ the suggested memory barrier
changes and favored (1-x) inversion change. ;) Let me know if you see
any other holes, or have any other suggestions or ideas.
Still un-tested (my test box will free up soon, I promise!), but builds.
Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 18:12:53.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
+ cycle_t cycle_last, cycle_accumulated;
+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
+ cycle_t cycle_base_last, cycle_base;
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;
@@ -175,19 +183,29 @@
}
/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
+ int num;
+ cycle_t now, offset;
+
+ preempt_disable();
+ num = cs->base_num;
+ smp_read_barrier_depends();
+ now = clocksource_read(cs);
+ offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
+ preempt_enable();
+
return offset;
}
@@ -197,14 +215,26 @@
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = 1 - cs->base_num;
(nitpick)
right here, you could probably express 1-num with cs->base_num, since we
are the only ones supposed to touch it.
Post by john stultz
+ cycle_t offset = (now - cs->base[1-num].cycle_base_last);
+ offset &= cs->mask;
here too.
Post by john stultz
+ cs->base[num].cycle_base = cs->base[1-num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
+ wmb();
As I just emailed: smp_wmb() *should* be enough. I don't see which
architecture could reorder writes wrt local interrupts ? (please tell me
if I am grossly mistaken)

Mathieu
Post by john stultz
+ cs->base_num = num;
+
+ /* Now update the cycle_accumulated portion */
+ offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
cs->cycle_accumulated += offset;
- cs->cycle_raw += offset;
}
/**
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 17:51:50.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;
- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
ns_offset = cyc2ns(clock, cycle_delta);
return ns_offset;
@@ -105,35 +107,7 @@
cycle_t notrace get_monotonic_cycles(void)
{
- cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
-
- do {
- /*
- * cycle_raw and cycle_last can change on
- * another CPU and we need the delta calculation
- * of cycle_now and cycle_last happen atomic, as well
- * as the adding to cycle_raw. We don't need to grab
- * any locks, we just keep trying until get all the
- * calculations together in one state.
- *
- * In fact, we __cant__ grab any locks. This
- * function is called from the latency_tracer which can
- * be called anywhere. To grab any locks (including
- * seq_locks) we risk putting ourselves into a deadlock.
- */
- cycle_raw = clock->cycle_raw;
- cycle_last = clock->cycle_last;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - cycle_last) & clock->mask;
-
- } while (cycle_raw != clock->cycle_raw ||
- cycle_last != clock->cycle_last);
-
- return cycle_raw + cycle_delta;
+ return clocksource_get_basecycles(clock);
}
unsigned long notrace cycles_to_usecs(cycle_t cycles)
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-16 23:40:09 UTC
Permalink
Thanks John for doing this!

(comments imbedded)
Post by john stultz
Completely un-tested, but it builds, so I figured I'd send it out for
review.
heh, ok, I'll take it and run it.
Post by john stultz
I'm not super sure the update or the read doesn't need something
additional to force a memory access, but as I didn't see anything
special in Mathieu's implementation, I'm going to guess this is ok.
Mathieu, Let me know if this isn't what you're suggesting.
Index: monotonic-cleanup/include/linux/clocksource.h
===================================================================
--- monotonic-cleanup.orig/include/linux/clocksource.h 2008-01-16 12:22:04.000000000 -0800
+++ monotonic-cleanup/include/linux/clocksource.h 2008-01-16 14:41:31.000000000 -0800
@@ -87,9 +87,17 @@
* more than one cache line.
*/
struct {
- cycle_t cycle_last, cycle_accumulated, cycle_raw;
- } ____cacheline_aligned_in_smp;
+ cycle_t cycle_last, cycle_accumulated;
+ /* base structure provides lock-free read
+ * access to a virtualized 64bit counter
+ * Uses RCU-like update.
+ */
+ struct {
+ cycle_t cycle_base_last, cycle_base;
+ } base[2];
+ int base_num;
+ } ____cacheline_aligned_in_smp;
u64 xtime_nsec;
s64 error;
@@ -175,19 +183,21 @@
}
/**
- * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * clocksource_get_basecycles: - get the clocksource's accumulated cycle value
*
* Uses the clocksource to return the current cycle_t value.
* NOTE!!!: This is different from clocksource_read, because it
- * returns the accumulated cycle value! Must hold xtime lock!
+ * returns a 64bit wide accumulated value.
*/
static inline cycle_t
-clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+clocksource_get_basecycles(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
- offset += cs->cycle_accumulated;
+ int num = cs->base_num;
+ cycle_t offset = (now - cs->base[num].cycle_base_last);
+ offset &= cs->mask;
+ offset += cs->base[num].cycle_base;
return offset;
}
@@ -197,14 +207,25 @@
*
* Used to avoids clocksource hardware overflow by periodically
- * accumulating the current cycle delta. Must hold xtime write lock!
+ * accumulating the current cycle delta. Uses RCU-like update, but
+ * ***still requires the xtime_lock is held for writing!***
*/
static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
{
- cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ /* First update the monotonic base portion.
+ * The dual array update method allows for lock-free reading.
+ */
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
I would think that we would need some sort of barrier here. Otherwise,
base_num could be updated before all the cycle_base writes. I'd expect an smp_wmb
is needed.
Post by john stultz
+ cs->base_num = num;
+
+ /* Now update the cycle_accumulated portion */
+ offset = (now - cs->cycle_last) & cs->mask;
cs->cycle_last = now;
cs->cycle_accumulated += offset;
- cs->cycle_raw += offset;
}
/**
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 14:15:31.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;
- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
Is the above just to decouple the two methods?
Post by john stultz
ns_offset = cyc2ns(clock, cycle_delta);
return ns_offset;
@@ -105,35 +107,7 @@
cycle_t notrace get_monotonic_cycles(void)
{
- cycle_t cycle_now, cycle_delta, cycle_raw, cycle_last;
-
- do {
- /*
- * cycle_raw and cycle_last can change on
- * another CPU and we need the delta calculation
- * of cycle_now and cycle_last happen atomic, as well
- * as the adding to cycle_raw. We don't need to grab
- * any locks, we just keep trying until get all the
- * calculations together in one state.
- *
- * In fact, we __cant__ grab any locks. This
- * function is called from the latency_tracer which can
- * be called anywhere. To grab any locks (including
- * seq_locks) we risk putting ourselves into a deadlock.
- */
- cycle_raw = clock->cycle_raw;
- cycle_last = clock->cycle_last;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - cycle_last) & clock->mask;
-
- } while (cycle_raw != clock->cycle_raw ||
- cycle_last != clock->cycle_last);
-
- return cycle_raw + cycle_delta;
+ return clocksource_get_basecycles(clock, clocksource_read(clock));
Nice ;-)
Post by john stultz
}
unsigned long notrace cycles_to_usecs(cycle_t cycles)
-- Steve

john stultz
2008-01-17 02:30:09 UTC
Permalink
Post by Steven Rostedt
Thanks John for doing this!
(comments imbedded)
Post by john stultz
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
I would think that we would need some sort of barrier here. Otherwise,
base_num could be updated before all the cycle_base. I'd expect a smp_wmb
is needed.
Hopefully addressed in the current version.
Post by Steven Rostedt
Post by john stultz
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 14:15:31.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;
- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
Is the above just to decouple the two methods?
Yep. clocksource_get_cycles() ended up not being as useful as a helper
function (I was hoping the arch vsyscall implementations could use it,
but they've done too much optimization - although that may reflect a
need up the chain to the clocksource structure).

thanks
-john


Mathieu Desnoyers
2008-01-17 02:50:06 UTC
Permalink
Post by john stultz
Post by Steven Rostedt
Thanks John for doing this!
(comments imbedded)
Post by john stultz
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
I would think that we would need some sort of barrier here. Otherwise,
base_num could be updated before all the cycle_base. I'd expect a smp_wmb
is needed.
Hopefully addressed in the current version.
Post by Steven Rostedt
Post by john stultz
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 14:15:31.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;
- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
Is the above just to decouple the two methods?
Yep. clocksource_get_cycles() ended up not being as useful as a helper
function (I was hoping the arch vsyscall implementations could use it,
but they've done too much optimization - although that may reflect a
need up the chain to the clocksource structure).
The problem with vsyscall is that we will have a hard time disabling
preemption :( Therefore, ensuring that the read of the data is done in a
timely manner is hard to do.

Mathieu
Post by john stultz
thanks
-john
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-17 03:00:10 UTC
Permalink
Post by Mathieu Desnoyers
Post by john stultz
Yep. clocksource_get_cycles() ended up not being as useful as a helper
function (I was hoping the arch vsyscall implementations could use it,
but they've done too much optimization - although that may reflect a
need up the chain to the clocksource structure).
The problem with vsyscall is that we will have a hard time disabling
preemption :( Therefore, ensuring that the read of the data is done in a
timely manner is hard to do.
You'll have more than a hard time disabling preemption for vsyscall. We'll
need to come up with a better solution then. vsyscall cannot modify any
kernel memory, nor can it disable preemption.

-- Steve

Steven Rostedt
2008-01-17 03:10:05 UTC
Permalink
It would imply the creation of a new vsyscall : vgetschedperiod
It would read a counter that would increment each time the thread is
scheduled out (or in). It would be a per thread counter (not a per cpu
counter) so we can deal appropriately with a stopped thread that would
happen to come back running a loooong time afterward (if we do per-cpu
counters, we could get the same 32 bits counter value falsely if it is
shared with other thread activity).
int period;
do {
period = vgetschedperiod();
perform the clocksource read..
} while (period != vgetschedperiod());
Therefore, we would be sure that we have not been scheduled out while
reading the value. I think this new vsyscall could be useful for others.
Actually, it would make implementation of RCU in user-space possible (as
long as the read-side can retry the read operation).
This is something that I would agree is useful.

-- Steve

Mathieu Desnoyers
2008-01-17 03:10:08 UTC
Permalink
Post by Mathieu Desnoyers
Post by john stultz
Post by Steven Rostedt
Thanks John for doing this!
(comments imbedded)
Post by john stultz
+ int num = !cs->base_num;
+ cycle_t offset = (now - cs->base[!num].cycle_base_last);
+ offset &= cs->mask;
+ cs->base[num].cycle_base = cs->base[!num].cycle_base + offset;
+ cs->base[num].cycle_base_last = now;
I would think that we would need some sort of barrier here. Otherwise,
base_num could be updated before all the cycle_base. I'd expect a smp_wmb
is needed.
Hopefully addressed in the current version.
Post by Steven Rostedt
Post by john stultz
Index: monotonic-cleanup/kernel/time/timekeeping.c
===================================================================
--- monotonic-cleanup.orig/kernel/time/timekeeping.c 2008-01-16 12:21:46.000000000 -0800
+++ monotonic-cleanup/kernel/time/timekeeping.c 2008-01-16 14:15:31.000000000 -0800
@@ -71,10 +71,12 @@
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_delta;
+ cycle_t now, cycle_delta;
s64 ns_offset;
- cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
+ now = clocksource_read(clock);
+ cycle_delta = (now - clock->cycle_last) & clock->mask;
+ cycle_delta += clock->cycle_accumulated;
Is the above just to decouple the two methods?
Yep. clocksource_get_cycles() ended up not being as useful as an helper
function (I was hoping the arch vsyscall implementations could use it,
but they've done too much optimization - although that may reflect a
need up the chain to the clocksource structure).
The problem with vsyscall is that we will have a hard time disabling
preemption :( Therefore, ensuring that the read of the data is done in a
timely manner is hard to do.
Sorry for self-reply, but I thought, in the past, of a way to make this
possible.

It would imply the creation of a new vsyscall : vgetschedperiod

It would read a counter that would increment each time the thread is
scheduled out (or in). It would be a per thread counter (not a per cpu
counter) so we can deal appropriately with a stopped thread that would
happen to come back running a loooong time afterward (if we do per-cpu
counters, we could get the same 32 bits counter value falsely if it is
shared with other thread activity).

Then, the clocksource read code would look like :

int period;

do {
period = vgetschedperiod();

perform the clocksource read..

} while (period != vgetschedperiod());

Therefore, we would be sure that we have not been scheduled out while
reading the value. I think this new vsyscall could be useful for others.
Actually, it would make implementation of RCU in user-space possible (as
long as the read-side can retry the read operation).
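
For illustration, a minimal user-space sketch of that retry pattern applied
to the monotonic clock read (everything here is hypothetical:
vgetschedperiod() is the proposed vsyscall, and vread_clock_data() stands
in for whatever lock-free clock snapshot it would protect):

	unsigned long long read_monotonic_cycles(void)
	{
		unsigned long long ret;
		int period;

		do {
			period = vgetschedperiod();

			/* We may be preempted and/or migrated in here; if so,
			 * the period changes and we simply retry. */
			ret = vread_clock_data();

		} while (period != vgetschedperiod());

		return ret;
	}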

Mathieu
Post by Mathieu Desnoyers
Post by john stultz
thanks
-john
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Paul Mackerras
2008-01-17 03:30:18 UTC
Permalink
Post by Mathieu Desnoyers
Sorry for self-reply, but I thought, in the past, of a way to make this
possible.
It would imply the creation of a new vsyscall : vgetschedperiod
It would read a counter that would increment each time the thread is
scheduled out (or in). It would be a per thread counter
It's very hard to do a per-thread counter in the VDSO, since threads
in the same process see the same memory, by definition. You'd have to
have an array of counters and have some way for each thread to know
which entry to read. Also you'd have to find space for tens or
hundreds of thousands of counters, since there can be that many
threads in a process sometimes.

Paul.
Steven Rostedt
2008-01-17 03:40:10 UTC
Permalink
Post by Paul Mackerras
It's very hard to do a per-thread counter in the VDSO, since threads
in the same process see the same memory, by definition. You'd have to
have an array of counters and have some way for each thread to know
which entry to read. Also you'd have to find space for tens or
hundreds of thousands of counters, since there can be that many
threads in a process sometimes.
I was thinking about this. What would also work is just the ability to
read the schedule counter for the current CPU. Now this would require that
the task have a way to know which CPU it is currently on.

-- Steve

Mathieu Desnoyers
2008-01-17 17:40:16 UTC
Permalink
Post by Steven Rostedt
Post by Paul Mackerras
It's very hard to do a per-thread counter in the VDSO, since threads
in the same process see the same memory, by definition. You'd have to
have an array of counters and have some way for each thread to know
which entry to read. Also you'd have to find space for tens or
hundreds of thousands of counters, since there can be that many
threads in a process sometimes.
I was thinking about this. What would also work is just the ability to
read the schedule counter for the current cpu. Now this would require that
the task had a way to know which CPU it was currently on.
-- Steve
The problem with a per-CPU schedule counter would be dealing with
stopped tasks that wake up at exactly the wrong moment. With a 32-bit
counter, that can happen.

At 1000 HZ, assuming the scheduler is called only once per tick (an
approximation; it can also be called explicitly), the counter wraps after
2^32 / 1000 ticks per second, i.e. about 4.29 million seconds, or roughly
49.7 days. If the kernel calls schedule() more often, the wrap can come
sooner than that.

Having a per-thread variable would make sure we don't have this problem.

By the way, a task cannot "really know" which CPU it is on: it could be
migrated between reading the CPU ID and the moment it uses it as an array
index. Actually, knowing the schedule count would help implement
algorithms that obtain the CPU ID and know it has stayed valid for a
period of time without pinning to a particular CPU.
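
For illustration, a hypothetical sketch of such an algorithm (the helper
names are made up: vgetschedperiod() is the proposed per-thread counter,
vgetcpu() stands in for whatever returns the current CPU id, and
use_per_cpu_data() is the read-only per-CPU access being protected):

	void per_cpu_read(void)
	{
		int period, cpu;

		do {
			period = vgetschedperiod();
			cpu = vgetcpu();	/* may already be stale here... */

			use_per_cpu_data(cpu);

		} while (period != vgetschedperiod());
		/* ... but if the period did not change, the thread was never
		 * scheduled out, so it ran on 'cpu' for the whole access. */
	}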

Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Mathieu Desnoyers
2008-01-17 04:20:07 UTC
Permalink
Post by Paul Mackerras
Post by Mathieu Desnoyers
Sorry for self-reply, but I thought, in the past, of a way to make this
possible.
It would imply the creation of a new vsyscall : vgetschedperiod
It would read a counter that would increment each time the thread is
scheduled out (or in). It would be a per thread counter
It's very hard to do a per-thread counter in the VDSO, since threads
in the same process see the same memory, by definition. You'd have to
have an array of counters and have some way for each thread to know
which entry to read. Also you'd have to find space for tens or
hundreds of thousands of counters, since there can be that many
threads in a process sometimes.
Paul.
Crazy ideas:

Could we do something along the lines of thread-local storage?

Or could we map a per-thread page that would contradict this
"definition"?

Or could we move the beginning of the user-space thread stack down by 4
bytes (it's already placed at a random address anyway) and use those 32
bits for our variable? We don't care if userspace also modifies it;
the kernel would blindly increment it, so there would be no security
concerns involved.

Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-01-17 15:30:11 UTC
Permalink
Post by Mathieu Desnoyers
Could we do something along the lines of the thread local storage ?
Or could we map a per-thread page that would contradict this
"definition" ?
When working on lguest64, I implemented a "per CPU" shadow page, so that
a process of a guest running on one real CPU could never see the page of
a process of the same guest running on another real CPU.

This is a nightmare to manage. The thing is that threads share the same
pgd. With shadow pages it's doable, but hard to manage: it required
making a "copy" of the pgd for each real CPU.

Here we are not talking about guests, but bare metal, so this complicates
things even more. The problem is that all threads share the same pgd. If
two threads run at the same time on two different CPUs, they would both
see the same counter, but we need this to be per CPU.

Now if the thread was given a clue to what CPU it was running on, then we
might be able to accomplish something. But then we have the same issue.
How do you tell a thread what CPU it's on, without running into the same
issues as where to store this data?

-- Steve
Post by Mathieu Desnoyers
Or can we move down the beginning of the user-space thread stack of 4
bytes (it's already put at a random address anyway) and use these 32
bits to put our variable ? We don't care if userspace also modifies it;
the kernel would blindly increment it, so there would be no security
concerns involved.
Linus Torvalds
2008-01-17 18:00:25 UTC
Permalink
Post by Mathieu Desnoyers
Or could we map a per-thread page that would contradict this
"definition" ?
Over my dead body.

It's been done before. Many times. It's horrible, and means that you need
to flush the TLB on context switches between threads and cannot share the
same hw page tables across CPU's (since now different CPU's run different
threads).

It generally makes threads pointless. You might as well implement them as
processes with shared mmap.

Linus
Steven Rostedt
2008-01-15 22:10:15 UTC
Permalink
Post by Steven Rostedt
Also, it just occurred to me that this is an old patch. I thought I
renamed cycle_raw to cycle_monotonic. But I must have lost that patch :-/
Ah, I changed this in the -rt patch queue, and never moved the patch back
here.

-- Steve

Steven Rostedt
2008-01-09 23:40:18 UTC
Permalink
Mark with "notrace" functions in core code that should not be
traced. The "notrace" attribute will prevent gcc from adding
a call to mcount on the annotated funtions.

Signed-off-by: Arnaldo Carvalho de Melo <***@ghostprotocols.net>
Signed-off-by: Steven Rostedt <***@redhat.com>

---
lib/smp_processor_id.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-compile-i386.git/lib/smp_processor_id.c
===================================================================
--- linux-compile-i386.git.orig/lib/smp_processor_id.c 2008-01-09 14:09:36.000000000 -0500
+++ linux-compile-i386.git/lib/smp_processor_id.c 2008-01-09 14:10:11.000000000 -0500
@@ -7,7 +7,7 @@
#include <linux/kallsyms.h>
#include <linux/sched.h>

-unsigned int debug_smp_processor_id(void)
+notrace unsigned int debug_smp_processor_id(void)
{
unsigned long preempt_count = preempt_count();
int this_cpu = raw_smp_processor_id();
--
Steven Rostedt
2008-01-09 23:40:18 UTC
Permalink
Add timestamps to trace entries.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 16 ++++++++++++++++
lib/tracing/tracer.h | 1 +
2 files changed, 17 insertions(+)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:16:05.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:27.000000000 -0500
@@ -19,12 +19,18 @@
#include <linux/percpu.h>
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
+#include <linux/clocksource.h>
#include <linux/uaccess.h>
#include <linux/mcount.h>

#include "tracer.h"
#include "tracer_interface.h"

+static inline notrace cycle_t now(void)
+{
+ return get_monotonic_cycles();
+}
+
static struct mctracer_trace mctracer_trace;
static DEFINE_PER_CPU(struct mctracer_trace_cpu, mctracer_trace_cpu);

@@ -57,6 +63,7 @@ mctracer_add_trace_entry(struct mctracer
entry->ip = ip;
entry->parent_ip = parent_ip;
entry->pid = tsk->pid;
+ entry->t = now();
memcpy(entry->comm, tsk->comm, TASK_COMM_LEN);
}

@@ -240,6 +247,15 @@ static int s_show(struct seq_file *m, vo
if (iter->ent == NULL) {
seq_printf(m, "mctracer:\n");
} else {
+ unsigned long long t;
+ unsigned long usec_rem;
+ unsigned long secs;
+
+ t = cycles_to_usecs(iter->ent->t);
+ usec_rem = do_div(t, 1000000ULL);
+ secs = (unsigned long)t;
+
+ seq_printf(m, "[%5lu.%06lu] ", secs, usec_rem);
seq_printf(m, "CPU %d: ", iter->cpu);
seq_printf(m, "%s:%d ", iter->ent->comm, iter->ent->pid);
seq_print_ip_sym(m, iter->ent->ip, sym_only);
Index: linux-compile-i386.git/lib/tracing/tracer.h
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.h 2008-01-09 14:16:05.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.h 2008-01-09 15:17:27.000000000 -0500
@@ -5,6 +5,7 @@
#include <linux/sched.h>

struct mctracer_entry {
+ unsigned long long t;
unsigned long idx;
unsigned long ip;
unsigned long parent_ip;
--
Steven Rostedt
2008-01-09 23:40:18 UTC
Permalink
The current method of printing out the trace does a linear
search for the next entry to print on every read.
This patch remembers the next entry to look at in the
iterator, so if the next read is sequential, it can
start reading from that location directly.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
lib/tracing/tracer.c | 28 +++++++++++++++++++---------
1 file changed, 19 insertions(+), 9 deletions(-)

Index: linux-compile-i386.git/lib/tracing/tracer.c
===================================================================
--- linux-compile-i386.git.orig/lib/tracing/tracer.c 2008-01-09 14:37:13.000000000 -0500
+++ linux-compile-i386.git/lib/tracing/tracer.c 2008-01-09 15:17:24.000000000 -0500
@@ -105,6 +105,7 @@ enum trace_iterator {
struct mctracer_iterator {
struct mctracer_trace *tr;
struct mctracer_entry *ent;
+ loff_t pos;
unsigned long next_idx[NR_CPUS];
int cpu;
int idx;
@@ -176,6 +177,8 @@ static void *s_next(struct seq_file *m,
while (ent && iter->idx < i)
ent = find_next_entry(iter);

+ iter->pos = *pos;
+
return ent;
}

@@ -186,19 +189,25 @@ static void *s_start(struct seq_file *m,
loff_t l = 0;
int i;

- iter->ent = NULL;
- iter->cpu = 0;
- iter->idx = -1;
-
- for (i = 0; i < NR_CPUS; i++)
- iter->next_idx[i] = 0;
-
/* stop the trace while dumping */
if (iter->tr->ctrl)
clear_mcount_function();

- for (p = iter; p && l < *pos; p = s_next(m, p, &l))
- ;
+ if (*pos != iter->pos) {
+ iter->ent = NULL;
+ iter->cpu = 0;
+ iter->idx = -1;
+
+ for (i = 0; i < NR_CPUS; i++)
+ iter->next_idx[i] = 0;
+
+ for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+ ;
+
+ } else {
+ l = *pos;
+ p = s_next(m, p, &l);
+ }

return p;
}
@@ -286,6 +295,7 @@ static int mctrace_open(struct inode *in
return -ENOMEM;

iter->tr = &mctracer_trace;
+ iter->pos = -1;

/* TODO stop tracer */
ret = seq_open(file, &mctrace_seq_ops);
--
Steven Rostedt
2008-01-09 23:50:08 UTC
Permalink
This patch adds latency tracing for irqs-off critical timings.
In /debugfs/tracing/ the following files are added:

max_irq_latency
holds the max latency found so far (in usecs)
(defaults to a large number, so one must explicitly start latency tracing)

irq_thresh
threshold (in usecs): if irqs are detected to be off
for longer than this, the trace is always printed.
If irq_thresh is non-zero, then max_irq_latency
is ignored.

irqsoff_trace
Trace of where the latency was detected.

irqsoff_fn_trace_ctrl
0 - don't use mcount
1 - use mcount to trace

Here's an example of a trace with irqsoff_fn_trace_ctrl == 0

=======
preemption latency trace v1.1.5 on 2.6.24-rc7
--------------------------------------------------------------------
latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
-----------------
| task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
-----------------
=> started at: _spin_lock_irqsave+0x2a/0xb7
=> ended at: _spin_unlock_irqrestore+0x32/0x5f

_------=> CPU#
/ _-----=> irqs-off
| / _----=> need-resched
|| / _---=> hardirq/softirq
||| / _--=> preempt-depth
|||| /
||||| delay
cmd pid ||||| time | caller
\ / ||||| \ | /
swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000])
swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000])
swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f)


vim:ft=help
=======


And this is a trace with irqsoff_fn_trace_ctrl == 1


=======
preemption latency trace v1.1.5 on 2.6.24-rc7
--------------------------------------------------------------------
latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
-----------------
| task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
-----------------
=> started at: _spin_lock_irqsave+0x2a/0xb7
=> ended at: _spin_unlock_irqrestore+0x32/0x5f

_------=> CPU#
/ _-----=> irqs-off
| / _----=> need-resched
|| / _---=> hardirq/softirq
||| / _--=> preempt-depth
|||| /
||||| delay
cmd pid ||||| time | caller
\ / ||||| \ | /
swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000])
swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000])
swapper-0 1dNs3 46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000])
swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000])
swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000])
swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47)
swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50)
swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000])
swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000])
swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000])
swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000])
swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f)


vim:ft=help
=======


Signed-off-by: Steven Rostedt <***@redhat.com>
---
arch/x86/kernel/process_64.c | 3
arch/x86/lib/thunk_64.S | 18 +
include/asm-x86/irqflags_32.h | 4
include/asm-x86/irqflags_64.h | 4
include/linux/irqflags.h | 37 ++
include/linux/mcount.h | 29 +-
kernel/fork.c | 2
kernel/lockdep.c | 25 +
lib/tracing/Kconfig | 20 +
lib/tracing/Makefile | 1
lib/tracing/trace_function.c | 89 +-----
lib/tracing/trace_irqsoff.c | 545 ++++++++++++++++++++++++++++++++++++++++++
lib/tracing/tracer.c | 101 +++++--
lib/tracing/tracer.h | 40 ++-
14 files changed, 792 insertions(+), 126 deletions(-)

Index: linux-compile.git/lib/tracing/Kconfig
===================================================================
--- linux-compile.git.orig/lib/tracing/Kconfig 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/lib/tracing/Kconfig 2008-01-09 17:04:10.000000000 -0500
@@ -21,3 +21,23 @@ config FUNCTION_TRACER
insert a call to an architecture specific __mcount routine,
that the debugging mechanism using this facility will hook by
providing a set of inline routines.
+
+config CRITICAL_IRQSOFF_TIMING
+ bool "Interrupts-off critical section latency timing"
+ default n
+ depends on TRACE_IRQFLAGS_SUPPORT
+ depends on GENERIC_TIME
+ select TRACE_IRQFLAGS
+ help
+ This option measures the time spent in irqs-off critical
+ sections, with microsecond accuracy.
+
+ The default measurement method is a maximum search, which is
+ disabled by default and can be runtime (re-)started
+ via:
+
+ echo 0 > /debug/mctracer/preempt_max_latency
+
+ (Note that kernel size and overhead increases with this option
+ enabled. This option and the preempt-off timing option can be
+ used together or separately.)
Index: linux-compile.git/kernel/lockdep.c
===================================================================
--- linux-compile.git.orig/kernel/lockdep.c 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/kernel/lockdep.c 2008-01-09 17:04:10.000000000 -0500
@@ -39,6 +39,7 @@
#include <linux/irqflags.h>
#include <linux/utsname.h>
#include <linux/hash.h>
+#include <linux/mcount.h>

#include <asm/sections.h>

@@ -2009,7 +2010,7 @@ void early_boot_irqs_on(void)
/*
* Hardirqs will be enabled:
*/
-void trace_hardirqs_on(void)
+void notrace trace_hardirqs_on_caller(unsigned long a0)
{
struct task_struct *curr = current;
unsigned long ip;
@@ -2050,14 +2051,27 @@ void trace_hardirqs_on(void)
curr->hardirq_enable_ip = ip;
curr->hardirq_enable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_on_events);
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ time_hardirqs_on(CALLER_ADDR0, a0);
+#endif
}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);

+void notrace trace_hardirqs_on(void) {
+ trace_hardirqs_on_caller(CALLER_ADDR0);
+}
EXPORT_SYMBOL(trace_hardirqs_on);

+void notrace trace_hardirqs_off(void) {
+ trace_hardirqs_off_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
/*
* Hardirqs were disabled:
*/
-void trace_hardirqs_off(void)
+void notrace trace_hardirqs_off_caller(unsigned long a0)
{
struct task_struct *curr = current;

@@ -2075,10 +2089,17 @@ void trace_hardirqs_off(void)
curr->hardirq_disable_ip = _RET_IP_;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_off_events);
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ time_hardirqs_off(CALLER_ADDR0, a0);
+#endif
} else
debug_atomic_inc(&redundant_hardirqs_off);
}

+void notrace trace_hardirqs_off(void) {
+ trace_hardirqs_off_caller(CALLER_ADDR0);
+}
+
EXPORT_SYMBOL(trace_hardirqs_off);

/*
Index: linux-compile.git/lib/tracing/trace_irqsoff.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/lib/tracing/trace_irqsoff.c 2008-01-09 17:04:10.000000000 -0500
@@ -0,0 +1,545 @@
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007 Steven Rostedt <***@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/mcount.h>
+
+#include "tracer.h"
+
+static struct tracing_trace irqsoff_trace;
+static struct tracing_trace max_tr __read_mostly;
+static DEFINE_PER_CPU(struct tracing_trace_cpu, irqsoff_trace_cpu);
+static DEFINE_PER_CPU(struct tracing_trace_cpu, max_data);
+static unsigned long preempt_max_latency = (cycle_t)ULONG_MAX;
+static unsigned long preempt_thresh;
+static __cacheline_aligned_in_smp DEFINE_MUTEX(max_mutex);
+
+/*
+ * max trace is switched with this buffer.
+ */
+static void *max_buffer;
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp unsigned long max_sequence;
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int notrace report_latency(cycle_t delta)
+{
+ if (preempt_thresh) {
+ if (delta < preempt_thresh)
+ return 0;
+ } else {
+ if (delta <= preempt_max_latency)
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /proc/latency_trace)
+ */
+static void update_max_tr(struct tracing_trace *tr,
+ struct tracing_trace_cpu *data,
+ int cpu)
+{
+ struct tracing_trace_cpu *save;
+ int i;
+
+#ifdef CONFIG_PREEMPT
+ WARN_ON(!preempt_count() && !irqs_disabled());
+#endif
+
+ max_tr.cpu = cpu;
+ save = max_tr.data[cpu];
+
+ /* clear out all the previous traces */
+ for_each_possible_cpu(i) {
+ if (max_tr.data[i]->trace)
+ max_tr.data[i]->trace = NULL;
+ }
+
+ max_tr.time_start = data->preempt_timestamp;
+
+ memcpy(save, data, sizeof(*data));
+ save->saved_latency = preempt_max_latency;
+
+ memcpy(save->comm, current->comm, TASK_COMM_LEN);
+ save->pid = current->pid;
+ save->uid = current->uid;
+ save->nice = current->static_prio - 20 - MAX_RT_PRIO;
+ save->policy = current->policy;
+ save->rt_priority = current->rt_priority;
+
+ /* from memcpy above: save->trace = data->trace */
+ data->trace = max_buffer;
+ max_buffer = save->trace;
+}
+
+cycle_t notrace usecs_to_cycles(unsigned long usecs);
+
+static void notrace
+check_critical_timing(struct tracing_trace *tr,
+ struct tracing_trace_cpu *data,
+ unsigned long parent_ip,
+ int cpu)
+{
+ unsigned long latency, t0, t1;
+ cycle_t T0, T1, T2, delta;
+ unsigned long flags;
+
+ /*
+ * usecs conversion is slow so we try to delay the conversion
+ * as long as possible:
+ */
+ T0 = data->preempt_timestamp;
+ T1 = now();
+ delta = T1-T0;
+
+ local_save_flags(flags);
+
+ if (!report_latency(delta))
+ goto out;
+
+ tracing_function_trace(tr, data, CALLER_ADDR0, parent_ip, flags);
+ /*
+ * Update the timestamp, because the trace entry above
+ * might change it (it can only get larger so the latency
+ * is fair to be reported):
+ */
+ T2 = now();
+
+ delta = T2-T0;
+
+ latency = cycles_to_usecs(delta);
+
+ if (data->critical_sequence != max_sequence ||
+ !mutex_trylock(&max_mutex))
+ goto out;
+
+ preempt_max_latency = delta;
+ t0 = cycles_to_usecs(T0);
+ t1 = cycles_to_usecs(T1);
+
+ data->critical_end = parent_ip;
+
+ update_max_tr(tr, data, cpu);
+
+ if (preempt_thresh)
+ printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section "
+ "violates %lu us threshold.\n"
+ " => started at timestamp %lu: ",
+ current->comm, current->pid,
+ raw_smp_processor_id(),
+ latency, cycles_to_usecs(preempt_thresh), t0);
+ else
+ printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum-latency "
+ "critical section.\n => started at timestamp %lu: ",
+ current->comm, current->pid,
+ raw_smp_processor_id(),
+ latency, t0);
+
+ print_symbol(KERN_CONT "<%s>\n", data->critical_start);
+ printk(KERN_CONT " => ended at timestamp %lu: ", t1);
+ print_symbol(KERN_CONT "<%s>\n", data->critical_end);
+ dump_stack();
+ t1 = cycles_to_usecs(now());
+ printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1);
+
+ max_sequence++;
+
+ mutex_unlock(&max_mutex);
+
+out:
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = now();
+ tracing_reset(data);
+ tracing_function_trace(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+static inline void notrace
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu = raw_smp_processor_id();
+ struct tracing_trace *tr = &irqsoff_trace;
+ struct tracing_trace_cpu *data = tr->data[cpu];
+ unsigned long flags;
+
+ if (unlikely(!data) || unlikely(!data->trace) ||
+ data->critical_start || atomic_read(&data->disabled))
+ return;
+
+ atomic_inc(&data->disabled);
+
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = now();
+ data->critical_start = ip;
+ tracing_reset(data);
+
+ local_save_flags(flags);
+ tracing_function_trace(tr, data, ip, parent_ip, flags);
+
+ atomic_dec(&data->disabled);
+}
+
+static inline void notrace
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu = raw_smp_processor_id();
+ struct tracing_trace *tr = &irqsoff_trace;
+ struct tracing_trace_cpu *data = tr->data[cpu];
+ unsigned long flags;
+
+ if (unlikely(!data) || unlikely(!data->trace) ||
+ !data->critical_start || atomic_read(&data->disabled))
+ return;
+
+ atomic_inc(&data->disabled);
+ local_save_flags(flags);
+ tracing_function_trace(tr, data, ip, parent_ip, flags);
+ check_critical_timing(tr, data, ip, cpu);
+ data->critical_start = 0;
+ atomic_dec(&data->disabled);
+}
+
+void notrace start_critical_timings(void)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ start_critical_timing(CALLER_ADDR0, 0);
+}
+
+void notrace stop_critical_timings(void)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ stop_critical_timing(CALLER_ADDR0, 0);
+}
+
+#ifdef CONFIG_LOCKDEP
+void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ stop_critical_timing(a0, a1);
+}
+
+void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void notrace trace_hardirqs_on(void)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ stop_critical_timing(CALLER_ADDR0, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void notrace trace_hardirqs_off(void)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ start_critical_timing(CALLER_ADDR0, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (irqs_disabled_flags(flags))
+ start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_LOCKDEP */
+
+
+static void notrace irqsoff_trace_call(unsigned long ip,
+ unsigned long parent_ip)
+{
+ struct tracing_trace *tr = &irqsoff_trace;
+ struct tracing_trace_cpu *data;
+ unsigned long flags;
+ int cpu;
+
+ local_save_flags(flags);
+
+ if (!irqs_disabled_flags(flags))
+ return;
+
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ atomic_inc(&data->disabled);
+
+ if (likely(atomic_read(&data->disabled) == 1))
+ tracing_function_trace(tr, data, ip, parent_ip, flags);
+
+ atomic_dec(&data->disabled);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void irqsoff_start(struct tracing_iterator *iter)
+{
+ mutex_lock(&max_mutex);
+}
+
+static void irqsoff_stop(struct tracing_iterator *iter)
+{
+ mutex_unlock(&max_mutex);
+}
+
+static ssize_t max_irq_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long *ptr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, 64, "%ld\n", *ptr == -1 ? : cycles_to_usecs(*ptr));
+ if (r > 64)
+ r = 64;
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, r);
+}
+static ssize_t max_irq_lat_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long *ptr = filp->private_data;
+ long val;
+ char buf[64];
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ val = simple_strtoul(buf, NULL, 10);
+
+ *ptr = usecs_to_cycles(val);
+
+ return cnt;
+}
+
+static struct file_operations max_irq_lat_fops = {
+ .open = tracing_open_generic,
+ .read = max_irq_lat_read,
+ .write = max_irq_lat_write,
+};
+
+static void irqsoff_trace_ctrl_update(struct tracing_trace *tr,
+ unsigned long val)
+{
+ val = !!val;
+
+ if (tr->ctrl ^ val) {
+ if (val)
+ register_mcount_function(irqsoff_trace_call);
+ else
+ clear_mcount_function();
+ tr->ctrl = val;
+ }
+}
+
+static __init void irqsoff_trace_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ irqsoff_trace.ctrl_update = irqsoff_trace_ctrl_update;
+
+ entry = debugfs_create_file("irqsoff_fn_trace_ctrl", 0644, d_tracer,
+ &irqsoff_trace, &tracing_ctrl_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs"
+ " 'irqsoff_fn_trace' entry\n");
+
+ entry = debugfs_create_file("max_irq_latency", 0644, d_tracer,
+ &preempt_max_latency, &max_irq_lat_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'ctrl' entry\n");
+
+ entry = debugfs_create_file("irq_thresh", 0644, d_tracer,
+ &preempt_thresh, &max_irq_lat_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'ctrl' entry\n");
+
+ entry = debugfs_create_file("irqsoff_trace", 0444, d_tracer,
+ &max_tr, &tracing_lt_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'irqsoff_trace' entry\n");
+}
+
+#endif /* CONFIG_DEBUGFS */
+
+static void notrace irqsoff_trace_open(struct tracing_iterator *iter)
+{
+#ifdef CONFIG_MCOUNT
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl)
+ clear_mcount_function();
+#endif
+}
+
+static void notrace irqsoff_trace_close(struct tracing_iterator *iter)
+{
+#ifdef CONFIG_MCOUNT
+ if (iter->tr->ctrl)
+ register_mcount_function(irqsoff_trace_call);
+#endif
+}
+
+__init static int trace_irqsoff_alloc_buffers(void)
+{
+ const int order = page_order(TRACING_NR_ENTRIES * TRACING_ENTRY_SIZE);
+ const unsigned long size = (1UL << order) << PAGE_SHIFT;
+ struct tracing_entry *array;
+ int i;
+
+ for_each_possible_cpu(i) {
+ irqsoff_trace.data[i] = &per_cpu(irqsoff_trace_cpu, i);
+ max_tr.data[i] = &per_cpu(max_data, i);
+
+ array = (struct tracing_entry *)
+ __get_free_pages(GFP_KERNEL, order);
+ if (array == NULL) {
+ printk(KERN_ERR "irqsoff tracer: failed to allocate"
+ " %ld bytes for trace buffer!\n", size);
+ goto free_buffers;
+ }
+ irqsoff_trace.data[i]->trace = array;
+ }
+
+ array = (struct tracing_entry *)
+ __get_free_pages(GFP_KERNEL, order);
+ if (array == NULL) {
+ printk(KERN_ERR "irqsoff tracer: failed to allocate"
+ " %ld bytes for trace buffer!\n", size);
+ goto free_buffers;
+ }
+ max_buffer = array;
+
+ /*
+ * Since we allocate by orders of pages, we may be able to
+ * round up a bit.
+ */
+ irqsoff_trace.entries = size / TRACING_ENTRY_SIZE;
+ max_tr.entries = irqsoff_trace.entries;
+ max_tr.start = irqsoff_start;
+ max_tr.stop = irqsoff_stop;
+
+ pr_info("irqs off tracer: %ld bytes allocated for %ld",
+ size, TRACING_NR_ENTRIES);
+ pr_info(" entries of %d bytes\n", (int)TRACING_ENTRY_SIZE);
+ pr_info(" actual entries %ld\n", irqsoff_trace.entries);
+
+ irqsoff_trace_init_debugfs();
+
+ irqsoff_trace.open = irqsoff_trace_open;
+ irqsoff_trace.close = irqsoff_trace_close;
+
+ return 0;
+
+ free_buffers:
+ for (i-- ; i >= 0; i--) {
+ if (irqsoff_trace.data[i] && irqsoff_trace.data[i]->trace) {
+ free_pages((unsigned long)irqsoff_trace.data[i]->trace,
+ order);
+ irqsoff_trace.data[i]->trace = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+device_initcall(trace_irqsoff_alloc_buffers);
Index: linux-compile.git/include/linux/mcount.h
===================================================================
--- linux-compile.git.orig/include/linux/mcount.h 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/include/linux/mcount.h 2008-01-09 17:04:10.000000000 -0500
@@ -1,15 +1,36 @@
#ifndef _LINUX_MCOUNT_H
#define _LINUX_MCOUNT_H

+#ifdef CONFIG_FRAME_POINTER
+/* TODO: need to fix this for ARM */
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
+# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+# define CALLER_ADDR4 0UL
+# define CALLER_ADDR5 0UL
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1);
+ extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1) do { } while (0)
+# define time_hardirqs_off(a0, a1) do { } while (0)
+#endif
+
#ifdef CONFIG_MCOUNT
extern int mcount_enabled;

#include <linux/linkage.h>

-#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
-#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
-
typedef void (*mcount_func_t)(unsigned long ip, unsigned long parent_ip);

extern void mcount(void);
Index: linux-compile.git/lib/tracing/Makefile
===================================================================
--- linux-compile.git.orig/lib/tracing/Makefile 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/lib/tracing/Makefile 2008-01-09 17:04:10.000000000 -0500
@@ -2,5 +2,6 @@ obj-$(CONFIG_MCOUNT) += libmcount.o

obj-$(CONFIG_TRACING) += tracer.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_function.o
+obj-$(CONFIG_CRITICAL_IRQSOFF_TIMING) += trace_irqsoff.o

libmcount-y := mcount.o
Index: linux-compile.git/lib/tracing/trace_function.c
===================================================================
--- linux-compile.git.orig/lib/tracing/trace_function.c 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/lib/tracing/trace_function.c 2008-01-09 17:04:10.000000000 -0500
@@ -23,69 +23,40 @@ static notrace void function_trace_reset
int cpu;

tr->time_start = now();
- tr->saved_latency = 0;
- tr->critical_start = 0;
- tr->critical_end = 0;
-
- for_each_online_cpu(cpu) {
- tr->data[cpu]->trace_idx = 0;
- atomic_set(&tr->data[cpu]->underrun, 0);
- }
-}

-#ifdef CONFIG_DEBUG_FS
-static ssize_t function_trace_ctrl_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct tracing_trace *tr = filp->private_data;
- char buf[16];
- int r;
-
- r = sprintf(buf, "%ld\n", tr->ctrl);
- return simple_read_from_buffer(ubuf, cnt, ppos,
- buf, r);
+ for_each_online_cpu(cpu)
+ tracing_reset(tr->data[cpu]);
}

static void notrace function_trace_call(unsigned long ip,
unsigned long parent_ip)
{
struct tracing_trace *tr = &function_trace;
+ struct tracing_trace_cpu *data;
+ unsigned long flags;
+ int cpu;

- tracing_function_trace(tr, ip, parent_ip);
-}
-
-static ssize_t function_trace_ctrl_write(struct file *filp,
- const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct tracing_trace *tr = filp->private_data;
- long val;
- char buf[16];
-
- if (cnt > 15)
- cnt = 15;
+ raw_local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ atomic_inc(&data->disabled);

- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+ if (likely(atomic_read(&data->disabled) == 1))
+ tracing_function_trace(tr, data, ip, parent_ip, flags);

- buf[cnt] = 0;
+ atomic_dec(&data->disabled);
+ raw_local_irq_restore(flags);
+}

- val = !!simple_strtoul(buf, NULL, 10);
+#ifdef CONFIG_DEBUG_FS
+static void function_trace_ctrl_update(struct tracing_trace *tr,
+ unsigned long val)
+{
+ val = !!val;

/* When starting a new trace, reset the buffers */
if (val)
function_trace_reset(tr);
- else {
- /* pretty meaningless for now */
- tr->time_end = now();
- tr->saved_latency = tr->time_end - tr->time_start;
- memcpy(tr->comm, current->comm, TASK_COMM_LEN);
- tr->pid = current->pid;
- tr->uid = current->uid;
- tr->nice = current->static_prio - 20 - MAX_RT_PRIO;
- tr->policy = current->policy;
- tr->rt_priority = current->rt_priority;
- }

if (tr->ctrl ^ val) {
if (val)
@@ -94,18 +65,8 @@ static ssize_t function_trace_ctrl_write
clear_mcount_function();
tr->ctrl = val;
}
-
- filp->f_pos += cnt;
-
- return cnt;
}

-static struct file_operations function_trace_ctrl_fops = {
- .open = tracing_open_generic,
- .read = function_trace_ctrl_read,
- .write = function_trace_ctrl_write,
-};
-
static __init void function_trace_init_debugfs(void)
{
struct dentry *d_tracer;
@@ -113,8 +74,10 @@ static __init void function_trace_init_d

d_tracer = tracing_init_dentry();

+ function_trace.ctrl_update = function_trace_ctrl_update;
+
entry = debugfs_create_file("fn_trace_ctrl", 0644, d_tracer,
- &function_trace, &function_trace_ctrl_fops);
+ &function_trace, &tracing_ctrl_fops);
if (!entry)
pr_warning("Could not create debugfs 'ctrl' entry\n");

@@ -154,12 +117,6 @@ static void function_trace_close(struct
register_mcount_function(function_trace_call);
}

-static notrace int page_order(const unsigned long size)
-{
- const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
- return ilog2(roundup_pow_of_two(nr_pages));
-}
-
__init static int function_trace_alloc_buffers(void)
{
const int order = page_order(TRACING_NR_ENTRIES * TRACING_ENTRY_SIZE);
@@ -187,7 +144,7 @@ __init static int function_trace_alloc_b

pr_info("function tracer: %ld bytes allocated for %ld",
size, TRACING_NR_ENTRIES);
- pr_info(" entries of %d bytes\n", TRACING_ENTRY_SIZE);
+ pr_info(" entries of %d bytes\n", (int)TRACING_ENTRY_SIZE);
pr_info(" actual entries %ld\n", function_trace.entries);

function_trace_init_debugfs();
Index: linux-compile.git/lib/tracing/tracer.c
===================================================================
--- linux-compile.git.orig/lib/tracing/tracer.c 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/lib/tracing/tracer.c 2008-01-09 17:04:10.000000000 -0500
@@ -44,6 +44,12 @@ enum trace_flag_type {
TRACE_FLAG_IRQS_HARD_OFF = 0x20,
};

+void notrace tracing_reset(struct tracing_trace_cpu *data)
+{
+ data->trace_idx = 0;
+ atomic_set(&data->underrun, 0);
+}
+
static inline notrace struct tracing_entry *
tracing_get_trace_entry(struct tracing_trace *tr,
struct tracing_trace_cpu *data)
@@ -89,30 +95,18 @@ tracing_generic_entry_update(struct trac
}

notrace void tracing_function_trace(struct tracing_trace *tr,
+ struct tracing_trace_cpu *data,
unsigned long ip,
- unsigned long parent_ip)
+ unsigned long parent_ip,
+ unsigned long flags)
{
- unsigned long flags;
- int cpu;
-
- raw_local_irq_save(flags);
- cpu = raw_smp_processor_id();
-
- atomic_inc(&tr->data[cpu]->disabled);
- if (likely(atomic_read(&tr->data[cpu]->disabled) == 1)) {
- struct tracing_entry *entry;
- struct tracing_trace_cpu *data = tr->data[cpu];
-
- entry = tracing_get_trace_entry(tr, data);
- tracing_generic_entry_update(entry, flags);
- entry->type = TRACE_FN;
- entry->fn.ip = ip;
- entry->fn.parent_ip = parent_ip;
- }
-
- atomic_dec(&tr->data[cpu]->disabled);
+ struct tracing_entry *entry;

- raw_local_irq_restore(flags);
+ entry = tracing_get_trace_entry(tr, data);
+ tracing_generic_entry_update(entry, flags);
+ entry->type = TRACE_FN;
+ entry->fn.ip = ip;
+ entry->fn.parent_ip = parent_ip;
}

#ifdef CONFIG_DEBUG_FS
@@ -231,6 +225,10 @@ static void *s_start(struct seq_file *m,
loff_t l = 0;
int i;

+ /* let the tracer grab locks here if needed */
+ if (iter->tr->start)
+ iter->tr->start(iter);
+
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -252,6 +250,11 @@ static void *s_start(struct seq_file *m,

static void s_stop(struct seq_file *m, void *p)
{
+ struct tracing_iterator *iter = m->private;
+
+ /* let the tracer release locks here if needed */
+ if (iter->tr->stop)
+ iter->tr->stop(iter);
}

#ifdef CONFIG_KALLSYMS
@@ -303,6 +306,7 @@ static void notrace print_trace_header(s
struct tracing_iterator *iter)
{
struct tracing_trace *tr = iter->tr;
+ struct tracing_trace_cpu *data = tr->data[tr->cpu];
unsigned long underruns = 0;
unsigned long underrun;
unsigned long entries = 0;
@@ -326,7 +330,7 @@ static void notrace print_trace_header(s
"---------------------------------\n");
seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
" (M:%s VP:%d, KP:%d, SP:%d HP:%d",
- cycles_to_usecs(tr->saved_latency),
+ cycles_to_usecs(data->saved_latency),
entries,
(entries + underruns),
smp_processor_id(),
@@ -349,15 +353,15 @@ static void notrace print_trace_header(s
seq_puts(m, " -----------------\n");
seq_printf(m, " | task: %.16s-%d "
"(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
- tr->comm, tr->pid, tr->uid, tr->nice,
- tr->policy, tr->rt_priority);
+ data->comm, data->pid, data->uid, data->nice,
+ data->policy, data->rt_priority);
seq_puts(m, " -----------------\n");

- if (tr->critical_start) {
+ if (data->critical_start) {
seq_puts(m, " => started at: ");
- seq_print_ip_sym(m, tr->critical_start, sym_only);
+ seq_print_ip_sym(m, data->critical_start, sym_only);
seq_puts(m, "\n => ended at: ");
- seq_print_ip_sym(m, tr->critical_end, sym_only);
+ seq_print_ip_sym(m, data->critical_end, sym_only);
seq_puts(m, "\n");
}

@@ -700,6 +704,49 @@ static struct file_operations tracing_it
.write = tracing_iter_ctrl_write,
};

+static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct tracing_trace *tr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = sprintf(buf, "%ld\n", tr->ctrl);
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, r);
+}
+
+static ssize_t tracing_ctrl_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct tracing_trace *tr = filp->private_data;
+ long val;
+ char buf[64];
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ val = simple_strtoul(buf, NULL, 10);
+
+ tr->ctrl_update(tr, val);
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+struct file_operations tracing_ctrl_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_ctrl_read,
+ .write = tracing_ctrl_write,
+};
+
static struct dentry *d_tracer;

struct dentry *tracing_init_dentry(void)
Index: linux-compile.git/lib/tracing/tracer.h
===================================================================
--- linux-compile.git.orig/lib/tracing/tracer.h 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/lib/tracing/tracer.h 2008-01-09 17:04:10.000000000 -0500
@@ -26,6 +26,17 @@ struct tracing_trace_cpu {
unsigned long trace_idx;
atomic_t disabled;
atomic_t underrun;
+ unsigned long saved_latency;
+ unsigned long critical_start;
+ unsigned long critical_end;
+ unsigned long critical_sequence;
+ unsigned long nice;
+ unsigned long policy;
+ unsigned long rt_priority;
+ cycle_t preempt_timestamp;
+ pid_t pid;
+ uid_t uid;
+ char comm[TASK_COMM_LEN];
};

struct tracing_iterator;
@@ -33,19 +44,14 @@ struct tracing_iterator;
struct tracing_trace {
unsigned long entries;
long ctrl;
- char comm[TASK_COMM_LEN];
- pid_t pid;
- uid_t uid;
- unsigned long nice;
- unsigned long policy;
- unsigned long rt_priority;
- unsigned long saved_latency;
- unsigned long critical_start;
- unsigned long critical_end;
- unsigned long long time_start;
- unsigned long long time_end;
+ int cpu;
+ cycle_t time_start;
void (*open)(struct tracing_iterator *iter);
void (*close)(struct tracing_iterator *iter);
+ void (*start)(struct tracing_iterator *iter);
+ void (*stop)(struct tracing_iterator *iter);
+ void (*ctrl_update)(struct tracing_trace *tr,
+ unsigned long val);
struct tracing_trace_cpu *data[NR_CPUS];
};

@@ -62,18 +68,28 @@ struct tracing_iterator {
#define TRACING_ENTRY_SIZE sizeof(struct tracing_entry)
#define TRACING_NR_ENTRIES (65536UL)

+void notrace tracing_reset(struct tracing_trace_cpu *data);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *tracing_init_dentry(void);
void tracing_function_trace(struct tracing_trace *tr,
+ struct tracing_trace_cpu *data,
unsigned long ip,
- unsigned long parent_ip);
+ unsigned long parent_ip,
+ unsigned long flags);

extern struct file_operations tracing_fops;
extern struct file_operations tracing_lt_fops;
+extern struct file_operations tracing_ctrl_fops;

static inline notrace cycle_t now(void)
{
return get_monotonic_cycles();
}

+static inline notrace int page_order(const unsigned long size)
+{
+ const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+ return ilog2(roundup_pow_of_two(nr_pages));
+}
+
#endif /* _LINUX_MCOUNT_TRACER_H */
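
The new page_order() helper turns a byte count into the order argument expected
by the page allocator: round the size up to whole pages, round the page count up
to a power of two, and take the log2. A stand-alone sketch of the same math, with
PAGE_SIZE assumed to be 4096 and the kernel helpers replaced by trivial
equivalents:

	/* Stand-alone sketch of the page_order() math; PAGE_SIZE assumed 4096,
	 * kernel helpers replaced by minimal userspace equivalents. */
	#include <stdio.h>

	#define PAGE_SIZE		4096UL
	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	static unsigned long roundup_pow_of_two(unsigned long n)
	{
		unsigned long r = 1;

		while (r < n)
			r <<= 1;
		return r;
	}

	static int ilog2(unsigned long n)
	{
		int l = -1;

		while (n) {
			n >>= 1;
			l++;
		}
		return l;
	}

	static int page_order(const unsigned long size)
	{
		const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);

		return ilog2(roundup_pow_of_two(nr_pages));
	}

	int main(void)
	{
		/* 100000 bytes -> 25 pages -> rounded to 32 -> order 5 */
		printf("order(100000) = %d\n", page_order(100000));
		printf("order(4096)   = %d\n", page_order(4096));
		return 0;
	}

So, for example, a 100000-byte buffer needs 25 pages, which rounds up to a
32-page (order-5) allocation.
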
Index: linux-compile.git/kernel/fork.c
===================================================================
--- linux-compile.git.orig/kernel/fork.c 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/kernel/fork.c 2008-01-09 17:04:10.000000000 -0500
@@ -1010,7 +1010,7 @@ static struct task_struct *copy_process(

rt_mutex_init_task(p);

-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
Index: linux-compile.git/arch/x86/kernel/process_64.c
===================================================================
--- linux-compile.git.orig/arch/x86/kernel/process_64.c 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/arch/x86/kernel/process_64.c 2008-01-09 17:04:10.000000000 -0500
@@ -233,7 +233,10 @@ void cpu_idle (void)
*/
local_irq_disable();
enter_idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
idle();
+ start_critical_timings();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
Index: linux-compile.git/arch/x86/lib/thunk_64.S
===================================================================
--- linux-compile.git.orig/arch/x86/lib/thunk_64.S 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/arch/x86/lib/thunk_64.S 2008-01-09 17:04:10.000000000 -0500
@@ -47,8 +47,22 @@
thunk __up_wakeup,__up

#ifdef CONFIG_TRACE_IRQFLAGS
- thunk trace_hardirqs_on_thunk,trace_hardirqs_on
- thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+ /* put return address in rdi (arg1) */
+ .macro thunk_ra name,func
+ .globl \name
+\name:
+ CFI_STARTPROC
+ SAVE_ARGS
+ /* SAVE_ARGS pushes 9 elements */
+ /* the next element would be the rip */
+ movq 9*8(%rsp), %rdi
+ call \func
+ jmp restore
+ CFI_ENDPROC
+ .endm
+
+ thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+ thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
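
The thunk_ra macro differs from the stock thunk only in that it fishes the
thunk's own return address off the stack (9*8(%rsp) after SAVE_ARGS) and passes
it as the first argument, so the *_caller variants see which call site toggled
interrupts rather than the thunk itself. Roughly, in C terms (a sketch only; the
real entry points must stay in assembly so SAVE_ARGS can preserve the argument
registers for the interrupted code path):

	extern void trace_hardirqs_on_caller(unsigned long ip);

	/* Rough C-level picture of the thunk_ra expansion -- a sketch, not
	 * the real symbol; the actual thunk is the assembly above. */
	void trace_hardirqs_on_thunk_sketch(void)
	{
		/* the thunk's return address is the instruction right after
		 * the "call" in whoever invoked the thunk */
		trace_hardirqs_on_caller((unsigned long)__builtin_return_address(0));
	}
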
Index: linux-compile.git/include/linux/irqflags.h
===================================================================
--- linux-compile.git.orig/include/linux/irqflags.h 2008-01-09 17:03:59.000000000 -0500
+++ linux-compile.git/include/linux/irqflags.h 2008-01-09 17:36:00.000000000 -0500
@@ -12,10 +12,21 @@
#define _LINUX_TRACE_IRQFLAGS_H

#ifdef CONFIG_TRACE_IRQFLAGS
- extern void trace_hardirqs_on(void);
- extern void trace_hardirqs_off(void);
+# include <linux/mcount.h>
+ extern void trace_hardirqs_on_caller(unsigned long ip);
+ extern void trace_hardirqs_off_caller(unsigned long ip);
extern void trace_softirqs_on(unsigned long ip);
extern void trace_softirqs_off(unsigned long ip);
+ extern void trace_hardirqs_on(void);
+ extern void trace_hardirqs_off(void);
+ static inline void notrace __trace_hardirqs_on(void)
+ {
+ trace_hardirqs_on_caller(CALLER_ADDR0);
+ }
+ static inline void notrace __trace_hardirqs_off(void)
+ {
+ trace_hardirqs_off_caller(CALLER_ADDR0);
+ }
# define trace_hardirq_context(p) ((p)->hardirq_context)
# define trace_softirq_context(p) ((p)->softirq_context)
# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled)
@@ -28,6 +39,8 @@
#else
# define trace_hardirqs_on() do { } while (0)
# define trace_hardirqs_off() do { } while (0)
+# define __trace_hardirqs_on() do { } while (0)
+# define __trace_hardirqs_off() do { } while (0)
# define trace_softirqs_on(ip) do { } while (0)
# define trace_softirqs_off(ip) do { } while (0)
# define trace_hardirq_context(p) 0
@@ -41,24 +54,32 @@
# define INIT_TRACE_IRQFLAGS
#endif

+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ extern void stop_critical_timings(void);
+ extern void start_critical_timings(void);
+#else
+# define stop_critical_timings() do { } while (0)
+# define start_critical_timings() do { } while (0)
+#endif
+
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT

#include <asm/irqflags.h>

#define local_irq_enable() \
- do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
+ do { __trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
#define local_irq_disable() \
- do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
+ do { raw_local_irq_disable(); __trace_hardirqs_off(); } while (0)
#define local_irq_save(flags) \
- do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0)
+ do { raw_local_irq_save(flags); __trace_hardirqs_off(); } while (0)

#define local_irq_restore(flags) \
do { \
if (raw_irqs_disabled_flags(flags)) { \
raw_local_irq_restore(flags); \
- trace_hardirqs_off(); \
+ __trace_hardirqs_off(); \
} else { \
- trace_hardirqs_on(); \
+ __trace_hardirqs_on(); \
raw_local_irq_restore(flags); \
} \
} while (0)
@@ -76,7 +97,7 @@
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
#define safe_halt() \
do { \
- trace_hardirqs_on(); \
+ __trace_hardirqs_on(); \
raw_safe_halt(); \
} while (0)
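
With the inline __trace_hardirqs_on/off() wrappers, the irq macros hand the
tracer a caller address directly. Assuming CALLER_ADDR0 in <linux/mcount.h> is
the usual (unsigned long)__builtin_return_address(0), a user of the macros
expands roughly to:

	/* Sketch of the expansion inside a user of the macros, under the
	 * assumption about CALLER_ADDR0 stated above. */
	static void example_irqs_off_section(void)
	{
		/* local_irq_disable() */
		raw_local_irq_disable();
		trace_hardirqs_off_caller((unsigned long)__builtin_return_address(0));

		/* ... code that runs with interrupts off ... */

		/* local_irq_enable() */
		trace_hardirqs_on_caller((unsigned long)__builtin_return_address(0));
		raw_local_irq_enable();
	}
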

Index: linux-compile.git/include/asm-x86/irqflags_32.h
===================================================================
--- linux-compile.git.orig/include/asm-x86/irqflags_32.h 2007-11-12 09:25:50.000000000 -0500
+++ linux-compile.git/include/asm-x86/irqflags_32.h 2008-01-09 17:04:10.000000000 -0500
@@ -139,9 +139,9 @@ static inline int raw_irqs_disabled(void
static inline void trace_hardirqs_fixup_flags(unsigned long flags)
{
if (raw_irqs_disabled_flags(flags))
- trace_hardirqs_off();
+ __trace_hardirqs_off();
else
- trace_hardirqs_on();
+ __trace_hardirqs_on();
}

static inline void trace_hardirqs_fixup(void)
Index: linux-compile.git/include/asm-x86/irqflags_64.h
===================================================================
--- linux-compile.git.orig/include/asm-x86/irqflags_64.h 2007-11-12 09:25:50.000000000 -0500
+++ linux-compile.git/include/asm-x86/irqflags_64.h 2008-01-09 17:07:15.000000000 -0500
@@ -120,9 +120,9 @@ static inline int raw_irqs_disabled(void
static inline void trace_hardirqs_fixup_flags(unsigned long flags)
{
if (raw_irqs_disabled_flags(flags))
- trace_hardirqs_off();
+ __trace_hardirqs_off();
else
- trace_hardirqs_on();
+ __trace_hardirqs_on();
}

static inline void trace_hardirqs_fixup(void)
--
Daniel Walker
2008-01-10 04:10:11 UTC
Permalink
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
idle();
+ start_critical_timings();
Is there another way to hook into the idle routine? Right above these
lines there is a call to "enter_idle()" which I'm wondering about ..
Would be nice to have some sort of generic hook or method for this ..

Daniel

Steven Rostedt
2008-01-10 14:50:13 UTC
Permalink
Post by Daniel Walker
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
idle();
+ start_critical_timings();
Is there another way to hook into the idle routine? Right above these
lines there is a call to "enter_idle()" which I'm wondering about ..
Would be nice to have some sort of generic hook or method for this ..
enter_idle isn't generic.

-- Steve
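
For what it's worth, the declarations added to irqflags.h are already
arch-neutral; only the placement of the calls is per-arch. A hypothetical
wrapper (not part of this patch set) that each cpu_idle() loop could call
instead of open-coding the pair around its idle() routine:

	/* Hypothetical helper, not in this patch set: brackets an arch's
	 * idle() call so time spent idle with irqs off is not reported as
	 * an irqs-off latency. */
	static inline void idle_untimed(void (*idle)(void))
	{
		stop_critical_timings();
		idle();
		start_critical_timings();
	}

The process_64.c hunk would then shrink to a single idle_untimed(idle) call.
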
