Discussion:
[PATCH 10/13] x86_64: make pda a percpu variable
Tejun Heo
2009-01-13 10:39:23 UTC
Now that the pda is allocated in the percpu area, it can easily be made a
proper percpu variable. Do so by defining the per-cpu symbol from the
linker script and declaring it in C code for SMP, and by simply defining it
with DEFINE_PER_CPU() for UP. This cleans up the code and brings the SMP
and UP paths a bit closer together.
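
For those skimming the diff, a minimal sketch (not part of the patch) of
what the accessor boils down to after the change; pda_cpunumber() below is
a made-up helper used purely for illustration:

	/* sketch: cpu_pda() once __pda is a proper percpu variable */
	DECLARE_PER_CPU(struct x8664_pda, __pda);	/* from asm/pda.h */
	#define cpu_pda(cpu)	(&per_cpu(__pda, cpu))	/* was _cpu_pda[cpu] */

	static inline int pda_cpunumber(int cpu)
	{
		return cpu_pda(cpu)->cpunumber;		/* usage unchanged */
	}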

A lot of this patch is taken from Mike Travis' "x86_64: Fold pda into
per cpu area" and "x86_64: Reference zero-based percpu variables
offset from gs" patches.

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/include/asm/pda.h | 5 +++--
arch/x86/kernel/cpu/common.c | 3 ---
arch/x86/kernel/head64.c | 10 ----------
arch/x86/kernel/head_64.S | 5 +++--
arch/x86/kernel/setup_percpu.c | 16 ++++++++++++++--
arch/x86/kernel/vmlinux_64.lds.S | 4 +++-
6 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index e91558e..66ae104 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -7,6 +7,7 @@
#include <linux/cache.h>
#include <linux/threads.h>
#include <asm/page.h>
+#include <asm/percpu.h>

/* Per processor datastructure. %gs points to it while the kernel runs */
struct x8664_pda {
@@ -39,10 +40,10 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

-extern struct x8664_pda *_cpu_pda[NR_CPUS];
+DECLARE_PER_CPU(struct x8664_pda, __pda);
extern void pda_init(int);

-#define cpu_pda(i) (_cpu_pda[i])
+#define cpu_pda(cpu) (&per_cpu(__pda, cpu))

/*
* There is no fast way to get the base address of the PDA, all the accesses
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cdd419a..bd38a0f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -859,9 +859,6 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

#ifdef CONFIG_X86_64
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };

static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9194074..71b6f6e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,18 +26,8 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>

-#ifndef CONFIG_SMP
-/* boot cpu pda, referenced by head_64.S to initialize %gs on UP */
-struct x8664_pda _boot_cpu_pda __read_mostly;
-#endif
-
void __init x86_64_init_pda(void)
{
-#ifdef CONFIG_SMP
- cpu_pda(0) = (void *)__per_cpu_load;
-#else
- cpu_pda(0) = &_boot_cpu_pda;
-#endif
pda_init(0);
}

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 519185f..ce5975c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>

#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -250,7 +251,7 @@ ENTRY(secondary_startup_64)
* secondary CPU,initial_gs should be set to its pda address
* before the CPU runs this code.
*
- * On UP, initial_gs points to _boot_cpu_pda and doesn't
+ * On UP, initial_gs points to PER_CPU_VAR(__pda) and doesn't
* change.
*/
movl $MSR_GS_BASE,%ecx
@@ -284,7 +285,7 @@ ENTRY(secondary_startup_64)
#ifdef CONFIG_SMP
.quad __per_cpu_load
#else
- .quad _boot_cpu_pda
+ .quad PER_CPU_VAR(__pda)
#endif
__FINITDATA

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 76a2498..9edc081 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -66,6 +66,16 @@ static void __init setup_node_to_cpumask_map(void);
static inline void setup_node_to_cpumask_map(void) { }
#endif

+/*
+ * Define load_pda_offset() and per-cpu __pda for x86_64.
+ * load_pda_offset() is responsible for loading the offset of pda into
+ * %gs.
+ *
+ * On SMP, pda offset also duals as percpu base address and thus it
+ * should be at the start of per-cpu area. To achieve this, it's
+ * preallocated in vmlinux_64.lds.S directly instead of using
+ * DEFINE_PER_CPU().
+ */
#ifdef CONFIG_X86_64
void __cpuinit load_pda_offset(int cpu)
{
@@ -74,6 +84,10 @@ void __cpuinit load_pda_offset(int cpu)
wrmsrl(MSR_GS_BASE, cpu_pda(cpu));
mb();
}
+#ifndef CONFIG_SMP
+DEFINE_PER_CPU(struct x8664_pda, __pda);
+#endif
+EXPORT_PER_CPU_SYMBOL(__pda);
#endif

#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
@@ -161,8 +175,6 @@ void __init setup_per_cpu_areas(void)
memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
#ifdef CONFIG_X86_64
- cpu_pda(cpu) = (void *)ptr;
-
/*
* CPU0 modified pda in the init data area, reload pda
* offset for CPU0 and clear the area for others.
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 962f21f..d2a0baa 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -217,10 +217,12 @@ SECTIONS
* percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
* output PHDR, so the next output section - __data_nosave - should
* switch it back to data.init. Also, pda should be at the head of
- * percpu area. Preallocate it.
+ * percpu area. Preallocate it and define the percpu offset symbol
+ * so that it can be accessed as a percpu variable.
*/
. = ALIGN(PAGE_SIZE);
PERCPU_VADDR_PREALLOC(0, :percpu, pda_size)
+ per_cpu____pda = __per_cpu_start;
#else
PERCPU(PAGE_SIZE)
#endif
--
1.5.6

Tejun Heo
2009-01-13 10:39:40 UTC
The _cpu_pda array first uses statically allocated storage in data.init
and then switches to bootmem-allocated storage to conserve space. However,
once the pda area is folded into the percpu area, the _cpu_pda array will
be removed completely. Drop the reallocation step now to simplify the code
for the soon-to-follow changes.
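
In effect the pointer table becomes a plain static array and only the
secondary-CPU pdas still come from bootmem; a simplified sketch of the
resulting shape (condensed from the hunks below, not literal code):

	struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;  /* static table */

	static void __init setup_cpu_pda_map(void)
	{
		unsigned long size = roundup(sizeof(struct x8664_pda),
					     cache_line_size());
		char *pda = alloc_bootmem(size * (nr_cpu_ids - 1));
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == 0)
				continue;	/* boot cpu keeps its static pda */
			cpu_pda(cpu) = (struct x8664_pda *)pda;
			cpu_pda(cpu)->in_bootmem = 1;
			pda += size;
		}
	}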

Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/include/asm/pda.h | 3 ++-
arch/x86/kernel/cpu/common.c | 2 +-
arch/x86/kernel/head64.c | 12 ------------
arch/x86/kernel/setup_percpu.c | 14 +++-----------
4 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index cbd3f48..2d5b49c 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -5,6 +5,7 @@
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/cache.h>
+#include <linux/threads.h>
#include <asm/page.h>

/* Per processor datastructure. %gs points to it while the kernel runs */
@@ -39,7 +40,7 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

-extern struct x8664_pda **_cpu_pda;
+extern struct x8664_pda *_cpu_pda[NR_CPUS];
extern void pda_init(int);

#define cpu_pda(i) (_cpu_pda[i])
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 42e0853..5c3d6b3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -859,7 +859,7 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(_cpu_pda);

struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 54f52cf..84b96b9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -29,20 +29,8 @@
/* boot cpu pda, referenced by head_64.S to initialize %gs for boot CPU */
struct x8664_pda _boot_cpu_pda __read_mostly;

-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
void __init x86_64_init_pda(void)
{
- _cpu_pda = __cpu_pda;
cpu_pda(0) = &_boot_cpu_pda;
cpu_pda(0)->data_offset =
(unsigned long)(__per_cpu_load - __per_cpu_start);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8a22c94..bd47bbd 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -114,7 +114,6 @@ static inline void setup_cpu_pda_map(void) { }
static void __init setup_cpu_pda_map(void)
{
char *pda;
- struct x8664_pda **new_cpu_pda;
unsigned long size;
int cpu;

@@ -122,28 +121,21 @@ static void __init setup_cpu_pda_map(void)

/* allocate cpu_pda array and pointer table */
{
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
unsigned long asize = size * (nr_cpu_ids - 1);

- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
+ pda = alloc_bootmem(asize);
}

/* initialize pointer table to static pda's */
for_each_possible_cpu(cpu) {
if (cpu == 0) {
/* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
continue;
}
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
+ cpu_pda(cpu) = (struct x8664_pda *)pda;
+ cpu_pda(cpu)->in_bootmem = 1;
pda += size;
}
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
}
#endif
--
1.5.6

Tejun Heo
2009-01-13 10:39:56 UTC
This patch makes percpu symbols zero-based on x86_32 SMP by using
PERCPU_VADDR() with a 0 vaddr in vmlinux_32.lds.S. A new PHDR is added
because the existing ones cannot contain sections near address zero.

The following adjustments have been made to accommodate the address
change.

* The code that locates the percpu gdt_page in head_32.S is updated to add
the load address to the gdt_page offset.

* For the boot CPU, __KERNEL_DS can no longer be used for %fs. Initialize
gdt_page.gdt[GDT_ENTRY_PERCPU] with the flags and limit, set the base
address to __per_cpu_load from head_32.S and load it. The base address
needs to be set manually because the linker can't shift bits as required
for segment descriptors (see the sketch after this list).

* __per_cpu_offset[0] is now initialized to __per_cpu_load, like on
x86_64.
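
The bit shuffling done by hand in head_32.S amounts to packing a 32-bit
base address into the split base fields of a GDT descriptor. A hedged C
sketch of the same operation (desc_set_base() is a made-up helper; the
offsets are the standard x86 descriptor layout):

	/* sketch: pack a 32-bit base into an 8-byte GDT entry, as head_32.S
	 * does by hand because the linker can't emit the shifted relocations */
	static void desc_set_base(unsigned char *desc, unsigned int base)
	{
		desc[2] = base & 0xff;			/* base  7:0  */
		desc[3] = (base >>  8) & 0xff;		/* base 15:8  */
		desc[4] = (base >> 16) & 0xff;		/* base 23:16 */
		desc[7] = (base >> 24) & 0xff;		/* base 31:24 */
	}

	/* what the new boot-CPU code effectively computes, with gdt_page
	 * being the boot CPU's copy at __per_cpu_load + per_cpu__gdt_page:
	 *	desc_set_base((unsigned char *)&gdt_page.gdt[GDT_ENTRY_PERCPU],
	 *		      __per_cpu_load);
	 */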

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/kernel/cpu/common.c | 8 ++++++++
arch/x86/kernel/head_32.S | 37 ++++++++++++++++++++++++++++++++++---
arch/x86/kernel/setup_percpu.c | 4 ----
arch/x86/kernel/vmlinux_32.lds.S | 16 +++++++++++++++-
4 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index bd38a0f..376e142 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -90,7 +90,15 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },

[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+#ifdef CONFIG_SMP
+ /*
+ * Linker can't handle the address bit shifting. Address will
+ * be set in head_32.S for boot CPU and init_gdt for others.
+ */
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+#else
[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
+#endif
} };
#endif
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4e..3284199 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,6 +19,7 @@
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>

/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -424,12 +425,39 @@ is386: movl $2,%ecx # set MP
movl %eax,%cr0

call check_x87
+#ifdef CONFIG_SMP
+ /*
+ * early_gdt_base should point to the gdt_page in static percpu init
+ * data area. Computing this requires two symbols - __per_cpu_load
+ * and per_cpu__gdt_page. As linker can't do no such relocation, do
+ * it by hand. As early_gdt_descr is manipulated by C code for
+ * secondary CPUs, this should be done only once for the boot CPU
+ * when early_gdt_descr_base contains zero.
+ *
+ * Also, manually load __per_cpu_load into address fields of PERCPU
+ * segment descriptor for boot CPU. Linker can't do it.
+ */
+ movl early_gdt_descr_base, %eax
+ testl %eax, %eax
+ jnz 3f
+ movl $__per_cpu_load, %eax
+ movl %eax, %ecx
+ addl $per_cpu__gdt_page, %eax
+ movl %eax, early_gdt_descr_base
+
+ movw %cx, 8 * GDT_ENTRY_PERCPU + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_PERCPU + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_PERCPU + 7(%eax)
+3:
+#endif /* CONFIG_SMP */
lgdt early_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
- movl %eax,%fs # gets reset once there's real percpu
+ movl $(__KERNEL_PERCPU),%eax
+ movl %eax,%fs # will be reloaded for boot cpu

movl $(__USER_DS),%eax # DS/ES contains default USER segment
movl %eax,%ds
@@ -446,8 +474,6 @@ is386: movl $2,%ecx # set MP
movb $1, ready
cmpb $0,%cl # the first CPU calls start_kernel
je 1f
- movl $(__KERNEL_PERCPU), %eax
- movl %eax,%fs # set this cpu's percpu
movl (stack_start), %esp
1:
#endif /* CONFIG_SMP */
@@ -702,7 +728,12 @@ idt_descr:
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
+#ifdef CONFIG_SMP
+early_gdt_descr_base:
+ .long 0x00000000
+#else
.long per_cpu__gdt_page /* Overwritten for secondary CPUs */
+#endif

/*
* The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9edc081..6f47227 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -119,13 +119,9 @@ static void __init setup_per_cpu_maps(void)
#endif
}

-#ifdef CONFIG_X86_64
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
[0] = (unsigned long)__per_cpu_load,
};
-#else
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-#endif
EXPORT_SYMBOL(__per_cpu_offset);

/*
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 3eba7f7..03abadf 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -24,6 +24,9 @@ jiffies = jiffies_64;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
@@ -178,7 +181,18 @@ SECTIONS
__initramfs_end = .;
}
#endif
+
+#ifdef CONFIG_SMP
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - bss - should switch it
+ * back to data.
+ */
+ . = ALIGN(PAGE_SIZE);
+ PERCPU_VADDR(0, :percpu)
+#else
PERCPU(PAGE_SIZE)
+#endif
. = ALIGN(PAGE_SIZE);
/* freed after init ends here */

@@ -193,7 +207,7 @@ SECTIONS
/* This is where the kernel creates the early boot page tables */
. = ALIGN(PAGE_SIZE);
pg0 = . ;
- }
+ } :data /* switch back to data, see PERCPU_VADDR() above */

/* Sections to be discarded */
/DISCARD/ : {
--
1.5.6

Rusty Russell
2009-01-14 00:19:23 UTC
Post by Tejun Heo
This patch makes percpu symbols zerobased on x86_32 SMP by using
PERCPU_VADDR() with 0 vaddr in vmlinux_32.lds.S. A new PHDR is added
as existing ones cannot contain sections near address zero.
..
Post by Tejun Heo
---
arch/x86/kernel/cpu/common.c | 8 ++++++++
arch/x86/kernel/head_32.S | 37 ++++++++++++++++++++++++++++++++++---
arch/x86/kernel/setup_percpu.c | 4 ----
arch/x86/kernel/vmlinux_32.lds.S | 16 +++++++++++++++-
4 files changed, 57 insertions(+), 8 deletions(-)
Hmm, the only reason for this change is to unify with 64-bit, yes? Yet it
doesn't actually win us anything on that front, as this diffstat shows.

If gcc's -mcmodel=kernel had used a weak symbol for the offset of the stack
canary, we would have been happy. Unfortunately generic per-cpu and x86-64
PDA were developed separately, so no one realized the problem until too late.

The basic series looks good: it will clash with my per-cpu work (mainly
because I remove the per_cpu__ prefix) in a purely-textual way though.

Thanks for this!
Rusty.
Tejun Heo
2009-01-14 02:04:25 UTC
Hello, Rusty.
Post by Rusty Russell
Post by Tejun Heo
This patch makes percpu symbols zerobased on x86_32 SMP by using
PERCPU_VADDR() with 0 vaddr in vmlinux_32.lds.S. A new PHDR is added
as existing ones cannot contain sections near address zero.
...
Post by Tejun Heo
---
arch/x86/kernel/cpu/common.c | 8 ++++++++
arch/x86/kernel/head_32.S | 37 ++++++++++++++++++++++++++++++++++---
arch/x86/kernel/setup_percpu.c | 4 ----
arch/x86/kernel/vmlinux_32.lds.S | 16 +++++++++++++++-
4 files changed, 57 insertions(+), 8 deletions(-)
Hmm, the only reason for this change is to unify with 64-bit, yes? Yet it
doesn't actually win us anything on that front, as this diffstat shows.
Yeah, and to simplify things for future changes. In this patch the only
actual unification is in the __per_cpu_offset initialization, but the lack
of unification is partly due to the use of the pda and the differences in
the initialization sequence, which in turn couldn't be unified because
percpu handling was different; it's a tied knot and this patch helps
untangle it. Also, with cpualloc (or whatever) scheduled, I think it's
better to end up with just two percpu models, UP and SMP, even if it costs
49 more lines.
Post by Rusty Russell
If gcc's -mcmodel=kernel had used a weak symbol for the offset of the stack
canary, we would have been happy. Unfortunately generic per-cpu and x86-64
PDA were developed separately, so noone realize the problem until too late.
And we seem to be stuck with it if we want to keep compatibility with
compilers. :-(
Post by Rusty Russell
The basic series looks good: it will clash with my per-cpu work (mainly
because I remove the per_cpu__ prefix) in a purely-textual way though.
I thought about replacing all the explicit per_cpu__ prefixes with
per_cpu_var(), but the usage seemed too prevalent so I left it alone. Why
remove it, BTW?

Thanks.
--
tejun
Tejun Heo
2009-01-13 10:40:24 UTC
Currently pdas and percpu areas are allocated separately. %gs points to
the local pda and the percpu area is reached through pda->data_offset.
This patch folds the pda into the percpu area.

Due to a gcc requirement (the stack protector reads the canary from the
fixed address %gs:40), the pda needs to be at the beginning of the percpu
area so that pda->stack_canary lands at %gs:40. To achieve this, a new
percpu output section macro, PERCPU_VADDR_PREALLOC(), is added and used to
reserve a pda-sized chunk at the start of the percpu area.
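
A hedged sketch of the layout constraint this has to preserve (the check
below is illustrative only, not part of the patch):

	#include <linux/kernel.h>	/* BUILD_BUG_ON */
	#include <linux/stddef.h>	/* offsetof */
	#include <asm/pda.h>

	static inline void check_canary_offset(void)
	{
		/* gcc hard-codes %gs:40 for the stack-protector canary, so
		 * whatever %gs points at must keep stack_canary at byte 40 */
		BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
	}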

After this change, for the boot cpu, %gs first points to the pda in the
data.init area and later, during setup_per_cpu_areas(), gets updated to
point to the actual pda. This means that setup_per_cpu_areas() needs to
reload %gs for CPU0 and clear the pda area for the other cpus, since CPU0
has already modified its pda by the time control reaches
setup_per_cpu_areas().

This patch also removes now unnecessary get_local_pda() and its call
sites.

A lot of this patch is taken from Mike Travis' "x86_64: Fold pda into
per cpu area" patch.

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/include/asm/percpu.h | 8 ++++
arch/x86/include/asm/smp.h | 2 -
arch/x86/kernel/asm-offsets_64.c | 1 +
arch/x86/kernel/cpu/common.c | 6 +--
arch/x86/kernel/head64.c | 8 ++++-
arch/x86/kernel/head_64.S | 15 ++++++--
arch/x86/kernel/setup_percpu.c | 65 ++++++++++++++----------------------
arch/x86/kernel/smpboot.c | 60 +---------------------------------
arch/x86/kernel/vmlinux_64.lds.S | 6 ++-
arch/x86/xen/smp.c | 10 ------
include/asm-generic/vmlinux.lds.h | 25 +++++++++++++-
11 files changed, 83 insertions(+), 123 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index df644f3..0ed77cf 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,6 +1,14 @@
#ifndef _ASM_X86_PERCPU_H
#define _ASM_X86_PERCPU_H

+#ifndef __ASSEMBLY__
+#ifdef CONFIG_X86_64
+extern void load_pda_offset(int cpu);
+#else
+static inline void load_pda_offset(int cpu) { }
+#endif
+#endif
+
#ifdef CONFIG_X86_64
#include <linux/compiler.h>

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811c..288a89e 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -25,8 +25,6 @@ extern cpumask_t cpu_callin_map;
extern void (*mtrr_hook)(void);
extern void zap_low_mappings(void);

-extern int __cpuinit get_local_pda(int cpu);
-
extern int smp_num_siblings;
extern unsigned int num_processors;
extern cpumask_t cpu_initialized;
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f..f8d1b04 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,7 @@ int main(void)
ENTRY(cpunumber);
ENTRY(irqstackptr);
ENTRY(data_offset);
+ DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
#undef ENTRY
#ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5c3d6b3..cdd419a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -873,10 +873,8 @@ void __cpuinit pda_init(int cpu)
/* Setup up data that may be needed in __get_free_pages early */
loadsegment(fs, 0);
loadsegment(gs, 0);
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
+
+ load_pda_offset(cpu);

pda->cpunumber = cpu;
pda->irqcount = -1;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 84b96b9..ff39bdd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,12 +26,18 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>

-/* boot cpu pda, referenced by head_64.S to initialize %gs for boot CPU */
+#ifndef CONFIG_SMP
+/* boot cpu pda, referenced by head_64.S to initialize %gs on UP */
struct x8664_pda _boot_cpu_pda __read_mostly;
+#endif

void __init x86_64_init_pda(void)
{
+#ifdef CONFIG_SMP
+ cpu_pda(0) = (void *)__per_cpu_load;
+#else
cpu_pda(0) = &_boot_cpu_pda;
+#endif
cpu_pda(0)->data_offset =
(unsigned long)(__per_cpu_load - __per_cpu_start);
pda_init(0);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3f56a8f..519185f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -245,10 +245,13 @@ ENTRY(secondary_startup_64)

/* Set up %gs.
*
- * %gs should point to the pda. For initial boot, make %gs point
- * to the _boot_cpu_pda in data section. For a secondary CPU,
- * initial_gs should be set to its pda address before the CPU runs
- * this code.
+ * On SMP, %gs should point to the per-cpu area. For initial
+ * boot, make %gs point to the init data section. For a
+ * secondary CPU,initial_gs should be set to its pda address
+ * before the CPU runs this code.
+ *
+ * On UP, initial_gs points to _boot_cpu_pda and doesn't
+ * change.
*/
movl $MSR_GS_BASE,%ecx
movq initial_gs(%rip),%rax
@@ -278,7 +281,11 @@ ENTRY(secondary_startup_64)
ENTRY(initial_code)
.quad x86_64_start_kernel
ENTRY(initial_gs)
+#ifdef CONFIG_SMP
+ .quad __per_cpu_load
+#else
.quad _boot_cpu_pda
+#endif
__FINITDATA

ENTRY(stack_start)
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index bd47bbd..24d3e0d 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -14,6 +14,7 @@
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
+#include <asm/proto.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
@@ -65,6 +66,16 @@ static void __init setup_node_to_cpumask_map(void);
static inline void setup_node_to_cpumask_map(void) { }
#endif

+#ifdef CONFIG_X86_64
+void __cpuinit load_pda_offset(int cpu)
+{
+ /* Memory clobbers used to order pda/percpu accesses */
+ mb();
+ wrmsrl(MSR_GS_BASE, cpu_pda(cpu));
+ mb();
+}
+#endif
+
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
* Copy data used in early init routines from the initial arrays to the
@@ -101,42 +112,6 @@ static void __init setup_per_cpu_maps(void)
*/
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- pda = alloc_bootmem(asize);
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- continue;
- }
- cpu_pda(cpu) = (struct x8664_pda *)pda;
- cpu_pda(cpu)->in_bootmem = 1;
- pda += size;
- }
-}
#endif

/*
@@ -151,9 +126,6 @@ void __init setup_per_cpu_areas(void)
int cpu;
unsigned long align = 1;

- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
old_size = PERCPU_ENOUGH_ROOM;
align = max_t(unsigned long, PAGE_SIZE, align);
@@ -185,8 +157,21 @@ void __init setup_per_cpu_areas(void)
cpu, node, __pa(ptr));
}
#endif
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
+
memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+ cpu_pda(cpu) = (void *)ptr;
+
+ /*
+ * CPU0 modified pda in the init data area, reload pda
+ * offset for CPU0 and clear the area for others.
+ */
+ if (cpu == 0)
+ load_pda_offset(0);
+ else
+ memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
+#endif
+ per_cpu_offset(cpu) = ptr - __per_cpu_start;

DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7cd97d9..3a0fec6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -750,52 +750,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}

-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- free_bootmem_pda(oldpda);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -813,16 +767,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
};
INIT_WORK(&c_idle.work, do_fork_idle);

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);

c_idle.idle = get_idle_for_cpu(cpu);
@@ -937,9 +881,7 @@ do_rest:
inquire_remote_apic(apicid);
}
}
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
+
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index f50280d..962f21f 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
#define LOAD_OFFSET __START_KERNEL_map

#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
#include <asm/page.h>

#undef i386 /* in case the preprocessor is a 32bit one */
@@ -215,10 +216,11 @@ SECTIONS
/*
* percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
* output PHDR, so the next output section - __data_nosave - should
- * switch it back to data.init.
+ * switch it back to data.init. Also, pda should be at the head of
+ * percpu area. Preallocate it.
*/
. = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
+ PERCPU_VADDR_PREALLOC(0, :percpu, pda_size)
#else
PERCPU(PAGE_SIZE)
#endif
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b67..ae22284 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -280,16 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- WARN_ON(cpu == 0);
- if (cpu > 0) {
- rc = get_local_pda(cpu);
- if (rc)
- return rc;
- }
-#endif
-
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fc2f55f..e53319c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -441,9 +441,10 @@
. = __per_cpu_load + SIZEOF(.data.percpu);

/**
- * PERCPU_VADDR - define output section for percpu area
+ * PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
* @vaddr: explicit base address (optional)
* @phdr: destination PHDR (optional)
+ * @prealloc: the size of prealloc area
*
* Macro which expands to output section for percpu area. If @vaddr
* is not blank, it specifies explicit base address and all percpu
@@ -455,11 +456,33 @@
* section in the linker script will go there too. @phdr should have
* a leading colon.
*
+ * If @prealloc is non-zero, the specified number of bytes will be
+ * reserved at the start of percpu area. As the prealloc area is
+ * likely to break alignment, this macro puts areas in increasing
+ * alignment order.
+ *
* This macro defines three symbols, __per_cpu_load, __per_cpu_start
* and __per_cpu_end. The first one is the vaddr of loaded percpu
* init data. __per_cpu_start equals @vaddr and __per_cpu_end is the
* end offset.
*/
+#define PERCPU_VADDR_PREALLOC(vaddr, segment, prealloc) \
+ PERCPU_PROLOG(vaddr) \
+ . += prealloc; \
+ *(.data.percpu) \
+ *(.data.percpu.shared_aligned) \
+ *(.data.percpu.page_aligned) \
+ PERCPU_EPILOG(segment)
+
+/**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to output section for percpu area. Mostly
+ * identical to PERCPU_VADDR_PREALLOC(@vaddr, @phdr, 0) other than
+ * using slighly different layout.
+ */
#define PERCPU_VADDR(vaddr, phdr) \
PERCPU_PROLOG(vaddr) \
*(.data.percpu.page_aligned) \
--
1.5.6

Tejun Heo
2009-01-13 10:40:48 UTC
Make early_per_cpu() an lvalue, as per_cpu() already is, and use it where
applicable.
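
The trick that makes the macro assignable, as a small standalone sketch
(demo_slot, early_map and real_value are made-up names): a conditional
expression of two values is an rvalue, but dereferencing a conditional of
two pointers yields an lvalue, so the same expression can sit on either
side of '='.

	/* minimal demonstration of the lvalue trick (illustrative only) */
	static int early_map[64];
	static int real_value;

	#define demo_slot(use_early, i) \
		(*((use_early) ? &early_map[(i)] : &real_value))

	/*
	 * demo_slot(1, 3) = 10;	assigns to early_map[3]
	 * demo_slot(0, 3) = 10;	assigns to real_value
	 * early_per_cpu() now works the same way, e.g.:
	 *	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
	 */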

Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/include/asm/percpu.h | 6 +++---
arch/x86/include/asm/topology.h | 5 +----
arch/x86/kernel/apic.c | 13 ++-----------
3 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece7205..df644f3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -195,9 +195,9 @@ do { \
#define early_per_cpu_ptr(_name) (_name##_early_ptr)
#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
#define early_per_cpu(_name, _cpu) \
- (early_per_cpu_ptr(_name) ? \
- early_per_cpu_ptr(_name)[_cpu] : \
- per_cpu(_name, _cpu))
+ *(early_per_cpu_ptr(_name) ? \
+ &early_per_cpu_ptr(_name)[_cpu] : \
+ &per_cpu(_name, _cpu))

#else /* !CONFIG_SMP */
#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index ff386ff..3ac51b5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -96,10 +96,7 @@ static inline int cpu_to_node(int cpu)
/* Same function but used if called before per_cpu areas are setup */
static inline int early_cpu_to_node(int cpu)
{
- if (early_per_cpu_ptr(x86_cpu_to_node_map))
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
- return per_cpu(x86_cpu_to_node_map, cpu);
+ return early_per_cpu(x86_cpu_to_node_map);
}

/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index b5229af..c2f4cbd 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -1865,17 +1865,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
#endif

#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
#endif

cpu_set(cpu, cpu_possible_map);
--
1.5.6

Tejun Heo
2009-01-13 11:44:45 UTC
Make early_per_cpu() an lvalue, as per_cpu() already is, and use it where
applicable.

Mariusz spotted that the early_per_cpu() conversion in topology.h was
missing the @cpu argument. Fixed.

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mariusz Ceier <***@gmail.com>
---
git tree has been regenerated accordingly.

Thanks.

arch/x86/include/asm/percpu.h | 6 +++---
arch/x86/include/asm/topology.h | 5 +----
arch/x86/kernel/apic.c | 13 ++-----------
3 files changed, 6 insertions(+), 18 deletions(-)

Index: work/arch/x86/include/asm/topology.h
===================================================================
--- work.orig/arch/x86/include/asm/topology.h
+++ work/arch/x86/include/asm/topology.h
@@ -96,10 +96,7 @@ static inline int cpu_to_node(int cpu)
/* Same function but used if called before per_cpu areas are setup */
static inline int early_cpu_to_node(int cpu)
{
- if (early_per_cpu_ptr(x86_cpu_to_node_map))
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
- return per_cpu(x86_cpu_to_node_map, cpu);
+ return early_per_cpu(x86_cpu_to_node_map, cpu);
}

/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
Index: work/arch/x86/kernel/apic.c
===================================================================
--- work.orig/arch/x86/kernel/apic.c
+++ work/arch/x86/kernel/apic.c
@@ -1865,17 +1865,8 @@ void __cpuinit generic_processor_info(in
#endif

#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
#endif

cpu_set(cpu, cpu_possible_map);
Index: work/arch/x86/include/asm/percpu.h
===================================================================
--- work.orig/arch/x86/include/asm/percpu.h
+++ work/arch/x86/include/asm/percpu.h
@@ -195,9 +195,9 @@ do { \
#define early_per_cpu_ptr(_name) (_name##_early_ptr)
#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
#define early_per_cpu(_name, _cpu) \
- (early_per_cpu_ptr(_name) ? \
- early_per_cpu_ptr(_name)[_cpu] : \
- per_cpu(_name, _cpu))
+ *(early_per_cpu_ptr(_name) ? \
+ &early_per_cpu_ptr(_name)[_cpu] : \
+ &per_cpu(_name, _cpu))

#else /* !CONFIG_SMP */
#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
Tejun Heo
2009-01-13 10:41:04 UTC
From: Mike Travis <***@sgi.com>

* Ruggedize some calls in setup_percpu.c to prevent mishaps
in early calls, particularly for non-critical functions.

* Cleanup DEBUG_PER_CPU_MAPS usages and some comments.

Based on linux-2.6.tip/master with the following patches applied:

cpumask: Make cpumask_of_cpu_map generic
cpumask: Put cpumask_of_cpu_map in the initdata section
cpumask: Change cpumask_of_cpu_ptr to use new cpumask_of_cpu

Signed-off-by: Mike Travis <***@sgi.com>
Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/kernel/setup_percpu.c | 60 ++++++++++++++++++++++++++++------------
1 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3..ee55e3b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -15,6 +15,12 @@
#include <asm/apicdef.h>
#include <asm/highmem.h>

+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int num_processors;
unsigned disabled_cpus __cpuinitdata;
@@ -27,31 +33,39 @@ EXPORT_SYMBOL(boot_cpu_physical_apicid);
physid_mask_t phys_cpu_present_map;
#endif

-/* map cpu index to physical APIC ID */
+/*
+ * Map cpu index to physical APIC ID
+ */
DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);

#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define X86_64_NUMA 1
+#define X86_64_NUMA 1 /* (used later) */

-/* map cpu index to node index */
+/*
+ * Map cpu index to node index
+ */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

-/* which logical CPUs are on which nodes */
+/*
+ * Which logical CPUs are on which nodes
+ */
cpumask_t *node_to_cpumask_map;
EXPORT_SYMBOL(node_to_cpumask_map);

-/* setup node_to_cpumask_map */
+/*
+ * Setup node_to_cpumask_map
+ */
static void __init setup_node_to_cpumask_map(void);

#else
static inline void setup_node_to_cpumask_map(void) { }
#endif

-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
* Copy data used in early init routines from the initial arrays to the
* per cpu data areas. These arrays then become expendable and the
@@ -181,10 +195,12 @@ void __init setup_per_cpu_areas(void)
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+ DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}

- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
- NR_CPUS, nr_cpu_ids, nr_node_ids);
+ printk(KERN_INFO "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids: %d\n",
+ NR_CPUS, nr_cpu_ids, nr_node_ids);

/* Setup percpu data maps */
setup_per_cpu_maps();
@@ -202,6 +218,7 @@ void __init setup_per_cpu_areas(void)
* Requires node_possible_map to be valid.
*
* Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
static void __init setup_node_to_cpumask_map(void)
{
@@ -217,6 +234,7 @@ static void __init setup_node_to_cpumask_map(void)

/* allocate the map */
map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+ DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);

pr_debug("Node to cpumask map at %p for %d nodes\n",
map, nr_node_ids);
@@ -229,17 +247,23 @@ void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
-
- if (cpu_to_node_map)
+ /* early setting, no percpu area yet */
+ if (cpu_to_node_map) {
cpu_to_node_map[cpu] = node;
+ return;
+ }

- else if (per_cpu_offset(cpu))
- per_cpu(x86_cpu_to_node_map, cpu) = node;
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+ dump_stack();
+ return;
+ }
+#endif
+ per_cpu(x86_cpu_to_node_map, cpu) = node;

- else
- pr_debug("Setting node for non-present cpu %d\n", cpu);
+ if (node != NUMA_NO_NODE)
+ cpu_pda(cpu)->nodenumber = node;
}

void __cpuinit numa_clear_node(int cpu)
@@ -256,7 +280,7 @@ void __cpuinit numa_add_cpu(int cpu)

void __cpuinit numa_remove_cpu(int cpu)
{
- cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+ cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

#else /* CONFIG_DEBUG_PER_CPU_MAPS */
@@ -266,7 +290,7 @@ void __cpuinit numa_remove_cpu(int cpu)
*/
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
- int node = cpu_to_node(cpu);
+ int node = early_cpu_to_node(cpu);
cpumask_t *mask;
char buf[64];
--
1.5.6

Tejun Heo
2009-01-13 10:41:35 UTC
This patch makes percpu symbols zero-based on x86_64 SMP by adding
PERCPU_VADDR() to vmlinux.lds.h, which allows setting an explicit vaddr on
the percpu output section, and by using it in vmlinux_64.lds.S. A new
PHDR is added because the existing ones cannot contain sections near
address zero. PERCPU_VADDR() also adds a new symbol, __per_cpu_load, which
always points to the vaddr of the loaded percpu data.init region.

The following adjustments have been made to accommodate the address
change.

* code to locate percpu gdt_page in head_64.S is updated to add the
load address to the gdt_page offset.

* __per_cpu_load is used in places where access to the init data area
is necessary.

* pda->data_offset is initialized soon after C code is entered, as a zero
value no longer works (see the sketch after this list).
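
A rough sketch of the address arithmetic after this change; the per_cpu()
shape below is the generic one, heavily simplified:

	/* from the head64.c hunk below: data_offset must be set explicitly */
	cpu_pda(0)->data_offset =
		(unsigned long)(__per_cpu_load - __per_cpu_start);

	/*
	 * Roughly, a per-cpu access computes
	 *	&per_cpu(var, cpu) == &per_cpu__var + per_cpu_offset(cpu)
	 * With zero-based symbols (__per_cpu_start == 0), &per_cpu__var is
	 * just the offset of var inside the area, and per_cpu_offset(cpu)
	 * is the runtime base of that cpu's copy, initially __per_cpu_load,
	 * the vaddr of the .data.percpu init image.
	 */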

This patch is mostly taken from Mike Travis' "x86_64: Base percpu
variables at zero" patch.

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/kernel/head64.c | 2 +
arch/x86/kernel/head_64.S | 24 ++++++++++++++++-
arch/x86/kernel/setup_percpu.c | 2 +-
arch/x86/kernel/vmlinux_64.lds.S | 17 +++++++++++-
include/asm-generic/sections.h | 2 +-
include/asm-generic/vmlinux.lds.h | 51 ++++++++++++++++++++++++++++++++----
6 files changed, 88 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 388e05a..a63261b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -44,6 +44,8 @@ void __init x86_64_init_pda(void)
{
_cpu_pda = __cpu_pda;
cpu_pda(0) = &_boot_cpu_pda;
+ cpu_pda(0)->data_offset =
+ (unsigned long)(__per_cpu_load - __per_cpu_start);
pda_init(0);
}

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 26cfdc1..5ab77b3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -204,6 +204,23 @@ ENTRY(secondary_startup_64)
pushq $0
popfq

+#ifdef CONFIG_SMP
+ /*
+ * early_gdt_base should point to the gdt_page in static percpu init
+ * data area. Computing this requires two symbols - __per_cpu_load
+ * and per_cpu__gdt_page. As linker can't do no such relocation, do
+ * it by hand. As early_gdt_descr is manipulated by C code for
+ * secondary CPUs, this should be done only once for the boot CPU
+ * when early_gdt_descr_base contains zero.
+ */
+ movq early_gdt_descr_base(%rip), %rax
+ testq %rax, %rax
+ jnz 1f
+ movq $__per_cpu_load, %rax
+ addq $per_cpu__gdt_page, %rax
+ movq %rax, early_gdt_descr_base(%rip)
+1:
+#endif
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
@@ -401,7 +418,12 @@ NEXT_PAGE(level2_spare_pgt)
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
- .quad per_cpu__gdt_page
+#ifdef CONFIG_SMP
+early_gdt_descr_base:
+ .quad 0x0000000000000000
+#else
+ .quad per_cpu__gdt_page
+#endif

ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ee55e3b..8a22c94 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -194,7 +194,7 @@ void __init setup_per_cpu_areas(void)
}
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+ memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);

DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0..f50280d 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,6 +19,9 @@ PHDRS {
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
@@ -208,14 +211,26 @@ SECTIONS
__initramfs_end = .;
#endif

+#ifdef CONFIG_SMP
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - __data_nosave - should
+ * switch it back to data.init.
+ */
+ . = ALIGN(PAGE_SIZE);
+ PERCPU_VADDR(0, :percpu)
+#else
PERCPU(PAGE_SIZE)
+#endif

. = ALIGN(PAGE_SIZE);
__init_end = .;

. = ALIGN(PAGE_SIZE);
__nosave_begin = .;
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ *(.data.nosave)
+ } :data.init /* switch back to data.init, see PERCPU_VADDR() above */
. = ALIGN(PAGE_SIZE);
__nosave_end = .;

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 79a7ff9..4ce48e8 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[];
extern char __init_begin[], __init_end[];
extern char _sinittext[], _einittext[];
extern char _end[];
-extern char __per_cpu_start[], __per_cpu_end[];
+extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
extern char __initdata_begin[], __initdata_end[];
extern char __start_rodata[], __end_rodata[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1..fc2f55f 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -430,12 +430,51 @@
*(.initcall7.init) \
*(.initcall7s.init)

-#define PERCPU(align) \
- . = ALIGN(align); \
- VMLINUX_SYMBOL(__per_cpu_start) = .; \
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
+#define PERCPU_PROLOG(vaddr) \
+ VMLINUX_SYMBOL(__per_cpu_load) = .; \
+ .data.percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \
+ VMLINUX_SYMBOL(__per_cpu_start) = .;
+
+#define PERCPU_EPILOG(phdr) \
+ VMLINUX_SYMBOL(__per_cpu_end) = .; \
+ } phdr \
+ . = __per_cpu_load + SIZEOF(.data.percpu);
+
+/**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to output section for percpu area. If @vaddr
+ * is not blank, it specifies explicit base address and all percpu
+ * symbols will be offset from the given address. If blank, @vaddr
+ * always equals @laddr + LOAD_OFFSET.
+ *
+ * @phdr defines the output PHDR to use if not blank. Be warned that
+ * output PHDR is sticky. If @phdr is specified, the next output
+ * section in the linker script will go there too. @phdr should have
+ * a leading colon.
+ *
+ * This macro defines three symbols, __per_cpu_load, __per_cpu_start
+ * and __per_cpu_end. The first one is the vaddr of loaded percpu
+ * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the
+ * end offset.
+ */
+#define PERCPU_VADDR(vaddr, phdr) \
+ PERCPU_PROLOG(vaddr) \
*(.data.percpu.page_aligned) \
*(.data.percpu) \
*(.data.percpu.shared_aligned) \
- } \
- VMLINUX_SYMBOL(__per_cpu_end) = .;
+ PERCPU_EPILOG(phdr)
+
+/**
+ * PERCPU - define output section for percpu area, simple version
+ * @align: required alignment
+ *
+ * Align to @align and outputs output section for percpu area. This
+ * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and
+ * __per_cpu_start will be identical.
+ */
+#define PERCPU(align) \
+ . = ALIGN(align); \
+ PERCPU_VADDR( , )
--
1.5.6

Tejun Heo
2009-01-13 10:41:55 UTC
Make vmlinux_32.lds.S use the generic PERCPU() macro instead of
open-coding it. This will ease future changes.

Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/kernel/vmlinux_32.lds.S | 9 +--------
1 files changed, 1 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c6755..3eba7f7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -178,14 +178,7 @@ SECTIONS
__initramfs_end = .;
}
#endif
- . = ALIGN(PAGE_SIZE);
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
- __per_cpu_start = .;
- *(.data.percpu.page_aligned)
- *(.data.percpu)
- *(.data.percpu.shared_aligned)
- __per_cpu_end = .;
- }
+ PERCPU(PAGE_SIZE)
. = ALIGN(PAGE_SIZE);
/* freed after init ends here */
--
1.5.6

Tejun Heo
2009-01-13 10:42:18 UTC
Do the following cleanups.

* kill x86_64_init_pda(), which is now equivalent to pda_init()

* use per_cpu_offset() instead of cpu_pda() when initializing
initial_gs

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/include/asm/setup.h | 1 -
arch/x86/kernel/acpi/sleep.c | 2 +-
arch/x86/kernel/head64.c | 7 +------
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/xen/enlighten.c | 2 +-
5 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 4fcd53f..2f3e50e 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,7 +100,6 @@ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;

#else
-void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fe23ff8..94c9b75 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,7 +101,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
- initial_gs = (unsigned long)cpu_pda(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 71b6f6e..af67d32 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,11 +26,6 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>

-void __init x86_64_init_pda(void)
-{
- pda_init(0);
-}
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -96,7 +91,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");

- x86_64_init_pda();
+ pda_init(0);

x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3a0fec6..3179bb9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -804,7 +804,7 @@ do_rest:
#else
cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
- initial_gs = (unsigned long)cpu_pda(cpu);
+ initial_gs = per_cpu_offset(cpu);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bea2152..76e092d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1652,7 +1652,7 @@ asmlinkage void __init xen_start_kernel(void)
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
- x86_64_init_pda();
+ pda_init(0);
#endif

xen_smp_init();
--
1.5.6

Tejun Heo
2009-01-13 10:42:36 UTC
Now that the pda is allocated as part of the percpu area, percpu data no
longer needs to be accessed through the pda. Unify x86_64 SMP percpu
access with the x86_32 SMP one. Other than the segment register, operand
size and the base of the percpu symbols, they now behave identically.

This patch replaces the now unnecessary pda->data_offset with a dummy
field, which is needed to keep stack_canary at its place. It also moves
the per_cpu_offset initialization out of init_gdt() into
setup_per_cpu_areas(). Note that this change also necessitates explicit
per_cpu_offset initializations in voyager_smp.c.

With this change, the x86_OP_percpu() operations are as efficient on
x86_64 as on x86_32, and x86_64 can also use the assembly PER_CPU macros.
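
The payoff is that a percpu read-modify-write becomes a single
segment-prefixed instruction instead of the old preempt_disable() /
offset-lookup / preempt_enable() sequence. A hedged sketch of what the new
8-byte case boils down to (percpu_addq() is a made-up wrapper mirroring the
"case 8" branch added below):

	/* illustrative: one %gs-relative instruction per percpu operation */
	static inline void percpu_addq(unsigned long *pcp, unsigned long val)
	{
		/* on SMP, pcp holds a percpu symbol, i.e. an offset that is
		 * applied relative to the %gs base at access time */
		asm("addq %1, %%gs:%0" : "+m" (*pcp) : "r" (val));
	}

	/* the real x86_add_percpu(var, val) applies this to the percpu
	 * symbol directly, emitting something like
	 *	addq %rdx, %gs:per_cpu__some_counter */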

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/include/asm/pda.h | 3 +-
arch/x86/include/asm/percpu.h | 127 +++++++++++------------------------
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/entry_64.S | 7 +-
arch/x86/kernel/head64.c | 2 -
arch/x86/kernel/setup_percpu.c | 15 ++--
arch/x86/kernel/smpcommon.c | 3 +-
arch/x86/mach-voyager/voyager_smp.c | 2 +
8 files changed, 55 insertions(+), 105 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2d5b49c..e91558e 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -11,8 +11,7 @@
/* Per processor datastructure. %gs points to it while the kernel runs */
struct x8664_pda {
struct task_struct *pcurrent; /* 0 Current process */
- unsigned long data_offset; /* 8 Per cpu data offset from linker
- address */
+ unsigned long dummy;
unsigned long kernelstack; /* 16 top of kernel stack for current */
unsigned long oldrsp; /* 24 user rsp for system call */
int irqcount; /* 32 Irq nesting counter. Starts -1 */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0ed77cf..556f84b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,62 +1,13 @@
#ifndef _ASM_X86_PERCPU_H
#define _ASM_X86_PERCPU_H

-#ifndef __ASSEMBLY__
#ifdef CONFIG_X86_64
-extern void load_pda_offset(int cpu);
+#define __percpu_seg gs
+#define __percpu_mov_op movq
#else
-static inline void load_pda_offset(int cpu) { }
-#endif
-#endif
-
-#ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
-#define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
+#define __percpu_seg fs
+#define __percpu_mov_op movl
#endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment. x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime. The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect. However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var) \
- ({ \
- typeof(per_cpu_var(var)) __tmp; \
- preempt_disable(); \
- __tmp = __get_cpu_var(var); \
- preempt_enable(); \
- __tmp; \
- })
-
-#define x86_write_percpu(var, val) \
- do { \
- preempt_disable(); \
- __get_cpu_var(var) = (val); \
- preempt_enable(); \
- } while(0)
-
-#else /* CONFIG_X86_64 */

#ifdef __ASSEMBLY__

@@ -73,42 +24,26 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
* PER_CPU(cpu_gdt_descr, %ebx)
*/
#ifdef CONFIG_SMP
-#define PER_CPU(var, reg) \
- movl %fs:per_cpu__##this_cpu_off, reg; \
+#define PER_CPU(var, reg) \
+ __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var) %fs:per_cpu__##var
+#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
#else /* ! SMP */
-#define PER_CPU(var, reg) \
- movl $per_cpu__##var, reg
+#define PER_CPU(var, reg) \
+ __percpu_mov_op $per_cpu__##var, reg
#define PER_CPU_VAR(var) per_cpu__##var
#endif /* SMP */

#else /* ...!ASSEMBLY */

-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- * var - variable name
- * cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- * PER_CPU(cpu_gdt_descr, %ebx)
- */
-#ifdef CONFIG_SMP
-
-#define __my_cpu_offset x86_read_percpu(this_cpu_off)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
-#define __percpu_seg "%%fs:"
+#include <linux/stringify.h>

-#else /* !SMP */
-
-#define __percpu_seg ""
-
-#endif /* SMP */
+#ifdef CONFIG_SMP
+#define __percpu_seg_str "%%"__stringify(__percpu_seg)":"
+#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+#else
+#define __percpu_seg_str
+#endif

#include <asm-generic/percpu.h>

@@ -128,20 +63,25 @@ do { \
} \
switch (sizeof(var)) { \
case 1: \
- asm(op "b %1,"__percpu_seg"%0" \
+ asm(op "b %1,"__percpu_seg_str"%0" \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 2: \
- asm(op "w %1,"__percpu_seg"%0" \
+ asm(op "w %1,"__percpu_seg_str"%0" \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 4: \
- asm(op "l %1,"__percpu_seg"%0" \
+ asm(op "l %1,"__percpu_seg_str"%0" \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
+ case 8: \
+ asm(op "q %1,"__percpu_seg_str"%0" \
+ : "+m" (var) \
+ : "r" ((T__)val)); \
+ break; \
default: __bad_percpu_size(); \
} \
} while (0)
@@ -151,17 +91,22 @@ do { \
typeof(var) ret__; \
switch (sizeof(var)) { \
case 1: \
- asm(op "b "__percpu_seg"%1,%0" \
+ asm(op "b "__percpu_seg_str"%1,%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 2: \
- asm(op "w "__percpu_seg"%1,%0" \
+ asm(op "w "__percpu_seg_str"%1,%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 4: \
- asm(op "l "__percpu_seg"%1,%0" \
+ asm(op "l "__percpu_seg_str"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_seg_str"%1,%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
@@ -175,8 +120,14 @@ do { \
#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+
+#ifdef CONFIG_X86_64
+extern void load_pda_offset(int cpu);
+#else
+static inline void load_pda_offset(int cpu) { }
+#endif
+
#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */

#ifdef CONFIG_SMP

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f8d1b04..f4cc81b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -55,7 +55,6 @@ int main(void)
ENTRY(irqcount);
ENTRY(cpunumber);
ENTRY(irqstackptr);
- ENTRY(data_offset);
DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
#undef ENTRY
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e28c7a9..4833f3a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -52,6 +52,7 @@
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>
+#include <asm/percpu.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -1072,10 +1073,10 @@ ENTRY(\sym)
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- movq %gs:pda_data_offset, %rbp
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ PER_CPU(init_tss, %rbp)
+ subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
call \do_sym
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index ff39bdd..9194074 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -38,8 +38,6 @@ void __init x86_64_init_pda(void)
#else
cpu_pda(0) = &_boot_cpu_pda;
#endif
- cpu_pda(0)->data_offset =
- (unsigned long)(__per_cpu_load - __per_cpu_start);
pda_init(0);
}

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 24d3e0d..76a2498 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -105,14 +105,14 @@ static void __init setup_per_cpu_maps(void)
#endif
}

-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
+#ifdef CONFIG_X86_64
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+ [0] = (unsigned long)__per_cpu_load,
+};
+#else
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
#endif
+EXPORT_SYMBOL(__per_cpu_offset);

/*
* Great future plan:
@@ -159,6 +159,7 @@ void __init setup_per_cpu_areas(void)
#endif

memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
+ per_cpu_offset(cpu) = ptr - __per_cpu_start;
#ifdef CONFIG_X86_64
cpu_pda(cpu) = (void *)ptr;

@@ -171,7 +172,7 @@ void __init setup_per_cpu_areas(void)
else
memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
#endif
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);

DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 397e309..84395fa 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -4,10 +4,10 @@
#include <linux/module.h>
#include <asm/smp.h>

-#ifdef CONFIG_X86_32
DEFINE_PER_CPU(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

+#ifdef CONFIG_X86_32
/*
* Initialize the CPU's GDT. This is either the boot CPU doing itself
* (still using the master per-cpu area), or a CPU doing it for a
@@ -24,7 +24,6 @@ __cpuinit void init_gdt(int cpu)
write_gdt_entry(get_cpu_gdt_table(cpu),
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);

- per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
per_cpu(cpu_number, cpu) = cpu;
}
#endif
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 5214500..7d79a52 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -539,6 +539,7 @@ static void __init do_boot_cpu(__u8 cpu)
stack_start.sp = (void *)idle->thread.sp;

init_gdt(cpu);
+ per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
per_cpu(current_task, cpu) = idle;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
irq_ctx_init(cpu);
@@ -1756,6 +1757,7 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
static void __cpuinit voyager_smp_prepare_boot_cpu(void)
{
init_gdt(smp_processor_id());
+	per_cpu(this_cpu_off, smp_processor_id()) = __per_cpu_offset[smp_processor_id()];
switch_to_new_gdt();

cpu_set(smp_processor_id(), cpu_online_map);
--
1.5.6

Tejun Heo
2009-01-13 10:42:55 UTC
Permalink
The CPU startup code in head_64.S loads the address of a zero page
into %gs for temporary use until the pda is loaded, but the address of
the actual pda is already available at that point. Load the real
address directly instead.

This will help unify percpu and pda handling later on.

This patch is mostly taken from Mike Travis' "x86_64: Fold pda into
per cpu area" patch.

Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Mike Travis <***@sgi.com>
---
arch/x86/include/asm/trampoline.h | 1 +
arch/x86/kernel/acpi/sleep.c | 1 +
arch/x86/kernel/head64.c | 4 ++--
arch/x86/kernel/head_64.S | 15 ++++++++++-----
arch/x86/kernel/smpboot.c | 1 +
5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 780ba0a..90f06c2 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,6 +13,7 @@ extern unsigned char *trampoline_base;

extern unsigned long init_rsp;
extern unsigned long initial_code;
+extern unsigned long initial_gs;

#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
#define TRAMPOLINE_BASE 0x6000
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 806b4e9..fe23ff8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
+ initial_gs = (unsigned long)cpu_pda(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index a63261b..54f52cf 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,8 +26,8 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>

-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
+/* boot cpu pda, referenced by head_64.S to initialize %gs for boot CPU */
+struct x8664_pda _boot_cpu_pda __read_mostly;

#ifdef CONFIG_SMP
/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 5ab77b3..3f56a8f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -243,12 +243,15 @@ ENTRY(secondary_startup_64)
movl %eax,%fs
movl %eax,%gs

- /*
- * Setup up a dummy PDA. this is just for some early bootup code
- * that does in_interrupt()
- */
+ /* Set up %gs.
+ *
+ * %gs should point to the pda. For initial boot, make %gs point
+ * to the _boot_cpu_pda in data section. For a secondary CPU,
+ * initial_gs should be set to its pda address before the CPU runs
+ * this code.
+ */
movl $MSR_GS_BASE,%ecx
- movq $empty_zero_page,%rax
+ movq initial_gs(%rip),%rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
@@ -274,6 +277,8 @@ ENTRY(secondary_startup_64)
.align 8
ENTRY(initial_code)
.quad x86_64_start_kernel
+ ENTRY(initial_gs)
+ .quad _boot_cpu_pda
__FINITDATA

ENTRY(stack_start)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f8500c9..7cd97d9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -860,6 +860,7 @@ do_rest:
#else
cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ initial_gs = (unsigned long)cpu_pda(cpu);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
--
1.5.6

Tejun Heo
2009-01-13 10:43:22 UTC
Permalink
pda is now a percpu variable and there's no reason it can't use plain
x86 percpu accessors. Add x86_test_and_clear_bit_percpu() and replace
pda op implementations with wrappers around x86 percpu accessors.
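
To make the mapping concrete (a sketch, not part of the patch;
kernelstack is an existing field of struct x8664_pda):

	static unsigned long demo_kernelstack(void)
	{
		/* read_pda(kernelstack) now simply expands to
		 * x86_read_percpu(__pda.kernelstack), i.e. a single
		 * %gs-relative load from the percpu copy of the pda. */
		return read_pda(kernelstack);
	}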

Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/include/asm/pda.h | 88 +++-----------------------------------
arch/x86/include/asm/percpu.h | 10 ++++
arch/x86/kernel/vmlinux_64.lds.S | 1 -
arch/x86/kernel/x8664_ksyms_64.c | 2 -
4 files changed, 16 insertions(+), 85 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 66ae104..e3d3a08 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -45,91 +45,15 @@ extern void pda_init(int);

#define cpu_pda(cpu) (&per_cpu(__pda, cpu))

-/*
- * There is no fast way to get the base address of the PDA, all the accesses
- * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
- */
-extern void __bad_pda_field(void) __attribute__((noreturn));
-
-/*
- * proxy_pda doesn't actually exist, but tell gcc it is accessed for
- * all PDA accesses so it gets read/write dependencies right.
- */
-extern struct x8664_pda _proxy_pda;
-
-#define pda_offset(field) offsetof(struct x8664_pda, field)
-
-#define pda_to_op(op, field, val) \
-do { \
- typedef typeof(_proxy_pda.field) T__; \
- if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
- switch (sizeof(_proxy_pda.field)) { \
- case 2: \
- asm(op "w %1,%%gs:%c2" : \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i"(pda_offset(field))); \
- break; \
- case 4: \
- asm(op "l %1,%%gs:%c2" : \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i" (pda_offset(field))); \
- break; \
- case 8: \
- asm(op "q %1,%%gs:%c2": \
- "+m" (_proxy_pda.field) : \
- "r" ((T__)val), \
- "i"(pda_offset(field))); \
- break; \
- default: \
- __bad_pda_field(); \
- } \
-} while (0)
-
-#define pda_from_op(op, field) \
-({ \
- typeof(_proxy_pda.field) ret__; \
- switch (sizeof(_proxy_pda.field)) { \
- case 2: \
- asm(op "w %%gs:%c1,%0" : \
- "=r" (ret__) : \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- case 4: \
- asm(op "l %%gs:%c1,%0": \
- "=r" (ret__): \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- case 8: \
- asm(op "q %%gs:%c1,%0": \
- "=r" (ret__) : \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- default: \
- __bad_pda_field(); \
- } \
- ret__; \
-})
-
-#define read_pda(field) pda_from_op("mov", field)
-#define write_pda(field, val) pda_to_op("mov", field, val)
-#define add_pda(field, val) pda_to_op("add", field, val)
-#define sub_pda(field, val) pda_to_op("sub", field, val)
-#define or_pda(field, val) pda_to_op("or", field, val)
+#define read_pda(field) x86_read_percpu(__pda.field)
+#define write_pda(field, val) x86_write_percpu(__pda.field, val)
+#define add_pda(field, val) x86_add_percpu(__pda.field, val)
+#define sub_pda(field, val) x86_sub_percpu(__pda.field, val)
+#define or_pda(field, val) x86_or_percpu(__pda.field, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define test_and_clear_bit_pda(bit, field) \
-({ \
- int old__; \
- asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
- : "=r" (old__), "+m" (_proxy_pda.field) \
- : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
- old__; \
-})
+ x86_test_and_clear_bit_percpu(bit, __pda.field)

#endif

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 556f84b..328b31a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -121,6 +121,16 @@ do { \
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)

+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define x86_test_and_clear_bit_percpu(bit, var) \
+({ \
+ int old__; \
+ asm volatile("btr %1,"__percpu_seg_str"%c2\n\tsbbl %0,%0" \
+ : "=r" (old__) \
+ : "dIr" (bit), "i" (&per_cpu__##var) : "memory"); \
+ old__; \
+})
+
#ifdef CONFIG_X86_64
extern void load_pda_offset(int cpu);
#else
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index d2a0baa..a09abb8 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -14,7 +14,6 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
-_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426..3909e3b 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);
--
1.5.6

Tejun Heo
2009-01-13 10:43:41 UTC
Permalink
There is no instruction to move a 64-bit immediate into a memory
location. Drop "i" from the constraint for the 8-byte case.

Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/include/asm/pda.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..cbd3f48 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -78,7 +78,7 @@ do { \
case 8: \
asm(op "q %1,%%gs:%c2": \
"+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
+ "r" ((T__)val), \
"i"(pda_offset(field))); \
break; \
default: \
--
1.5.6

Tejun Heo
2009-01-13 10:49:03 UTC
Permalink
Each step was tested with gcc-4.1.3, 4.2.3 and 4.3.1 on x86_64, 4.3.1
on i386 for both SMP and UP configurations. The final state is
currently being tested using 4.2.3 on both x86_64 and 32.
Aieee.. two sentences mixed up in my head. Correction:

Each step was tested with gcc-4.3.1 for both x86_64 and 32. The final
step for x86_64 has been tested with gcc-4.1.3, 4.2.3 and 4.3.1.
Currently, I'm burn-testing both x86_64 and 32 using gcc-4.2.3 with
all patches applied.

Thanks.
--
tejun
Brian Gerst
2009-01-13 13:28:23 UTC
Permalink
Post by Tejun Heo
Each step was tested with gcc-4.1.3, 4.2.3 and 4.3.1 on x86_64, 4.3.1
on i386 for both SMP and UP configurations. The final state is
currently being tested using 4.2.3 on both x86_64 and 32.
Each step was tested with gcc-4.3.1 for both x86_64 and 32. The final
step for x86_64 has been tested with gcc-4.1.3, 4.2.3 and 4.3.1.
Currently, I'm burn-testing both x86_64 and 32 using gcc-4.2.3 with
all patches applied.
Thanks.
I've been working on a patchset that does something similar, but
eliminating the PDA completely. The start is already in tip/x86/pda.
The plan is to change all PDA variables to be normal per-cpu
variables, merging with 32-bit where possible. Once the PDA is empty,
I'll base %gs at the start of the per-cpu area. I've been working out
the bugs with the last patch (zero-basing the percpu area) before
submitting, but I probably won't have the time until this weekend to
polish it off. I could submit all but the last patch if you'd like.
They are functionally correct, but because the per-cpu area isn't
zero-based yet the generated code is a bit bloated due to having to
calculate the delta for the %gs offset.

--
Brian Gerst
Tejun Heo
2009-01-13 14:06:54 UTC
Permalink
Hello, Brian.
Post by Brian Gerst
I've been working on a patchset that does something similar, but
eliminating the PDA completely. The start is already in tip/x86/pda.
The plan is to change all PDA variables to be normal per-cpu
variables, merging with 32-bit where possible.
I think the two changes aren't exclusive at all. The order of things
could be different but in the end, yeah, zero-based percpu symbols w/
mostly empty pda is the goal.
Post by Brian Gerst
Once the PDA is empty, I'll base %gs at the start of the per-cpu
area. I've been working out the bugs with the last patch
(zero-basing the percpu area) before submitting, but I probably
won't have the time until this weekend to polish it off. I could
submit all but the last patch if you'd like.
Any chance you can rebase those patches on top of mine? If you don't
have time, just send them to me, I'll try to integrate them this week.
Post by Brian Gerst
They are functionally correct, but because the per-cpu area isn't
zero-based yet the generated code is a bit bloated due to having to
calculate the delta for the %gs offset.
BTW, what did you do about the dreaded stack_canary?

Thanks.
--
tejun
Brian Gerst
2009-01-13 14:26:59 UTC
Permalink
Post by Tejun Heo
Hello, Brian.
Post by Brian Gerst
I've been working on a patchset that does something similar, but
eliminating the PDA completely. The start is already in tip/x86/pda.
The plan is to change all PDA variables to be normal per-cpu
variables, merging with 32-bit where possible.
I think the two changes aren't exclusive at all. The order of things
could be different but in the end, yeah, zero-based percpu symbols w/
mostly empty pda is the goal.
Post by Brian Gerst
Once the PDA is empty, I'll base %gs at the start of the per-cpu
area. I've been working out the bugs with the last patch
(zero-basing the percpu area) before submitting, but I probably
won't have the time until this weekend to polish it off. I could
submit all but the last patch if you'd like.
Any chance you can rebase those patches on top of mine? If you don't
have time, just send them to me, I'll try to integrate them this week.
Are your patches available via git by chance?
Post by Tejun Heo
Post by Brian Gerst
They are functionally correct, but because the per-cpu area isn't
zero-based yet the generated code is a bit bloated due to having to
calculate the delta for the %gs offset.
BTW, what did you do about the dreaded stack_canary?
I put the irqstack at the start of the per-cpu area and overlaid the
canary on the bottom 48 bytes of the 16k stack.
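
(A rough sketch of that layout, just to illustrate -- the names are
made up, this is not from a posted patch:)

	/* The per-cpu area starts with the 16k IRQ stack; gcc's stack
	 * protector expects the canary at %gs:40, so it is overlaid on the
	 * bottom 48 bytes of the stack, which the stack never grows into. */
	union irq_stack_union {
		char irq_stack[16 * 1024];
		struct {
			char gs_pad[40];
			unsigned long stack_canary;
		};
	};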

--
Brian Gerst
Tejun Heo
2009-01-13 14:38:25 UTC
Permalink
Post by Brian Gerst
Post by Tejun Heo
Any chance you can rebase those patches on top of mine? If you don't
have time, just send them to me, I'll try to integrate them this week.
Are your patches available via git by chance?
Sure,

http://git.kernel.org/?p=linux/kernel/git/tj/misc.git;a=shortlog;h=x86-percpu-zerobased
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git x86-percpu-zerobased

Thanks.
--
tejun
H. Peter Anvin
2009-01-14 06:58:31 UTC
Permalink
Post by Tejun Heo
I think the two changes aren't exclusive at all. The order of things
could be different but in the end, yeah, zero-based percpu symbols w/
mostly empty pda is the goal.
Post by Brian Gerst
Once the PDA is empty, I'll base %gs at the start of the per-cpu
area. I've been working out the bugs with the last patch
(zero-basing the percpu area) before submitting, but I probably
won't have the time until this weekend to polish it off. I could
submit all but the last patch if you'd like.
Any chance you can rebase those patches on top of mine? If you don't
have time, just send them to me, I'll try to integrate them this week.
A merged tree here would be absolutely wonderful. I've kept an eye on
the discussion so far, but it looks like you guys are handling it fine.

-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

Ingo Molnar
2009-01-14 09:39:43 UTC
Permalink
Post by H. Peter Anvin
Post by Tejun Heo
I think the two changes aren't exclusive at all. The order of things
could be different but in the end, yeah, zero-based percpu symbols w/
mostly empty pda is the goal.
Post by Brian Gerst
Once the PDA is empty, I'll base %gs at the start of the per-cpu
area. I've been working out the bugs with the last patch
(zero-basing the percpu area) before submitting, but I probably
won't have the time until this weekend to polish it off. I could
submit all but the last patch if you'd like.
Any chance you can rebase those patches on top of mine? If you don't
have time, just send them to me, I'll try to integrate them this week.
A merged tree here would be absolutely wonderful. I've kept an eye on
the discussion so far, but it looks like you guys are handling it fine.
agreed, it looks really good.

Tejun, could you please also add the patch below to your lineup too?

It is an optimization and a cleanup, and adds the following new generic
percpu methods:

percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_or()
percpu_xor()

and implements support for them on x86. (other architectures will fall
back to a default implementation)

The advantage is that for example to read a local percpu variable, instead
of this sequence:

return __get_cpu_var(var);

ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx
ffffffff8102ca32: 81
ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax
ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

return percpu_read(var);

ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax

I also cleaned up the x86-specific APIs and made the x86 code use these
new generic percpu primitives.

It looks quite hard to convince the compiler to generate the optimized
single-instruction sequence for us out of __get_cpu_var(var) - or can you
perhaps see a way to do it?

The patch is against your latest zero-based percpu / pda unification tree.
Untested.

Ingo

Signed-off-by: Ingo Molnar <***@elte.hu>
---
arch/x86/include/asm/current.h | 2 +-
arch/x86/include/asm/irq_regs_32.h | 4 ++--
arch/x86/include/asm/mmu_context_32.h | 12 ++++++------
arch/x86/include/asm/pda.h | 10 +++++-----
arch/x86/include/asm/percpu.h | 23 ++++++++++++-----------
arch/x86/include/asm/smp.h | 2 +-
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/tlb_32.c | 10 +++++-----
arch/x86/mach-voyager/voyager_smp.c | 4 ++--
arch/x86/xen/enlighten.c | 14 +++++++-------
arch/x86/xen/irq.c | 8 ++++----
arch/x86/xen/mmu.c | 2 +-
arch/x86/xen/multicalls.h | 2 +-
arch/x86/xen/smp.c | 2 +-
include/asm-generic/percpu.h | 28 ++++++++++++++++++++++++++++
16 files changed, 90 insertions(+), 48 deletions(-)

Index: linux/arch/x86/include/asm/current.h
===================================================================
--- linux.orig/arch/x86/include/asm/current.h
+++ linux/arch/x86/include/asm/current.h
@@ -10,7 +10,7 @@ struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
- return x86_read_percpu(current_task);
+ return percpu_read(current_task);
}

#else /* X86_32 */
Index: linux/arch/x86/include/asm/irq_regs_32.h
===================================================================
--- linux.orig/arch/x86/include/asm/irq_regs_32.h
+++ linux/arch/x86/include/asm/irq_regs_32.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_re

static inline struct pt_regs *get_irq_regs(void)
{
- return x86_read_percpu(irq_regs);
+ return percpu_read(irq_regs);
}

static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_re
struct pt_regs *old_regs;

old_regs = get_irq_regs();
- x86_write_percpu(irq_regs, new_regs);
+ percpu_write(irq_regs, new_regs);

return old_regs;
}
Index: linux/arch/x86/include/asm/mmu_context_32.h
===================================================================
--- linux.orig/arch/x86/include/asm/mmu_context_32.h
+++ linux/arch/x86/include/asm/mmu_context_32.h
@@ -4,8 +4,8 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_s
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- x86_write_percpu(cpu_tlbstate.active_mm, next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);

@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_s
}
#ifdef CONFIG_SMP
else {
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
Index: linux/arch/x86/include/asm/pda.h
===================================================================
--- linux.orig/arch/x86/include/asm/pda.h
+++ linux/arch/x86/include/asm/pda.h
@@ -45,11 +45,11 @@ extern void pda_init(int);

#define cpu_pda(cpu) (&per_cpu(__pda, cpu))

-#define read_pda(field) x86_read_percpu(__pda.field)
-#define write_pda(field, val) x86_write_percpu(__pda.field, val)
-#define add_pda(field, val) x86_add_percpu(__pda.field, val)
-#define sub_pda(field, val) x86_sub_percpu(__pda.field, val)
-#define or_pda(field, val) x86_or_percpu(__pda.field, val)
+#define read_pda(field) percpu_read(__pda.field)
+#define write_pda(field, val) percpu_write(__pda.field, val)
+#define add_pda(field, val) percpu_add(__pda.field, val)
+#define sub_pda(field, val) percpu_sub(__pda.field, val)
+#define or_pda(field, val) percpu_or(__pda.field, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define test_and_clear_bit_pda(bit, field) \
Index: linux/arch/x86/include/asm/percpu.h
===================================================================
--- linux.orig/arch/x86/include/asm/percpu.h
+++ linux/arch/x86/include/asm/percpu.h
@@ -40,16 +40,11 @@

#ifdef CONFIG_SMP
#define __percpu_seg_str "%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+#define __my_cpu_offset percpu_read(this_cpu_off)
#else
#define __percpu_seg_str
#endif

-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -115,11 +110,12 @@ do { \
ret__; \
})

-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var) \
@@ -131,6 +127,11 @@ do { \
old__; \
})

+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
#ifdef CONFIG_X86_64
extern void load_pda_offset(int cpu);
#else
Index: linux/arch/x86/include/asm/smp.h
===================================================================
--- linux.orig/arch/x86/include/asm/smp.h
+++ linux/arch/x86/include/asm/smp.h
@@ -163,7 +163,7 @@ extern unsigned disabled_cpus __cpuinitd
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
extern int safe_smp_processor_id(void);

#elif defined(CONFIG_X86_64_SMP)
Index: linux/arch/x86/kernel/process_32.c
===================================================================
--- linux.orig/arch/x86/kernel/process_32.c
+++ linux/arch/x86/kernel/process_32.c
@@ -592,7 +592,7 @@ __switch_to(struct task_struct *prev_p,
if (prev->gs | next->gs)
loadsegment(gs, next->gs);

- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);

return prev_p;
}
Index: linux/arch/x86/kernel/tlb_32.c
===================================================================
--- linux.orig/arch/x86/kernel/tlb_32.c
+++ linux/arch/x86/kernel/tlb_32.c
@@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
*/
void leave_mm(int cpu)
{
- BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
- cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
+ BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
+ cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_
* BUG();
*/

- if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -237,7 +237,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();

__flush_tlb_all();
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}

Index: linux/arch/x86/mach-voyager/voyager_smp.c
===================================================================
--- linux.orig/arch/x86/mach-voyager/voyager_smp.c
+++ linux/arch/x86/mach-voyager/voyager_smp.c
@@ -410,7 +410,7 @@ void __init find_smp_config(void)
VOYAGER_SUS_IN_CONTROL_PORT);

current_thread_info()->cpu = boot_cpu_id;
- x86_write_percpu(cpu_number, boot_cpu_id);
+ percpu_write(cpu_number, boot_cpu_id);
}

/*
@@ -1790,7 +1790,7 @@ static void __init voyager_smp_cpus_done
void __init smp_setup_processor_id(void)
{
current_thread_info()->cpu = hard_smp_processor_id();
- x86_write_percpu(cpu_number, hard_smp_processor_id());
+ percpu_write(cpu_number, hard_smp_processor_id());
}

static void voyager_send_call_func(cpumask_t callmask)
Index: linux/arch/x86/xen/enlighten.c
===================================================================
--- linux.orig/arch/x86/xen/enlighten.c
+++ linux/arch/x86/xen/enlighten.c
@@ -702,17 +702,17 @@ static void xen_write_cr0(unsigned long

static void xen_write_cr2(unsigned long cr2)
{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+ percpu_read(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
+ return percpu_read(xen_vcpu)->arch.cr2;
}

static unsigned long xen_read_cr2_direct(void)
{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
+ return percpu_read(xen_vcpu_info.arch.cr2);
}

static void xen_write_cr4(unsigned long cr4)
@@ -725,12 +725,12 @@ static void xen_write_cr4(unsigned long

static unsigned long xen_read_cr3(void)
{
- return x86_read_percpu(xen_cr3);
+ return percpu_read(xen_cr3);
}

static void set_current_cr3(void *v)
{
- x86_write_percpu(xen_current_cr3, (unsigned long)v);
+ percpu_write(xen_current_cr3, (unsigned long)v);
}

static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -755,7 +755,7 @@ static void __xen_write_cr3(bool kernel,
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

if (kernel) {
- x86_write_percpu(xen_cr3, cr3);
+ percpu_write(xen_cr3, cr3);

/* Update xen_current_cr3 once the batch has actually
been submitted. */
@@ -771,7 +771,7 @@ static void xen_write_cr3(unsigned long

/* Update while interrupts are disabled, so its atomic with
respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
+ percpu_write(xen_cr3, cr3);

__xen_write_cr3(true, cr3);

Index: linux/arch/x86/xen/irq.c
===================================================================
--- linux.orig/arch/x86/xen/irq.c
+++ linux/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
struct vcpu_info *vcpu;
unsigned long flags;

- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);

/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();

@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}

@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */

- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;

/* Doesn't matter if we get preempted here, because any
Index: linux/arch/x86/xen/mmu.c
===================================================================
--- linux.orig/arch/x86/xen/mmu.c
+++ linux/arch/x86/xen/mmu.c
@@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info

/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
load_cr3(swapper_pg_dir);
arch_flush_lazy_cpu_mode();
}
Index: linux/arch/x86/xen/multicalls.h
===================================================================
--- linux.orig/arch/x86/xen/multicalls.h
+++ linux/arch/x86/xen/multicalls.h
@@ -39,7 +39,7 @@ static inline void xen_mc_issue(unsigned
xen_mc_flush();

/* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+ local_irq_restore(percpu_read(xen_mc_irq_flags));
}

/* Set up a callback to be called when the current batch is flushed */
Index: linux/arch/x86/xen/smp.c
===================================================================
--- linux.orig/arch/x86/xen/smp.c
+++ linux/arch/x86/xen/smp.c
@@ -78,7 +78,7 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();

cpu_set(cpu, cpu_online_map);
- x86_write_percpu(cpu_state, CPU_ONLINE);
+ percpu_write(cpu_state, CPU_ONLINE);
wmb();

/* We can take interrupts now: we're officially "up". */
Index: linux/include/asm-generic/percpu.h
===================================================================
--- linux.orig/include/asm-generic/percpu.h
+++ linux/include/asm-generic/percpu.h
@@ -80,4 +80,32 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)

+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access:
+ */
+
+#ifndef percpu_read
+# define percpu_read(var) __get_cpu_var(var)
+#endif
+
+#ifndef percpu_write
+# define percpu_write(var, val) ({ __get_cpu_var(var) = (val); })
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
+#endif
+
+#ifndef percpu_sub
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val) ({ __get_cpu_var(var) |= (val); })
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val) ({ __get_cpu_var(var) ^= (val); })
+#endif
+
#endif /* _ASM_GENERIC_PERCPU_H_ */
Ingo Molnar
2009-01-14 09:46:49 UTC
Permalink
Post by Ingo Molnar
Tejun, could you please also add the patch below to your lineup too?
It is an optimization and a cleanup, and adds the following new generic
percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_or()
percpu_xor()
and implements support for them on x86. (other architectures will fall
back to a default implementation)
btw., this patch means that we can move Brian's original PDA cleanups
fully and without kernel size bloat: instead of pda_read() we can use
percpu_read(), and not bloat the kernel at all.

and thus the pda ops can go away completely.

Ingo
Ingo Molnar
2009-01-15 09:56:25 UTC
Permalink
Post by Ingo Molnar
Post by H. Peter Anvin
Post by Tejun Heo
I think the two changes aren't exclusive at all. The order of things
could be different but in the end, yeah, zero-based percpu symbols w/
mostly empty pda is the goal.
Post by Brian Gerst
Once the PDA is empty, I'll base %gs at the start of the per-cpu
area. I've been working out the bugs with the last patch
(zero-basing the percpu area) before submitting, but I probably
won't have the time until this weekend to polish it off. I could
submit all but the last patch if you'd like.
Any chance you can rebase those patches on top of mine? If you don't
have time, just send them to me, I'll try to integrate them this week.
A merged tree here would be absolutely wonderful. I've kept an eye on
the discussion so far, but it looks like you guys are handling it fine.
agreed, it looks really good.
Tejun, could you please also add the patch below to your lineup too?
It is an optimization and a cleanup, and adds the following new generic
percpu_read()
I already have this in my patch series.
hm, where is that exactly? Do you have a commit ID (or could send a patch)
that i could have a look at?
Frankly, it starts to get marginal in generic code after percpu_read(),
so I didn't do them (and the per-cpu interfaces are already pretty
wide).
Tejun, is now a good time to rebase my alloc_percpu series on top of
yours? I'm more than happy to hand them over to someone with more
cycles...
We can pick it up into a topic branch in -tip if you don't mind.

Ingo
roel kluin
2009-01-15 10:04:34 UTC
Permalink
Post by Ingo Molnar
Index: linux/include/asm-generic/percpu.h
===================================================================
--- linux.orig/include/asm-generic/percpu.h
+++ linux/include/asm-generic/percpu.h
@@ -80,4 +80,32 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ */
+
+#ifndef percpu_read
+# define percpu_read(var) __get_cpu_var(var)
+#endif
+
+#ifndef percpu_write
+# define percpu_write(var, val) ({ __get_cpu_var(var) = (val); })
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
+#endif
+
+#ifndef percpu_sub
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
this should be:

define percpu_sub(var, val) ({ __get_cpu_var(var) -= (val); })
Post by Ingo Molnar
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val) ({ __get_cpu_var(var) |= (val); })
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val) ({ __get_cpu_var(var) ^= (val); })
+#endif
+
#endif /* _ASM_GENERIC_PERCPU_H_ */
Tejun Heo
2009-01-15 10:27:54 UTC
Permalink
Post by roel kluin
Post by Ingo Molnar
Index: linux/include/asm-generic/percpu.h
===================================================================
--- linux.orig/include/asm-generic/percpu.h
+++ linux/include/asm-generic/percpu.h
@@ -80,4 +80,32 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ */
+
+#ifndef percpu_read
+# define percpu_read(var) __get_cpu_var(var)
+#endif
+
+#ifndef percpu_write
+# define percpu_write(var, val) ({ __get_cpu_var(var) = (val); })
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
+#endif
+
+#ifndef percpu_sub
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
define percpu_sub(var, val) ({ __get_cpu_var(var) -= (val); })
Thanks. Will fold into the patch.
--
tejun
Ingo Molnar
2009-01-15 11:33:25 UTC
Permalink
Post by Tejun Heo
Post by roel kluin
Post by Ingo Molnar
Index: linux/include/asm-generic/percpu.h
===================================================================
--- linux.orig/include/asm-generic/percpu.h
+++ linux/include/asm-generic/percpu.h
@@ -80,4 +80,32 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ */
+
+#ifndef percpu_read
+# define percpu_read(var) __get_cpu_var(var)
+#endif
+
+#ifndef percpu_write
+# define percpu_write(var, val) ({ __get_cpu_var(var) = (val); })
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
+#endif
+
+#ifndef percpu_sub
+# define percpu_add(var, val) ({ __get_cpu_var(var) += (val); })
define percpu_sub(var, val) ({ __get_cpu_var(var) -= (val); })
well spotted!
Post by Tejun Heo
Thanks. Will fold into the patch.
thanks!

Is there any interim tree for us to pull into -tip? I'd rather not let
this grow too big, it will be harder and harder to debug any regressions.
Gradual progress is a lot more debuggable. Your initial patchset is
fantastic already (gives a ~0.2% kernel image size saving for defconfig),
so it's a very good start.

Ingo
Tejun Heo
2009-01-15 11:37:33 UTC
Permalink
Post by Ingo Molnar
Is there any interim tree for us to pull into -tip? I'd rather not let
this grow too big, it will be harder and harder to debug any regressions.
Gradual progress is a lot more debuggable. Your initial patchset is
fantastic already (gives a ~0.2% kernel image size saving for defconfig),
so it's a very good start.
Sans the last patch (I'm still working on it), the git tree is at...

http://git.kernel.org/?p=linux/kernel/git/tj/misc.git;a=shortlog;h=x86-percpu-zerobased
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git x86-percpu-zerobased

And this happens to be the third time I wrote the above addresses in
this thread. :-)

Thanks.
--
tejun
Ingo Molnar
2009-01-15 12:23:31 UTC
Permalink
Post by Tejun Heo
Post by Ingo Molnar
Is there any interim tree for us to pull into -tip? I'd rather not let
this grow too big, it will be harder and harder to debug any regressions.
Gradual progress is a lot more debuggable. Your initial patchset is
fantastic already (gives a ~0.2% kernel image size saving for defconfig),
so it's a very good start.
Sans the last patch (I'm still working on it), the git tree is at...
http://git.kernel.org/?p=linux/kernel/git/tj/misc.git;a=shortlog;h=x86-percpu-zerobased
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git x86-percpu-zerobased
And this happens to be the third time I wrote the above addresses in
this thread. :-)
You know the drill: you have to explicitly ask for stuff to be pulled :-)

( Otherwise if i just pull it and you rebased it or were unhappy about it
in some fashion we all add stress that could have been avoided. )

So i've picked up these commits from you:

d060676: x86_64: misc clean up after the percpu update
6d459d9: x86_64: convert pda ops to wrappers around x86 percpu accessors
d7051ef: x86_64: make pda a percpu variable
4fe7fdf: x86_64: merge 64 and 32 SMP percpu handling
44f5fbd: x86_64: fold pda into percpu area on SMP
cc1d354: x86_64: use static _cpu_pda array
c701268: x86_64: load pointer to pda into %gs while brining up a CPU
7e36da9: x86_64: make percpu symbols zerobased on SMP
3fc860d: x86_32: make vmlinux_32.lds.S use PERCPU() macro
bc497c7: x86_64: Cleanup early setup_percpu references
769d780: x86: make early_per_cpu() a lvalue and use it
66cbc8e: x86_64: fix pda_to_op()

into tip/x86/percpu.

I did the following small reorganizations:

- i rebased them on top of tip/cpus4096, tip/x86/cleanups and
tip/x86/urgent - that is a stable base.

- i resolved the conflicts that arose due to recent cpumask and upstream
changes.

- i standardized the commit logs to the usual x86 style and added
Original-From: Mike Travis tags to those patches that were derived from
Mike's patches.

If they pass testing they'll be stable commits from that point on, and you
can base your git trees on that topic branch.

Could you please have a look at the end result? Here are the Git
coordinates for it:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86/percpu

If we are happy with it then these commits can be stable commits from now
on, and you can base your future changes on this Git tree, and i can pull
updates from you without having to rebase them.

Does that workflow sound OK to you?

Ingo
Tejun Heo
2009-01-15 13:10:50 UTC
Permalink
Post by Ingo Molnar
Post by Tejun Heo
And this happens to be the third time I wrote the above addresses in
this thread. :-)
You know the drill: you have to explicitly ask for stuff to be pulled :-)
Heh... Jeff usually just regenerates patches from the git tree or just
takes the mails, so not too familiar with pull requests. :-)
Post by Ingo Molnar
( Otherwise if i just pull it and you rebased it or were unhappy about it
in some fashion we all add stress that could have been avoided. )
d060676: x86_64: misc clean up after the percpu update
6d459d9: x86_64: convert pda ops to wrappers around x86 percpu accessors
d7051ef: x86_64: make pda a percpu variable
4fe7fdf: x86_64: merge 64 and 32 SMP percpu handling
44f5fbd: x86_64: fold pda into percpu area on SMP
cc1d354: x86_64: use static _cpu_pda array
c701268: x86_64: load pointer to pda into %gs while brining up a CPU
7e36da9: x86_64: make percpu symbols zerobased on SMP
3fc860d: x86_32: make vmlinux_32.lds.S use PERCPU() macro
bc497c7: x86_64: Cleanup early setup_percpu references
769d780: x86: make early_per_cpu() a lvalue and use it
66cbc8e: x86_64: fix pda_to_op()
into tip/x86/percpu.
- i rebased them on top of tip/cpus4096, tip/x86/cleanups and
tip/x86/urgent - that is a stable base.
- i resolved the conflicts that arose due to recent cpumask and upstream
changes.
- i standardized the commit logs to the usual x86 style and added
Original-From: Mike Travis tags to those patches that were derived from
Mike's patches.
If they pass testing they'll be stable commits from that point on, and you
can base your git trees on that topic branch.
Could you please have a look at the end result? Here are the Git
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86/percpu
Yeap, looks fine to me.
Post by Ingo Molnar
If we are happy with it then these commits can be stable commits from now
on, and you can base your future changes on this Git tree, and i can pull
updates from you without having to rebase them.
Does that workflow sound OK to you?
Yes, perfectly okay.

Thanks. :-)
--
tejun
Ingo Molnar
2009-01-15 13:34:23 UTC
Permalink
FYI, -tip testing found the following bug with your percpu stuff:

There's an early exception during bootup, on 64-bit x86:

PANIC: early exception 0e rip 10:ffffffff80276855: error ? cr2 6688

- gcc version 4.3.2 20081007 (Red Hat 4.3.2-6) (GCC)
- binutils-2.18.50.0.6-2.x86_64

config attached. You can find the disassembly of lock_release_holdtime()
below - that's where it crashed:

ffffffff80276851: 48 8d 04 06 lea (%rsi,%rax,1),%rax
ffffffff80276855: 4c 3b a0 a8 00 00 00 cmp 0xa8(%rax),%r12
ffffffff8027685c: 7e 07 jle ffffffff80276865 <lock_release_holdtime+0x155>

it probably went wrong here (due to the PDA changes):

ffffffff80276784: 65 48 8b 15 f4 d9 d8 mov %gs:0x7fd8d9f4(%rip),%rdx # 4180 <per_cpu__this_cpu_off>
ffffffff8027678b: 7f

and we jumped to ffffffff80276840 after that and crashed.

Since the crash is so early, you can build the attached config on any
64-bit test-system and try to boot into it - it should crash all the time.
Let me know if you have trouble reproducing it.

Ingo


ffffffff80276710 <lock_release_holdtime>:
ffffffff80276710: 55 push %rbp
ffffffff80276711: 48 89 e5 mov %rsp,%rbp
ffffffff80276714: 48 83 ec 10 sub $0x10,%rsp
ffffffff80276718: 8b 05 42 6f a2 00 mov 0xa26f42(%rip),%eax # ffffffff80c9d660 <lock_stat>
ffffffff8027671e: 48 89 1c 24 mov %rbx,(%rsp)
ffffffff80276722: 4c 89 64 24 08 mov %r12,0x8(%rsp)
ffffffff80276727: 48 89 fb mov %rdi,%rbx
ffffffff8027672a: 85 c0 test %eax,%eax
ffffffff8027672c: 75 12 jne ffffffff80276740 <lock_release_holdtime+0x30>
ffffffff8027672e: 48 8b 1c 24 mov (%rsp),%rbx
ffffffff80276732: 4c 8b 64 24 08 mov 0x8(%rsp),%r12
ffffffff80276737: c9 leaveq
ffffffff80276738: c3 retq
ffffffff80276739: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
ffffffff80276740: e8 0b d7 f9 ff callq ffffffff80213e50 <sched_clock>
ffffffff80276745: 49 89 c4 mov %rax,%r12
ffffffff80276748: 0f b7 43 30 movzwl 0x30(%rbx),%eax
ffffffff8027674c: 4c 2b 63 28 sub 0x28(%rbx),%r12
ffffffff80276750: 66 25 ff 1f and $0x1fff,%ax
ffffffff80276754: 0f 84 76 01 00 00 je ffffffff802768d0 <lock_release_holdtime+0x1c0>
ffffffff8027675a: 0f b7 c0 movzwl %ax,%eax
ffffffff8027675d: 48 8d 04 80 lea (%rax,%rax,4),%rax
ffffffff80276761: 48 8d 04 80 lea (%rax,%rax,4),%rax
ffffffff80276765: 48 c1 e0 04 shl $0x4,%rax
ffffffff80276769: 48 2d 90 01 00 00 sub $0x190,%rax
ffffffff8027676f: 48 8d 88 e0 b5 21 81 lea -0x7ede4a20(%rax),%rcx
ffffffff80276776: 48 81 e9 e0 b5 21 81 sub $0xffffffff8121b5e0,%rcx
ffffffff8027677d: 48 c7 c0 e0 65 00 00 mov $0x65e0,%rax
ffffffff80276784: 65 48 8b 15 f4 d9 d8 mov %gs:0x7fd8d9f4(%rip),%rdx # 4180 <per_cpu__this_cpu_off>
ffffffff8027678b: 7f
ffffffff8027678c: 48 c1 f9 04 sar $0x4,%rcx
ffffffff80276790: 48 8d 34 10 lea (%rax,%rdx,1),%rsi
ffffffff80276794: 48 b8 29 5c 8f c2 f5 mov $0x8f5c28f5c28f5c29,%rax
ffffffff8027679b: 28 5c 8f
ffffffff8027679e: 48 0f af c8 imul %rax,%rcx
ffffffff802767a2: f6 43 32 03 testb $0x3,0x32(%rbx)
ffffffff802767a6: 0f 84 94 00 00 00 je ffffffff80276840 <lock_release_holdtime+0x130>
ffffffff802767ac: 48 89 ca mov %rcx,%rdx
ffffffff802767af: 48 89 c8 mov %rcx,%rax
ffffffff802767b2: 48 c1 e2 05 shl $0x5,%rdx
ffffffff802767b6: 48 c1 e0 08 shl $0x8,%rax
ffffffff802767ba: 48 29 d0 sub %rdx,%rax
ffffffff802767bd: 48 8d 04 06 lea (%rsi,%rax,1),%rax
ffffffff802767c1: 4c 3b a0 88 00 00 00 cmp 0x88(%rax),%r12
ffffffff802767c8: 7e 07 jle ffffffff802767d1 <lock_release_holdtime+0xc1>
ffffffff802767ca: 4c 89 a0 88 00 00 00 mov %r12,0x88(%rax)
ffffffff802767d1: 48 89 ca mov %rcx,%rdx
ffffffff802767d4: 48 89 c8 mov %rcx,%rax
ffffffff802767d7: 48 c1 e2 05 shl $0x5,%rdx
ffffffff802767db: 48 c1 e0 08 shl $0x8,%rax
ffffffff802767df: 48 29 d0 sub %rdx,%rax
ffffffff802767e2: 48 8b 84 06 80 00 00 mov 0x80(%rsi,%rax,1),%rax
ffffffff802767e9: 00
ffffffff802767ea: 49 39 c4 cmp %rax,%r12
ffffffff802767ed: 7c 05 jl ffffffff802767f4 <lock_release_holdtime+0xe4>
ffffffff802767ef: 48 85 c0 test %rax,%rax
ffffffff802767f2: 75 19 jne ffffffff8027680d <lock_release_holdtime+0xfd>
ffffffff802767f4: 48 89 ca mov %rcx,%rdx
ffffffff802767f7: 48 89 c8 mov %rcx,%rax
ffffffff802767fa: 48 c1 e2 05 shl $0x5,%rdx
ffffffff802767fe: 48 c1 e0 08 shl $0x8,%rax
ffffffff80276802: 48 29 d0 sub %rdx,%rax
ffffffff80276805: 4c 89 a4 06 80 00 00 mov %r12,0x80(%rsi,%rax,1)
ffffffff8027680c: 00
ffffffff8027680d: 48 89 ca mov %rcx,%rdx
ffffffff80276810: 48 89 c8 mov %rcx,%rax
ffffffff80276813: 48 c1 e2 05 shl $0x5,%rdx
ffffffff80276817: 48 c1 e0 08 shl $0x8,%rax
ffffffff8027681b: 48 29 d0 sub %rdx,%rax
ffffffff8027681e: 48 8d 04 06 lea (%rsi,%rax,1),%rax
ffffffff80276822: 4c 01 a0 90 00 00 00 add %r12,0x90(%rax)
ffffffff80276829: 48 83 80 98 00 00 00 addq $0x1,0x98(%rax)
ffffffff80276830: 01
ffffffff80276831: e9 f8 fe ff ff jmpq ffffffff8027672e <lock_release_holdtime+0x1e>
ffffffff80276836: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
ffffffff8027683d: 00 00 00
ffffffff80276840: 48 89 ca mov %rcx,%rdx
ffffffff80276843: 48 89 c8 mov %rcx,%rax
ffffffff80276846: 48 c1 e2 05 shl $0x5,%rdx
ffffffff8027684a: 48 c1 e0 08 shl $0x8,%rax
ffffffff8027684e: 48 29 d0 sub %rdx,%rax
ffffffff80276851: 48 8d 04 06 lea (%rsi,%rax,1),%rax
ffffffff80276855: 4c 3b a0 a8 00 00 00 cmp 0xa8(%rax),%r12
ffffffff8027685c: 7e 07 jle ffffffff80276865 <lock_release_holdtime+0x155>
ffffffff8027685e: 4c 89 a0 a8 00 00 00 mov %r12,0xa8(%rax)
ffffffff80276865: 48 89 ca mov %rcx,%rdx
ffffffff80276868: 48 89 c8 mov %rcx,%rax
ffffffff8027686b: 48 c1 e2 05 shl $0x5,%rdx
ffffffff8027686f: 48 c1 e0 08 shl $0x8,%rax
ffffffff80276873: 48 29 d0 sub %rdx,%rax
ffffffff80276876: 48 8b 84 06 a0 00 00 mov 0xa0(%rsi,%rax,1),%rax
ffffffff8027687d: 00
ffffffff8027687e: 49 39 c4 cmp %rax,%r12
ffffffff80276881: 7c 05 jl ffffffff80276888 <lock_release_holdtime+0x178>
ffffffff80276883: 48 85 c0 test %rax,%rax
ffffffff80276886: 75 19 jne ffffffff802768a1 <lock_release_holdtime+0x191>
ffffffff80276888: 48 89 ca mov %rcx,%rdx
ffffffff8027688b: 48 89 c8 mov %rcx,%rax
ffffffff8027688e: 48 c1 e2 05 shl $0x5,%rdx
ffffffff80276892: 48 c1 e0 08 shl $0x8,%rax
ffffffff80276896: 48 29 d0 sub %rdx,%rax
ffffffff80276899: 4c 89 a4 06 a0 00 00 mov %r12,0xa0(%rsi,%rax,1)
ffffffff802768a0: 00
ffffffff802768a1: 48 89 ca mov %rcx,%rdx
ffffffff802768a4: 48 89 c8 mov %rcx,%rax
ffffffff802768a7: 48 c1 e2 05 shl $0x5,%rdx
ffffffff802768ab: 48 c1 e0 08 shl $0x8,%rax
ffffffff802768af: 48 29 d0 sub %rdx,%rax
ffffffff802768b2: 48 8d 04 06 lea (%rsi,%rax,1),%rax
ffffffff802768b6: 4c 01 a0 b0 00 00 00 add %r12,0xb0(%rax)
ffffffff802768bd: 48 83 80 b8 00 00 00 addq $0x1,0xb8(%rax)
ffffffff802768c4: 01
ffffffff802768c5: e9 64 fe ff ff jmpq ffffffff8027672e <lock_release_holdtime+0x1e>
ffffffff802768ca: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
ffffffff802768d0: 8b 05 6a 30 d3 00 mov 0xd3306a(%rip),%eax # ffffffff80fa9940 <oops_in_progress>
ffffffff802768d6: 85 c0 test %eax,%eax
ffffffff802768d8: 74 07 je ffffffff802768e1 <lock_release_holdtime+0x1d1>
ffffffff802768da: 31 c9 xor %ecx,%ecx
ffffffff802768dc: e9 95 fe ff ff jmpq ffffffff80276776 <lock_release_holdtime+0x66>
ffffffff802768e1: e8 aa 2c 20 00 callq ffffffff80479590 <debug_locks_off>
ffffffff802768e6: 85 c0 test %eax,%eax
ffffffff802768e8: 74 f0 je ffffffff802768da <lock_release_holdtime+0x1ca>
ffffffff802768ea: 8b 05 d0 ed 51 01 mov 0x151edd0(%rip),%eax # ffffffff817956c0 <debug_locks_silent>
ffffffff802768f0: 85 c0 test %eax,%eax
ffffffff802768f2: 75 e6 jne ffffffff802768da <lock_release_holdtime+0x1ca>
ffffffff802768f4: 31 d2 xor %edx,%edx
ffffffff802768f6: be 83 00 00 00 mov $0x83,%esi
ffffffff802768fb: 48 c7 c7 2c 73 b9 80 mov $0xffffffff80b9732c,%rdi
ffffffff80276902: 31 c0 xor %eax,%eax
ffffffff80276904: e8 d7 5b fd ff callq ffffffff8024c4e0 <warn_slowpath>
ffffffff80276909: 31 c9 xor %ecx,%ecx
ffffffff8027690b: e9 66 fe ff ff jmpq ffffffff80276776 <lock_release_holdtime+0x66>
Ingo Molnar
2009-01-15 13:35:12 UTC
Permalink
Post by Ingo Molnar
PANIC: early exception 0e rip 10:ffffffff80276855: error ? cr2 6688
- gcc version 4.3.2 20081007 (Red Hat 4.3.2-6) (GCC)
- binutils-2.18.50.0.6-2.x86_64
config attached. You can find the disassembly of lock_release_holdtime()
the tree i tested it with was:

d5fd351: x86: misc clean up after the percpu update

Ingo
Ingo Molnar
2009-01-15 13:40:02 UTC
Permalink
Post by Ingo Molnar
PANIC: early exception 0e rip 10:ffffffff80276855: error ? cr2 6688
- gcc version 4.3.2 20081007 (Red Hat 4.3.2-6) (GCC)
- binutils-2.18.50.0.6-2.x86_64
config attached. You can find the disassembly of lock_release_holdtime()
Gut feeling: we must have gotten a window where the PDA is not right. Note
how it crashes in lock_release_holdtime() - that is CONFIG_LOCK_STAT
instrumentation - very lowlevel and pervasive. Function tracing is also
enabled although it should be inactive at the early stages.

So i'd take a good look at the PDA setup portion of the boot code, and see
whether some spinlock acquire/release can slip in while the
PDA/percpu-area is not reliable.

Ingo
Ingo Molnar
2009-01-15 22:57:59 UTC
Permalink
Post by Ingo Molnar
Post by Ingo Molnar
PANIC: early exception 0e rip 10:ffffffff80276855: error ? cr2 6688
- gcc version 4.3.2 20081007 (Red Hat 4.3.2-6) (GCC)
- binutils-2.18.50.0.6-2.x86_64
config attached. You can find the disassembly of lock_release_holdtime()
Gut feeling: we must have gotten a window where the PDA is not right.
Note how it crashes in lock_release_holdtime() - that is
CONFIG_LOCK_STAT instrumentation - very lowlevel and pervasive. Function
tracing is also enabled although it should be inactive at the early
stages.
So i'd take a good look at the PDA setup portion of the boot code, and
see whether some spinlock acquire/release can slip in while the
PDA/percpu-area is not reliable.
Btw., if this turns out to be a genuine linker bug due to us crossing
RIP-relative references from minus-1.9G-ish negative addresses up to
slightly-above-zero positive addresses (or something similar - we are
pushing the linker here quite a bit), we still have a contingency plan:

We can relocate the percpu symbols to small-negative addresses, and place
it so that the end of the dynamic percpu area ends at zero.

Then we could bootmem alloc the percpu size plus 4096 bytes. Voila: we've
got a page free for the canary field, at the end of the percpu area, and
placed just correctly for gcc to see it at %gs:40.
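
(A minimal sketch of that contingency, purely illustrative - the bootmem
call and layout below are assumptions, not code from this thread:)

	/* percpu symbols linked at negative offsets so the area ends at 0;
	 * one extra page past the per-CPU copy then holds the stack-protector
	 * canary, which gcc reads at %gs:40 */
	size = __per_cpu_end - __per_cpu_start;
	ptr  = alloc_bootmem(size + PAGE_SIZE);
	memcpy(ptr, __per_cpu_load, size);
	wrmsrl(MSR_GS_BASE, (unsigned long)ptr + size);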

But this isnt as totally clean and simple as your current patchset, so
lets only do it if needed. It will also put some constraints on how we can
shape dynamic percpu areas, and an overflow near the end of the dynamic
percpu area could affect the canary - but still, it's a workable path as
well, should the zero-based approach fail.

OTOH, this crash does not have the feeling of a linker bug to me - it has
the feeling of a setup ordering anomaly.

Ingo
Tejun Heo
2009-01-16 01:28:48 UTC
Permalink
EXPORT_PER_CPU_SYMBOL() got misplaced during a merge, leading to a build
failure. Fix it.

Signed-off-by: Tejun Heo <***@kernel.org>
---
arch/x86/kernel/setup_percpu.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm/mach-integrator/clock.h b/arch/arm/mach-integrator/clock.h
deleted file mode 100644
index e69de29..0000000
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index daeedf8..b5c35af 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -86,9 +86,8 @@ void __cpuinit load_pda_offset(int cpu)
}
#ifndef CONFIG_SMP
DEFINE_PER_CPU(struct x8664_pda, __pda);
-EXPORT_PER_CPU_SYMBOL(__pda);
#endif
-
+EXPORT_PER_CPU_SYMBOL(__pda);
#endif /* CONFIG_SMP && CONFIG_X86_64 */

#ifdef CONFIG_X86_64
Tejun Heo
2009-01-16 03:26:25 UTC
Permalink
On x86_64, if get_per_cpu_var() is used before the per cpu area is set up
(which happens if lockdep is turned on), this_cpu_off needs to point
to __per_cpu_load. Initialize it accordingly.

Signed-off-by: Tejun Heo <***@kernel.org>
---
Okay, this fixes the problem. Tested with gcc-4.3.2 and binutils-2.19
(openSUSE 11.1) on all four variants I can test (64-smp, 64-up,
32-smp, 32-up). This and the previous merge fix patch are available
in the following git branch.

git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu

arch/x86/kernel/smpcommon.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 84395fa..7e15781 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -3,8 +3,13 @@
*/
#include <linux/module.h>
#include <asm/smp.h>
+#include <asm/sections.h>

+#ifdef CONFIG_X86_64
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = (unsigned long)__per_cpu_load;
+#else
DEFINE_PER_CPU(unsigned long, this_cpu_off);
+#endif
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

#ifdef CONFIG_X86_32
--
1.6.0.2

Ingo Molnar
2009-01-16 13:16:39 UTC
Permalink
Post by Tejun Heo
On x86_64, if get_per_cpu_var() is used before the per cpu area is set up
(which happens if lockdep is turned on), this_cpu_off needs to point
to __per_cpu_load. Initialize it accordingly.
---
Okay, this fixes the problem. Tested with gcc-4.3.2 and binutils-2.19
(openSUSE 11.1) on all four variants I can test (64-smp, 64-up,
32-smp, 32-up). This and the previous merge fix patch are available
in the following git branch.
cool!
Post by Tejun Heo
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
arch/x86/kernel/smpcommon.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 84395fa..7e15781 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -3,8 +3,13 @@
*/
#include <linux/module.h>
#include <asm/smp.h>
+#include <asm/sections.h>
+#ifdef CONFIG_X86_64
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = (unsigned long)__per_cpu_load;
+#else
DEFINE_PER_CPU(unsigned long, this_cpu_off);
+#endif
I've pulled your tree, but couldnt we do this symmetrically in the 32-bit
case too and avoid the ugly #ifdef somehow?

Ingo
Tejun Heo
2009-01-16 13:48:13 UTC
Permalink
Post by Ingo Molnar
Post by Tejun Heo
+#ifdef CONFIG_X86_64
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = (unsigned long)__per_cpu_load;
+#else
DEFINE_PER_CPU(unsigned long, this_cpu_off);
+#endif
I've pulled your tree, but couldnt we do this symmetrically in the 32-bit
case too and avoid the ugly #ifdef somehow?
You can take the "x86_32: make percpu symbols zerobased on SMP" patch
and the above ifdef won't be necessary, along with a similar one in
setup_percpu.c. The above ifdef exists because 64-bit is zero based while
32-bit is not. If you want something like, say, __per_cpu_base_off
to be defined conditionally and used in these two cases, I'm not sure
whether that would clean up the code or obfuscate it. :-)
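
Roughly, that idea would look like the following (a hypothetical sketch
only - __per_cpu_base_off is just the name floated above, not a posted
patch):

	#ifdef CONFIG_X86_64
	/* percpu symbols are zero-based, so the initial offset is the
	   address the init copy is linked/loaded at */
	# define __per_cpu_base_off	((unsigned long)__per_cpu_load)
	#else
	/* 32-bit symbols are not zero-based (yet), no initial offset needed */
	# define __per_cpu_base_off	0UL
	#endif

	DEFINE_PER_CPU(unsigned long, this_cpu_off) = __per_cpu_base_off;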

Thanks.
--
tejun
Tejun Heo
2009-01-15 21:55:59 UTC
Permalink
Hello, Ingo.
Post by Ingo Molnar
Post by Ingo Molnar
PANIC: early exception 0e rip 10:ffffffff80276855: error ? cr2 6688
- gcc version 4.3.2 20081007 (Red Hat 4.3.2-6) (GCC)
- binutils-2.18.50.0.6-2.x86_64
config attached. You can find the disassembly of lock_release_holdtime()
Gut feeling: we must have gotten a window where the PDA is not right. Note
how it crashes in lock_release_holdtime() - that is CONFIG_LOCK_STAT
instrumentation - very lowlevel and pervasive. Function tracing is also
enabled although it should be inactive at the early stages.
So i'd take a good look at the PDA setup portion of the boot code, and see
whether some spinlock acquire/release can slip in while the
PDA/percpu-area is not reliable.
Heh... I thought I had this covered pretty well. I'll get onto
debugging in a few hours. Thanks for testing.
--
tejun
Christoph Lameter
2009-01-15 21:03:03 UTC
Permalink
Post by Ingo Molnar
- i standardized the commit logs to the usual x86 style and added
Original-From: Mike Travis tags to those patches that were derived from
Mike's patches.
Most of that is based on earlier patches by me.
Tejun Heo
2009-01-16 09:42:30 UTC
Permalink
Post by Christoph Lameter
Post by Ingo Molnar
- i standardized the commit logs to the usual x86 style and added
Original-From: Mike Travis tags to those patches that were derived from
Mike's patches.
Most of that is based on earlier patches by me.
Sorry, I didn't know that. Ingo, please feel free to re-base the tree
with updated credits.

Thanks.
--
tejun
Ingo Molnar
2009-01-16 13:24:39 UTC
Permalink
Post by Tejun Heo
Post by Christoph Lameter
Post by Ingo Molnar
- i standardized the commit logs to the usual x86 style and added
Original-From: Mike Travis tags to those patches that were derived from
Mike's patches.
Most of that is based on earlier patches by me.
Sorry, I didn't know that. Ingo, please feel free to re-base the tree
with updated credits.
Sure, i've rebased it and added this to the front of every affected patch:

[ Based on original patch from Christoph Lameter and Mike Travis. ]

I've also first pulled your latest tree, please base new changes and new
pull requests on the tip/x86/percpu tree i just pushed out.

I also merged tip/x86/percpu into tip/master and resumed testing it.

I suspect the next step will be to integrate Brian's "get rid of the rest
of the PDA" patches?

Ingo
Tejun Heo
2009-01-16 13:50:04 UTC
Permalink
Hello,
Post by Ingo Molnar
Post by Tejun Heo
Post by Christoph Lameter
Most of that is based on earlier patches by me.
Sorry, I didn't know that. Ingo, please feel free to re-base the tree
with updated credits.
[ Based on original patch from Christoph Lameter and Mike Travis. ]
Good.
Post by Ingo Molnar
I've also first pulled your latest tree, please base new changes and new
pull requests on the tip/x86/percpu tree i just pushed out.
I also merged tip/x86/percpu into tip/master and resumed testing it.
Great.
Post by Ingo Molnar
I suspect the next step will be to integrate Brian's "get rid of the rest
of the PDA" patches?
Yeap, Brian, any progress? If you don't have the time, feel free to
just shoot the patches my way.

Thanks.
--
tejun
Tejun Heo
2009-01-17 06:31:31 UTC
Permalink
The newly added PERCPU_*() macros define and use __per_cpu_load, but
VMLINUX_SYMBOL() was missing from its usages, causing build failures on
archs where the linker-visible symbol differs from the C symbol
(e.g. blackfin).
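
For context, the symbol-prefix mechanism roughly works like this (a
sketch; the exact blackfin override may differ):

	/* asm-generic default: the C name is also the linker name */
	#ifndef VMLINUX_SYMBOL
	#define VMLINUX_SYMBOL(_sym_) _sym_
	#endif

	/* prefixed archs override it before including the header, e.g.: */
	#define VMLINUX_SYMBOL(_sym_) _##_sym_

With the prefix in place only ___per_cpu_load exists in the symbol table,
so a bare __per_cpu_load inside the AT() expression does not resolve.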

Signed-off-by: Tejun Heo <***@kernel.org>
---
Just realized this was a reply to a private mail. Restoring the cc list
and resending.

Thanks.

include/asm-generic/vmlinux.lds.h | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e53319c..aa6b9b1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -432,13 +432,14 @@

#define PERCPU_PROLOG(vaddr) \
VMLINUX_SYMBOL(__per_cpu_load) = .; \
- .data.percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \
+ .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
+ - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__per_cpu_start) = .;

#define PERCPU_EPILOG(phdr) \
VMLINUX_SYMBOL(__per_cpu_end) = .; \
} phdr \
- . = __per_cpu_load + SIZEOF(.data.percpu);
+ . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);

/**
* PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
--
1.6.0.2

Tejun Heo
2009-01-17 06:32:55 UTC
Permalink
arm, ia64 and powerpc were missing .data.percpu.page_aligned in their
percpu output section definitions. Add it.

Signed-off-by: Tejun Heo <***@kernel.org>
---
This and the previous patch are available in the following git branch.

git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu

The tree is on top of the rebased tip/core/percpu (cd3adf52).

Thanks.

arch/arm/kernel/vmlinux.lds.S | 1 +
arch/ia64/kernel/vmlinux.lds.S | 1 +
arch/powerpc/kernel/vmlinux.lds.S | 1 +
3 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 0021607..85598f7 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -65,6 +65,7 @@ SECTIONS
#endif
. = ALIGN(4096);
__per_cpu_start = .;
+ *(.data.percpu.page_aligned)
*(.data.percpu)
*(.data.percpu.shared_aligned)
__per_cpu_end = .;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 10a7d47..f45e4e5 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -219,6 +219,7 @@ SECTIONS
.data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
{
__per_cpu_start = .;
+ *(.data.percpu.page_aligned)
*(.data.percpu)
*(.data.percpu.shared_aligned)
__per_cpu_end = .;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 47bf15c..04e8ece 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -182,6 +182,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
__per_cpu_start = .;
+ *(.data.percpu.page_aligned)
*(.data.percpu)
*(.data.percpu.shared_aligned)
__per_cpu_end = .;
--
1.6.0.2

Ingo Molnar
2009-01-17 08:22:15 UTC
Permalink
Post by Tejun Heo
arm, ia64 and powerpc were missing
.data.percpu.page_aligned in their percpu output section definitions.
Add it.
---
This and the previous patch are available in the following git branch.
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
The tree is on top of the rebased tip/core/percpu (cd3adf52).
Thanks.
Pulled into tip/core/percpu, thanks Tejun!

Ingo
Christoph Lameter
2009-01-22 00:35:24 UTC
Permalink
Reviewed-by: Christoph Lameter <***@linux-foundation.org>

Christoph Lameter
2009-01-21 23:49:06 UTC
Permalink
Yup we forgot that one.

Reviewed-by: Christoph Lameter <***@linux-foundation.org>

(not sure about the exact tag to use. AFAICT I wrote some early version
of this code)


Tejun Heo
2009-01-15 10:27:38 UTC
Permalink
Hello, Ingo.
Post by Ingo Molnar
Tejun, could you please also add the patch below to your lineup too?
Sure thing.
Post by Ingo Molnar
It is an optimization and a cleanup, and adds the following new generic percpu methods:
percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_or()
percpu_xor()
and implements support for them on x86. (other architectures will fall
back to a default implementation)
The advantage is that for example to read a local percpu variable, instead of this sequence:
return __get_cpu_var(var);
ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx
ffffffff8102ca32: 81
ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax
ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax
We can get a single instruction by using the optimized variants:
return percpu_read(var);
ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax
I also cleaned up the x86-specific APIs and made the x86 code use these
new generic percpu primitives.
It looks quite hard to convince the compiler to generate the optimized
single-instruction sequence for us out of __get_cpu_var(var) - or can you
perhaps see a way to do it?
Yeah, I thought about that too but couldn't think of a way to persuade
the compiler, because the compiler doesn't know how to access the
address. I'll play with it a bit more, but the clumsy percpu_*()
accessors might well be the only way. :-(
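
For reference, the single-instruction form basically has to come from an
asm template along these lines (a simplified sketch of just the 8-byte
case; percpu_read_sketch is a made-up name, the real macro dispatches on
sizeof):

	#define percpu_read_sketch(var)				\
	({							\
		typeof(per_cpu_var(var)) ret__;			\
		asm("movq "__percpu_seg_str"%1,%0"		\
		    : "=r" (ret__)				\
		    : "m" (per_cpu_var(var)));			\
		ret__;						\
	})

The ({ }) expression yields a value, not an lvalue, which is why
__get_cpu_var() can't simply be redefined in terms of it.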

Thanks.
--
tejun
Ingo Molnar
2009-01-15 11:31:38 UTC
Permalink
Post by Tejun Heo
Hello, Ingo.
Post by Ingo Molnar
Tejun, could you please also add the patch below to your lineup too?
Sure thing.
Post by Ingo Molnar
It is an optimization and a cleanup, and adds the following new generic percpu methods:
percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_or()
percpu_xor()
and implements support for them on x86. (other architectures will fall
back to a default implementation)
The advantage is that for example to read a local percpu variable, instead of this sequence:
return __get_cpu_var(var);
ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx
ffffffff8102ca32: 81
ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax
ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax
We can get a single instruction by using the optimized variants:
return percpu_read(var);
ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax
I also cleaned up the x86-specific APIs and made the x86 code use these
new generic percpu primitives.
It looks quite hard to convince the compiler to generate the optimized
single-instruction sequence for us out of __get_cpu_var(var) - or can you
perhaps see a way to do it?
Yeah, I thought about that too but couldn't think of a way to persuade
the compiler, because the compiler doesn't know how to access the
address. I'll play with it a bit more, but the clumsy percpu_*()
accessors might well be the only way. :-(
the new ops are a pretty nice and clean solution i think.

Firstly, accessing the current CPU is the only safe shortcut anyway (that
is where we can do %fs/%gs / rip-relative addressing modes), and the
generic per_cpu() APIs don't really provide that guarantee for us. We might
be able to hook into __get_cpu_var(), but those both have to be
lvalues and are also relatively rarely used.

So introducing the new, rather straightforward APIs and using them
wherever they matter for performance is good. Your patchset already shaved
off an instruction from ordinary per_cpu() accesses, so it's all moving
rather close to the most-optimal situation already.

Ingo
Tejun Heo
2009-01-15 11:39:56 UTC
Permalink
Hello,
The new ops are a pretty nice and clean solution i think.
Firstly, accessing the current CPU is the only safe shortcut anyway (that
is where we can do %fs/%gs / rip-relative addressing modes), and the
generic per_cpu() APIs don't really provide that guarantee for us. We might
be able to hook into __get_cpu_var(), but those both have to be
lvalues and are also relatively rarely used.
So introducing the new, rather straightforward APIs and using them
wherever they matter for performance is good. Your patchset already shaved
off an instruction from ordinary per_cpu() accesses, so it's all moving
rather close to the most-optimal situation already.
Yeah, I don't think we can do much better than those ops. I have two
issues tho.

1. percpu_and() is missing. I added it for completeness's sake.

2. The generic percpu_op() should be local to the cpu, so it should
expand to...

do { get_cpu_var(var) OP (val); put_cpu_var(var); } while (0)

as the original x86_OP_percpu() did. Right?

Thanks.
--
tejun
Ingo Molnar
2009-01-15 12:27:09 UTC
Permalink
Post by Tejun Heo
Hello,
The new ops are a pretty nice and clean solution i think.
Firstly, accessing the current CPU is the only safe shortcut anyway (there
is where we can do %fs/%gs / rip-relative addressing modes), and the
generic per_cpu() APIs dont really provide that guarantee for us. We might
be able to hook into __get_cpu_var() but those both require to be an
lvalue and are also relatively rarely used.
So introducing the new, rather straightforward APIs and using them
wherever they matter for performance is good. Your patchset already shaved
off an instruction from ordinary per_cpu() accesses, so it's all moving
rather close to the most-optimal situation already.
Yeah, I don't think we can do much better than those ops. I have two
issues tho.
1. percpu_and() is missing. I added it for completeness's sake.
Sure - it would be commonly used as well. Perhaps we don't need
percpu_xor() at all? (the or and and ops already give a complete algebra)
Post by Tejun Heo
2. The generic percpu_op() should be local to the cpu, so it should
expand to...
do { get_cpu_var(var) OP (val); put_cpu_var(var); } while (0)
as the original x86_OP_percpu() did. Right?
Thanks.
hm, that removes much of its appeal - a preempt off+on sequence is quite
bloaty. Most percpu usage sites are already within critical sections.

I think they are more analogous to per_cpu(var, cpu), which does not
disable preemption either. There's no 'get/put' in them, which signals
that they dont auto-disable preemption.

Ingo
Tejun Heo
2009-01-15 13:06:18 UTC
Permalink
Post by Ingo Molnar
Post by Tejun Heo
Yeah, I don't think we can do much better than those ops. I have two
issues tho.
1. percpu_and() is missing. I added it for completeness's sake.
Sure - it would be commonly used as well. Perhaps we dont need
percpu_xor() at all? (or and and ops already give a complete algebra)
Maybe... I don't know.
Post by Ingo Molnar
Post by Tejun Heo
2. The generic percpu_op() should be local to the cpu, so it should
expand to...
do { get_cpu_var(var) OP (val); put_cpu_var(var); } while (0)
as the original x86_OP_percpu() did. Right?
Thanks.
hm, that removes much of its appeal - a preempt off+on sequence is quite
bloaty. Most percpu usage sites are already within critical sections.
I think they are more analogous to per_cpu(var, cpu), which does not
disable preemption either. There's no 'get/put' in them, which signals
that they dont auto-disable preemption.
Yeah, that's exactly why x86 has those specific ops in the first
place: if they can be made a single instruction, there's no need to
mess with preemption, which greatly reduces the cost of the
operations. The generic implementation, however, should be guarded by
preemption disable/enable to guarantee correct operation and maintain
the same, correct semantics (being preempted between the load and
store instructions implementing += can be disastrous, for example).
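
To make that concrete, this is roughly what an *unguarded* generic
fallback would look like, and why it is wrong (illustrative only - the
actual generic version below wraps the access in get/put_cpu_var()):

	/*
	 * BROKEN sketch: no preemption protection around the
	 * read-modify-write. If the task is preempted and migrated
	 * between the implicit load and store, it can write a stale
	 * value back, racing with whatever now runs on the original CPU.
	 */
	#define percpu_add_unsafe(var, val)		\
	do {						\
		__get_cpu_var(var) += (val);		\
	} while (0)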

From the comment I added on top of the generic ops....

/*
* Optional methods for optimized non-lvalue per-cpu variable access.
*
* @var can be a percpu variable or a field of it and its size should
* equal char, int or long. percpu_read() evaluates to a lvalue and
* all others to void.
*
* These operations are guaranteed to be atomic w.r.t. preemption.
* The generic versions use plain get/put_cpu_var(). Archs are
* encouraged to implement single-instruction alternatives which don't
* require preemption protection.
*/

And the updated patch looks like...

---
From: Ingo Molnar <***@elte.hu>
Subject: percpu: add optimized generic percpu accessors

It is an optimization and a cleanup, and adds the following new
generic percpu methods:

percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_or()
percpu_xor()

and implements support for them on x86. (other architectures will fall
back to a default implementation)

The advantage is that for example to read a local percpu variable,
instead of this sequence:

return __get_cpu_var(var);

ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx
ffffffff8102ca32: 81
ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax
ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

return percpu_read(var);

ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax

I also cleaned up the x86-specific APIs and made the x86 code use
these new generic percpu primitives.

tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
* added percpu_and() for completeness's sake
* made generic percpu ops atomic against preemption

Signed-off-by: Ingo Molnar <***@elte.hu>
Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Roel Kluin <***@gmail.com>
---
arch/x86/include/asm/current.h | 2 -
arch/x86/include/asm/irq_regs_32.h | 4 +-
arch/x86/include/asm/mmu_context_32.h | 12 +++----
arch/x86/include/asm/pda.h | 10 +++---
arch/x86/include/asm/percpu.h | 24 ++++++++-------
arch/x86/include/asm/smp.h | 2 -
arch/x86/kernel/process_32.c | 2 -
arch/x86/kernel/tlb_32.c | 10 +++---
arch/x86/mach-voyager/voyager_smp.c | 4 +-
arch/x86/xen/enlighten.c | 14 ++++-----
arch/x86/xen/irq.c | 8 ++---
arch/x86/xen/mmu.c | 2 -
arch/x86/xen/multicalls.h | 2 -
arch/x86/xen/smp.c | 2 -
include/asm-generic/percpu.h | 52 ++++++++++++++++++++++++++++++++++
15 files changed, 102 insertions(+), 48 deletions(-)

Index: work/arch/x86/include/asm/current.h
===================================================================
--- work.orig/arch/x86/include/asm/current.h
+++ work/arch/x86/include/asm/current.h
@@ -10,7 +10,7 @@ struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
- return x86_read_percpu(current_task);
+ return percpu_read(current_task);
}

#else /* X86_32 */
Index: work/arch/x86/include/asm/irq_regs_32.h
===================================================================
--- work.orig/arch/x86/include/asm/irq_regs_32.h
+++ work/arch/x86/include/asm/irq_regs_32.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_re

static inline struct pt_regs *get_irq_regs(void)
{
- return x86_read_percpu(irq_regs);
+ return percpu_read(irq_regs);
}

static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_re
struct pt_regs *old_regs;

old_regs = get_irq_regs();
- x86_write_percpu(irq_regs, new_regs);
+ percpu_write(irq_regs, new_regs);

return old_regs;
}
Index: work/arch/x86/include/asm/mmu_context_32.h
===================================================================
--- work.orig/arch/x86/include/asm/mmu_context_32.h
+++ work/arch/x86/include/asm/mmu_context_32.h
@@ -4,8 +4,8 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_s
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- x86_write_percpu(cpu_tlbstate.active_mm, next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);

@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_s
}
#ifdef CONFIG_SMP
else {
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
Index: work/arch/x86/include/asm/pda.h
===================================================================
--- work.orig/arch/x86/include/asm/pda.h
+++ work/arch/x86/include/asm/pda.h
@@ -45,11 +45,11 @@ extern void pda_init(int);

#define cpu_pda(cpu) (&per_cpu(__pda, cpu))

-#define read_pda(field) x86_read_percpu(__pda.field)
-#define write_pda(field, val) x86_write_percpu(__pda.field, val)
-#define add_pda(field, val) x86_add_percpu(__pda.field, val)
-#define sub_pda(field, val) x86_sub_percpu(__pda.field, val)
-#define or_pda(field, val) x86_or_percpu(__pda.field, val)
+#define read_pda(field) percpu_read(__pda.field)
+#define write_pda(field, val) percpu_write(__pda.field, val)
+#define add_pda(field, val) percpu_add(__pda.field, val)
+#define sub_pda(field, val) percpu_sub(__pda.field, val)
+#define or_pda(field, val) percpu_or(__pda.field, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define test_and_clear_bit_pda(bit, field) \
Index: work/arch/x86/include/asm/percpu.h
===================================================================
--- work.orig/arch/x86/include/asm/percpu.h
+++ work/arch/x86/include/asm/percpu.h
@@ -40,16 +40,11 @@

#ifdef CONFIG_SMP
#define __percpu_seg_str "%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+#define __my_cpu_offset percpu_read(this_cpu_off)
#else
#define __percpu_seg_str
#endif

-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -115,11 +110,13 @@ do { \
ret__; \
})

-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var) \
@@ -131,6 +128,11 @@ do { \
old__; \
})

+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
#ifdef CONFIG_X86_64
extern void load_pda_offset(int cpu);
#else
Index: work/arch/x86/include/asm/smp.h
===================================================================
--- work.orig/arch/x86/include/asm/smp.h
+++ work/arch/x86/include/asm/smp.h
@@ -163,7 +163,7 @@ extern unsigned disabled_cpus __cpuinitd
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
extern int safe_smp_processor_id(void);

#elif defined(CONFIG_X86_64_SMP)
Index: work/arch/x86/kernel/process_32.c
===================================================================
--- work.orig/arch/x86/kernel/process_32.c
+++ work/arch/x86/kernel/process_32.c
@@ -592,7 +592,7 @@ __switch_to(struct task_struct *prev_p,
if (prev->gs | next->gs)
loadsegment(gs, next->gs);

- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);

return prev_p;
}
Index: work/arch/x86/kernel/tlb_32.c
===================================================================
--- work.orig/arch/x86/kernel/tlb_32.c
+++ work/arch/x86/kernel/tlb_32.c
@@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
*/
void leave_mm(int cpu)
{
- BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
- cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
+ BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
+ cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_
* BUG();
*/

- if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -237,7 +237,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();

__flush_tlb_all();
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}

Index: work/arch/x86/mach-voyager/voyager_smp.c
===================================================================
--- work.orig/arch/x86/mach-voyager/voyager_smp.c
+++ work/arch/x86/mach-voyager/voyager_smp.c
@@ -410,7 +410,7 @@ void __init find_smp_config(void)
VOYAGER_SUS_IN_CONTROL_PORT);

current_thread_info()->cpu = boot_cpu_id;
- x86_write_percpu(cpu_number, boot_cpu_id);
+ percpu_write(cpu_number, boot_cpu_id);
}

/*
@@ -1790,7 +1790,7 @@ static void __init voyager_smp_cpus_done
void __init smp_setup_processor_id(void)
{
current_thread_info()->cpu = hard_smp_processor_id();
- x86_write_percpu(cpu_number, hard_smp_processor_id());
+ percpu_write(cpu_number, hard_smp_processor_id());
}

static void voyager_send_call_func(cpumask_t callmask)
Index: work/arch/x86/xen/enlighten.c
===================================================================
--- work.orig/arch/x86/xen/enlighten.c
+++ work/arch/x86/xen/enlighten.c
@@ -702,17 +702,17 @@ static void xen_write_cr0(unsigned long

static void xen_write_cr2(unsigned long cr2)
{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+ percpu_read(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
+ return percpu_read(xen_vcpu)->arch.cr2;
}

static unsigned long xen_read_cr2_direct(void)
{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
+ return percpu_read(xen_vcpu_info.arch.cr2);
}

static void xen_write_cr4(unsigned long cr4)
@@ -725,12 +725,12 @@ static void xen_write_cr4(unsigned long

static unsigned long xen_read_cr3(void)
{
- return x86_read_percpu(xen_cr3);
+ return percpu_read(xen_cr3);
}

static void set_current_cr3(void *v)
{
- x86_write_percpu(xen_current_cr3, (unsigned long)v);
+ percpu_write(xen_current_cr3, (unsigned long)v);
}

static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -755,7 +755,7 @@ static void __xen_write_cr3(bool kernel,
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

if (kernel) {
- x86_write_percpu(xen_cr3, cr3);
+ percpu_write(xen_cr3, cr3);

/* Update xen_current_cr3 once the batch has actually
been submitted. */
@@ -771,7 +771,7 @@ static void xen_write_cr3(unsigned long

/* Update while interrupts are disabled, so its atomic with
respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
+ percpu_write(xen_cr3, cr3);

__xen_write_cr3(true, cr3);

Index: work/arch/x86/xen/irq.c
===================================================================
--- work.orig/arch/x86/xen/irq.c
+++ work/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
struct vcpu_info *vcpu;
unsigned long flags;

- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);

/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();

@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}

@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */

- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;

/* Doesn't matter if we get preempted here, because any
Index: work/arch/x86/xen/mmu.c
===================================================================
--- work.orig/arch/x86/xen/mmu.c
+++ work/arch/x86/xen/mmu.c
@@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info

/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
load_cr3(swapper_pg_dir);
arch_flush_lazy_cpu_mode();
}
Index: work/arch/x86/xen/multicalls.h
===================================================================
--- work.orig/arch/x86/xen/multicalls.h
+++ work/arch/x86/xen/multicalls.h
@@ -39,7 +39,7 @@ static inline void xen_mc_issue(unsigned
xen_mc_flush();

/* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+ local_irq_restore(percpu_read(xen_mc_irq_flags));
}

/* Set up a callback to be called when the current batch is flushed */
Index: work/arch/x86/xen/smp.c
===================================================================
--- work.orig/arch/x86/xen/smp.c
+++ work/arch/x86/xen/smp.c
@@ -78,7 +78,7 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();

cpu_set(cpu, cpu_online_map);
- x86_write_percpu(cpu_state, CPU_ONLINE);
+ percpu_write(cpu_state, CPU_ONLINE);
wmb();

/* We can take interrupts now: we're officially "up". */
Index: work/include/asm-generic/percpu.h
===================================================================
--- work.orig/include/asm-generic/percpu.h
+++ work/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)

+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it and its size should
+ * equal char, int or long. percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var(). Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
+
+#define __percpu_generic_to_op(var, val, op) \
+do { \
+ get_cpu_var(var) op val; \
+ put_cpu_var(var); \
+} while (0)
+
+#ifndef percpu_write
+# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =)
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=)
+#endif
+
+#ifndef percpu_sub
+# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=)
+#endif
+
+#ifndef percpu_and
+# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=)
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=)
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=)
+#endif
+
#endif /* _ASM_GENERIC_PERCPU_H_ */
Ingo Molnar
2009-01-15 13:08:28 UTC
Permalink
Post by Ingo Molnar
It is an optimization and a cleanup, and adds the following new
percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_or()
percpu_xor()
nit: add percpu_and() here ;-)
Post by Ingo Molnar
tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
* added percpu_and() for completeness's sake
* made generic percpu ops atomic against preemption
okay - as long as the optimized versions stay single-instruction i'm a
happy camper.

Ingo
Tejun Heo
2009-01-15 13:24:41 UTC
Permalink
From: Ingo Molnar <***@elte.hu>

It is an optimization and a cleanup, and adds the following new
generic percpu methods:

percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_and()
percpu_or()
percpu_xor()

and implements support for them on x86. (other architectures will fall
back to a default implementation)

The advantage is that for example to read a local percpu variable,
instead of this sequence:

return __get_cpu_var(var);

ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx
ffffffff8102ca32: 81
ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax
ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

return percpu_read(var);

ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax

I also cleaned up the x86-specific APIs and made the x86 code use
these new generic percpu primitives.

tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
* added percpu_and() for completeness's sake
* made generic percpu ops atomic against preemption

Signed-off-by: Ingo Molnar <***@elte.hu>
Signed-off-by: Tejun Heo <***@kernel.org>
Cc: Roel Kluin <***@gmail.com>
---
Okay, here's the updated patch. It's on top of x86/percpu[1]. You
can pull from... (or just take the mail, I don't have much problem
with rebasing, so whatever suits you better works for me).

git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu

[1] d5fd35177b763186a3f6288624094ee402f0a7e3

Thanks.

arch/x86/include/asm/current.h | 2 +-
arch/x86/include/asm/irq_regs_32.h | 4 +-
arch/x86/include/asm/mmu_context_32.h | 12 ++++----
arch/x86/include/asm/pda.h | 10 +++---
arch/x86/include/asm/percpu.h | 24 ++++++++-------
arch/x86/include/asm/smp.h | 2 +-
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/tlb_32.c | 10 +++---
arch/x86/mach-voyager/voyager_smp.c | 4 +-
arch/x86/xen/enlighten.c | 14 ++++----
arch/x86/xen/irq.c | 8 ++--
arch/x86/xen/mmu.c | 2 +-
arch/x86/xen/multicalls.h | 2 +-
arch/x86/xen/smp.c | 2 +-
include/asm-generic/percpu.h | 52 +++++++++++++++++++++++++++++++++
15 files changed, 102 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f..0728480 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,7 +10,7 @@ struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
- return x86_read_percpu(current_task);
+ return percpu_read(current_task);
}

#else /* X86_32 */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
index 86afd74..d7ed33e 100644
--- a/arch/x86/include/asm/irq_regs_32.h
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs);

static inline struct pt_regs *get_irq_regs(void)
{
- return x86_read_percpu(irq_regs);
+ return percpu_read(irq_regs);
}

static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
struct pt_regs *old_regs;

old_regs = get_irq_regs();
- x86_write_percpu(irq_regs, new_regs);
+ percpu_write(irq_regs, new_regs);

return old_regs;
}
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
index 7e98ce1..08b5345 100644
--- a/arch/x86/include/asm/mmu_context_32.h
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -4,8 +4,8 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- x86_write_percpu(cpu_tlbstate.active_mm, next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);

@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
}
#ifdef CONFIG_SMP
else {
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index e3d3a08..47f274f 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -45,11 +45,11 @@ extern void pda_init(int);

#define cpu_pda(cpu) (&per_cpu(__pda, cpu))

-#define read_pda(field) x86_read_percpu(__pda.field)
-#define write_pda(field, val) x86_write_percpu(__pda.field, val)
-#define add_pda(field, val) x86_add_percpu(__pda.field, val)
-#define sub_pda(field, val) x86_sub_percpu(__pda.field, val)
-#define or_pda(field, val) x86_or_percpu(__pda.field, val)
+#define read_pda(field) percpu_read(__pda.field)
+#define write_pda(field, val) percpu_write(__pda.field, val)
+#define add_pda(field, val) percpu_add(__pda.field, val)
+#define sub_pda(field, val) percpu_sub(__pda.field, val)
+#define or_pda(field, val) percpu_or(__pda.field, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define test_and_clear_bit_pda(bit, field) \
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 328b31a..03aa4b0 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -40,16 +40,11 @@

#ifdef CONFIG_SMP
#define __percpu_seg_str "%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+#define __my_cpu_offset percpu_read(this_cpu_off)
#else
#define __percpu_seg_str
#endif

-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -115,11 +110,13 @@ do { \
ret__; \
})

-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var) \
@@ -131,6 +128,11 @@ do { \
old__; \
})

+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
#ifdef CONFIG_X86_64
extern void load_pda_offset(int cpu);
#else
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1274154..c7bbbbe 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -160,7 +160,7 @@ extern unsigned disabled_cpus __cpuinitdata;
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
extern int safe_smp_processor_id(void);

#elif defined(CONFIG_X86_64_SMP)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55..77d5468 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -591,7 +591,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
loadsegment(gs, next->gs);

- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);

return prev_p;
}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index ec53818..e65449d 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
*/
void leave_mm(int cpu)
{
- BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
- cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
+ BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
+ cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
* BUG();
*/

- if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -222,7 +222,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();

__flush_tlb_all();
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}

diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 1a48368..96f15b0 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -402,7 +402,7 @@ void __init find_smp_config(void)
VOYAGER_SUS_IN_CONTROL_PORT);

current_thread_info()->cpu = boot_cpu_id;
- x86_write_percpu(cpu_number, boot_cpu_id);
+ percpu_write(cpu_number, boot_cpu_id);
}

/*
@@ -1782,7 +1782,7 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
void __init smp_setup_processor_id(void)
{
current_thread_info()->cpu = hard_smp_processor_id();
- x86_write_percpu(cpu_number, hard_smp_processor_id());
+ percpu_write(cpu_number, hard_smp_processor_id());
}

static void voyager_send_call_func(cpumask_t callmask)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 312414e..75b9413 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -695,17 +695,17 @@ static void xen_write_cr0(unsigned long cr0)

static void xen_write_cr2(unsigned long cr2)
{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+ percpu_read(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
+ return percpu_read(xen_vcpu)->arch.cr2;
}

static unsigned long xen_read_cr2_direct(void)
{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
+ return percpu_read(xen_vcpu_info.arch.cr2);
}

static void xen_write_cr4(unsigned long cr4)
@@ -718,12 +718,12 @@ static void xen_write_cr4(unsigned long cr4)

static unsigned long xen_read_cr3(void)
{
- return x86_read_percpu(xen_cr3);
+ return percpu_read(xen_cr3);
}

static void set_current_cr3(void *v)
{
- x86_write_percpu(xen_current_cr3, (unsigned long)v);
+ percpu_write(xen_current_cr3, (unsigned long)v);
}

static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -748,7 +748,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

if (kernel) {
- x86_write_percpu(xen_cr3, cr3);
+ percpu_write(xen_cr3, cr3);

/* Update xen_current_cr3 once the batch has actually
been submitted. */
@@ -764,7 +764,7 @@ static void xen_write_cr3(unsigned long cr3)

/* Update while interrupts are disabled, so its atomic with
respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
+ percpu_write(xen_cr3, cr3);

__xen_write_cr3(true, cr3);

diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb04260..2e82714 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
struct vcpu_info *vcpu;
unsigned long flags;

- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);

/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long flags)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();

@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}

@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */

- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;

/* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240..7bc7852 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info)

/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
load_cr3(swapper_pg_dir);
arch_flush_lazy_cpu_mode();
}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8589382..e786fa7 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -39,7 +39,7 @@ static inline void xen_mc_issue(unsigned mode)
xen_mc_flush();

/* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+ local_irq_restore(percpu_read(xen_mc_irq_flags));
}

/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 83fa423..3bfd6dd 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -78,7 +78,7 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();

cpu_set(cpu, cpu_online_map);
- x86_write_percpu(cpu_state, CPU_ONLINE);
+ percpu_write(cpu_state, CPU_ONLINE);
wmb();

/* We can take interrupts now: we're officially "up". */
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c6..00f45ff 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)

+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it and its size should
+ * equal char, int or long. percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var(). Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
+
+#define __percpu_generic_to_op(var, val, op) \
+do { \
+ get_cpu_var(var) op val; \
+ put_cpu_var(var); \
+} while (0)
+
+#ifndef percpu_write
+# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =)
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=)
+#endif
+
+#ifndef percpu_sub
+# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=)
+#endif
+
+#ifndef percpu_and
+# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=)
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=)
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=)
+#endif
+
#endif /* _ASM_GENERIC_PERCPU_H_ */
--
1.6.0.2

Ingo Molnar
2009-01-15 13:37:24 UTC
Permalink
Post by Tejun Heo
Okay, here's the updated patch. It's on top of x86/percpu[1]. You
can pull from... (or just take the mail, I don't have much problem
with rebasing, so whatever suits you better works for me).
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
[1] d5fd35177b763186a3f6288624094ee402f0a7e3
Pulled into tip/x86/percpu, thanks Tejun!

Ingo
Ingo Molnar
2009-01-15 18:03:37 UTC
Permalink
Post by Tejun Heo
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * equal char, int or long. percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var(). Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
I wonder if the preempt_disable()/preempt_enable() in here actually
does anything useful on any architecture.
Provides "this is IRQ safe" and "this is preempt safe" semantics.

Ingo
Andrew Morton
2009-01-15 18:35:36 UTC
Permalink
Post by Ingo Molnar
Post by Tejun Heo
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * equal char, int or long. percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var(). Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
I wonder if the preempt_disable()/preempt_enable() in here actually
does anything useful on any architecture.
Provides "this is IRQ safe"
?
Post by Ingo Molnar
and "this is preempt safe" semantics.
Of course. But do any architectures actually _need_ that for a single
read?

Maybe. And if so, they can interpose their arch-specific
implementation. But if the generic version is optimal for them, they
wouldn't need to..


Ingo Molnar
2009-01-15 18:40:33 UTC
Permalink
Post by Andrew Morton
Post by Ingo Molnar
Post by Tejun Heo
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * equal char, int or long. percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var(). Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
I wonder if the preempt_disable()/preempt_enable() in here actually
does anything useful on any architecture.
Provides "this is IRQ safe"
?
Post by Ingo Molnar
and "this is preempt safe" semantics.
Of course. But do any architectures actually _need_ that for a single
read?
not for a read i guess - but for the other ops like add/and/or/xor.
Post by Andrew Morton
Maybe. And if so, they can interpose their arch-specific
implementation. But if the generic version is optimal for them, they
wouldn't need to..
we cannot turn the generic ops into a single instruction so arch methods
are preferable no matter how thick or thin the generic version is. But i
agree that the optimization you suggest could be done.

Ingo
Herbert Xu
2009-01-16 00:12:54 UTC
Permalink
Post by Ingo Molnar
Post by Andrew Morton
Of course. But do any architectures actually _need_ that for a single
read?
not for a read i guess - but for the other ops like add/and/or/xor.
One of the things I'd like to see happen with this work is for
us to have a cheap per-cpu atomic counter that we can use for
SNMP stats.

If we can make the inc/add variants into a single instruction,
then it won't need to disable preemption or interrupts.

So if you could design the API such that we have a variant of
add/inc that automatically disables/enables preemption then we
can optimise that away on x86.

Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <***@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Ingo Molnar
2009-01-16 00:17:01 UTC
Permalink
Post by Ingo Molnar
Post by Andrew Morton
Of course. But do any architectures actually _need_ that for a single
read?
not for a read i guess - but for the other ops like add/and/or/xor.
One of the things I'd like to see happen with this work is for us to
have a cheap per-cpu atomic counter that we can use for SNMP stats.
If we can make the inc/add variants into a single instruction, then it
won't need to disable preemption or interrupts.
So if you could design the API such that we have a variant of add/inc
that automatically disables/enables preemption then we can optimise that
away on x86.
Yeah. percpu_add(var, 1) does exactly that on x86.
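
For an SNMP-style counter the call site would then look something like this
(a minimal sketch; "snmp_foo_count" is a made-up name, and non-x86 falls back
to the preempt-safe generic version from the patch above):

	/* assumes <linux/percpu.h> */
	DEFINE_PER_CPU(unsigned long, snmp_foo_count);	/* hypothetical counter */

	static inline void count_foo(void)
	{
		/* a single segment-relative add instruction on x86; no
		 * explicit preempt_disable()/enable() at the call site */
		percpu_add(snmp_foo_count, 1);
	}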

Ingo
Herbert Xu
2009-01-16 00:19:24 UTC
Permalink
Post by Ingo Molnar
So if you could design the API such that we have a variant of add/inc
that automatically disables/enables preemption then we can optimise that
away on x86.
Yeah. percpu_add(var, 1) does exactly that on x86.
Awesome! Sounds like we can finally do away with those nasty
hacks in the SNMP macros.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <***@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Ingo Molnar
2009-01-16 22:10:50 UTC
Permalink
Post by Ingo Molnar
So if you could design the API such that we have a variant of add/inc
that automatically disables/enables preemption then we can optimise that
away on x86.
Yeah. percpu_add(var, 1) does exactly that on x86.
<sigh>. No it doesn't.
What do you mean by "No it doesn't". It does exactly what i claimed it
does.
It's really nice that everyone's excited about this, but it's more
complex than this. Unf. I'm too busy preparing for linux.conf.au to
1) This only works on static per-cpu vars.
- We are working on fixing this, but it's non-trivial for large allocs like
those in networking. Small allocs, we have patches for.
How do difficulties of dynamic percpu-alloc make my above suggestion
unsuitable for SNMP stats in networking? Most of those stats are not
dynamically allocated - they are plain straightforward percpu variables.

Plus the majority of percpu usage is static - just like the majority of
local variables is static, not dynamic. So any percpu-alloc complication
is a non-issue.
2) The generic versions of these as posted by Tejun are unsuitable for
networking; they need to bh_disable. That would make networking less
efficient than it is now for non-x86, and to be generic it would have
to be local_irq_save/restore anyway.
The generic versions will not be used on 95%+ of the active Linux systems
out there, as they run on x86. If you worry about the remaining 5%, those
can be optimized too.
3) local_t was designed to do exactly this: a fast cpu-local counter
implemented optimally for each arch. For sparc64, doing a trivalue version
seems optimal, for s390 atomics, for x86 single-insn, for powerpc
irq_save/restore, etc.
But local_t does not actually solve this problem at all - because one
still has to have per-cpu-ness.
4) Unfortunately, local_t has been extended beyond a simple counter, meaning
it now has more complex requirements (eg. Mathieu wants nmi-safe, even
though that's impossible on sparc and parisc, and percpu_counter wants
local_add_return, which makes trivalue less desirable). These discussions
are on the back burner at the moment, but ongoing.
In reality local_t has almost zero users in the kernel - despite being
with us at least since v2.6.12. That pretty much tells us all about its
utility.

The thing is, local_t without proper percpu integration is a toothless
tiger in the jungle. And our APIs do exactly that kind of integration and
i expect them to be more popular than local_t. There's already a dozen
usage sites of it in arch/x86.

Ingo
Tejun Heo
2009-01-20 06:26:47 UTC
Permalink
The generic versions Tejun posted are not softirq safe, so not
suitable for network counters. To figure out what semantics we
really want, we must audit the users; I'm sorry I haven't
finished that task (and won't until after the conference).
No, they're not. They're preempt safe as mentioned in the comment and
are basically just a generalization of the original x86 versions used by
x86_64 on SMP before pda and percpu areas were merged. I agree that
it's something very close to local_t and it would be nice to see those
somehow unified (and I have patches which make use of local_t in my
queue waiting for dynamic percpu allocation).

Another question to ask is whether to keep using separate interfaces
for static and dynamic percpu variables or migrate to something which
can take both.

Thanks.
--
tejun
Ingo Molnar
2009-01-20 10:37:38 UTC
Permalink
Post by Tejun Heo
The generic versions Tejun posted are not softirq safe, so not
suitable for network counters. To figure out what semantics we
really want, we must audit the users; I'm sorry I haven't
finished that task (and won't until after the conference).
No, they're not. They're preempt safe as mentioned in the comment and
are basically just a generalization of the original x86 versions used by
x86_64 on SMP before pda and percpu areas were merged. I agree that
it's something very close to local_t and it would be nice to see those
somehow unified (and I have patches which make use of local_t in my
queue waiting for dynamic percpu allocation).
Another question to ask is whether to keep using separate interfaces for
static and dynamic percpu variables or migrate to something which can
take both.
Also, there's over 400 PER_CPU variable definitions in the kernel, while
only about 40 dynamic percpu allocation usage sites. (in that i included
the percpu_counter bits used by networking)

So our percpu code usage is on the static side, by a large margin.

Ingo
Tejun Heo
2009-01-27 02:25:27 UTC
Permalink
Hello, Rusty.
No, they're not. They're preempt safe as mentioned in the comment
and are basically just a generalization of the original x86 versions
used by x86_64 on SMP before pda and percpu areas were merged. I
agree that it's something very close to local_t and it would be
nice to see those somehow unified (and I have patches which make
use of local_t in my queue waiting for dynamic percpu allocation).
1) Mine did just read because that covers the most common fast-path use
and is easily atomic for word-sizes on all archs,
2) Didn't replace x86, just #defined generic one, so much less churn,
3) read_percpu_var and read_percpu_ptr variants following the convention
reinforced by my other patches.
Linus' tree had read/write/add/or counts at 22/13/0/0. Yours has
more write usage, so I'm happy there, but still only one add and one
or. If we assume that generic code will look a bit like that when
converted, I'm not convinced that generic and/or/etc ops are worth
it.
There actually were quite a few places where atomic add ops would be
useful, especially the places where statistics are collected. For
logical bitops, I don't think we'll have too many of them.
If they are worth doing generically, should the ops be atomic? To
extrapolate from x86 usages again, it seems to be happy with
non-atomic (tho of course it is atomic on x86).
If atomic rw/add/sub are implementable on most archs (and judging from
local_t, I suppose it is), I think it should. So that it can replace
local_t and we won't need something else again in the future.
Another question to ask is whether to keep using separate
interfaces for static and dynamic percpu variables or migrate to
something which can take both.
Well, IA64 can do stuff with static percpus that it can't do with
dynamic (assuming we get expanding dynamic percpu areas
later). That's because they use TLB tricks for a static 64k per-cpu
area, but this doesn't scale. That might not be vital: abandoning
that trick will mean they can't optimise read_percpu/read_percpu_var
etc as much.
Isn't something like the following possible?

#define pcpu_read(ptr) \
({ \
	if (__builtin_constant_p(ptr) && \
	    ptr >= PCPU_STATIC_START && ptr < PCPU_STATIC_END) \
		do 64k TLB trick for static pcpu; \
	else \
		do generic stuff; \
})
Tejun, any chance of you updating the tj-percpu tree? My current
patches are against Linus's tree, and rebasing them on yours
involves some icky merging.
If Ingo is okay with it, I'm fine with it too. Unless Ingo objects,
I'll do it tomorrow-ish (still big holiday here).

Thanks.
--
tejun
Ingo Molnar
2009-01-27 13:14:03 UTC
Permalink
Post by Tejun Heo
Tejun, any chance of you updating the tj-percpu tree? My current
patches are against Linus's tree, and rebasing them on yours involves
some icky merging.
If Ingo is okay with it, I'm fine with it too. Unless Ingo objects,
I'll do it tomorrow-ish (still big holiday here).
Sure - but please do it as a clean rebase ontop of latest, not as a
conflict-merge, as i'd expect there to be quite many clashes.

Ingo
Tejun Heo
2009-01-27 23:08:06 UTC
Permalink
Post by Ingo Molnar
Post by Tejun Heo
Tejun, any chance of you updating the tj-percpu tree? My current
patches are against Linus's tree, and rebasing them on yours involves
some icky merging.
If Ingo is okay with it, I'm fine with it too. Unless Ingo objects,
I'll do it tomorrow-ish (still big holiday here).
Sure - but please do it as a clean rebase ontop of latest, not as a
conflict-merge, as i'd expect there to be quite many clashes.
Alrighty, re-basing.

Thanks.
--
tejun
Tejun Heo
2009-01-28 03:36:39 UTC
Permalink
Post by Tejun Heo
Post by Ingo Molnar
Post by Tejun Heo
Tejun, any chance of you updating the tj-percpu tree? My current
patches are against Linus's tree, and rebasing them on yours involves
some icky merging.
If Ingo is okay with it, I'm fine with it too. Unless Ingo objects,
I'll do it tomorrow-ish (still big holiday here).
Sure - but please do it as a clean rebase ontop of latest, not as a
conflict-merge, as i'd expect there to be quite many clashes.
Alrighty, re-basing.
Okay, rebased on top of the current master[1].

git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu

The rebased commit ID is e7f538f3d2a6b3a0a632190a7d736730ee10909c.

There were several changes which didn't fit the rebased tree as they
really belong to other branches in the x86 tree. I'll re-submit them
as patches against the appropriate branch.

Thanks.
--
tejun

[1] 5ee810072175042775e39bdd3eaaa68884c27805
Tejun Heo
2009-01-28 08:13:21 UTC
Permalink
Post by Tejun Heo
Okay, rebased on top of the current master[1].
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
The rebased commit ID is e7f538f3d2a6b3a0a632190a7d736730ee10909c.
There were several changes which didn't fit the rebased tree as they
really belong to other branches in the x86 tree. I'll re-submit them
as patches against the appropriate branch.
Two patches against #stackprotector and one against #cpus4096 sent
out. With these three patches, most of stuff has been preserved but
there are still few missing pieces which only make sense after trees
are merged. Will be happy to drive it if the current branch looks
fine.

Thanks.
--
tejun
Christoph Lameter
2009-01-27 21:06:11 UTC
Permalink
Post by Tejun Heo
later). That's because they use TLB tricks for a static 64k per-cpu
area, but this doesn't scale. That might not be vital: abandoning
that trick will mean they can't optimise read_percpu/read_percpu_var
etc as much.
Why won't it scale? This is a separate TLB entry for each processor.
Post by Tejun Heo
Isn't something like the following possible?
#define pcpu_read(ptr) \
({ \
if (__builtin_constant_p(ptr) && \
ptr >= PCPU_STATIC_START && ptr < PCPU_STATIC_END) \
do 64k TLB trick for static pcpu; \
else \
do generic stuff; \
})
The TLB trick is just to access the percpu data at a fixed base. I.e.
value = SHIFT_PERCPU_PTR(percpu_var, FIXED_ADDRESS);

David Miller
2009-01-27 21:48:16 UTC
Permalink
From: Christoph Lameter <***@linux-foundation.org>
Date: Tue, 27 Jan 2009 15:08:57 -0500 (EST)
Post by Christoph Lameter
later). That's because they use TLB tricks for a static 64k per-cpu
area, but this doesn't scale. That might not be vital: abandoning
that trick will mean they can't optimise read_percpu/read_percpu_var
etc as much.
Why won't it scale? This is a separate TLB entry for each processor.
The IA64 per-cpu TLB entry only covers 64k which makes use of it for
dynamic per-cpu stuff out of the question. That's why it "doesn't
scale"
Rick Jones
2009-01-27 22:48:03 UTC
Permalink
Post by David Miller
Date: Tue, 27 Jan 2009 15:08:57 -0500 (EST)
Post by Christoph Lameter
later). That's because they use TLB tricks for a static 64k per-cpu
area, but this doesn't scale. That might not be vital: abandoning
that trick will mean they can't optimise read_percpu/read_percpu_var
etc as much.
Why wont it scale? this is a separate TLB entry for each processor.
The IA64 per-cpu TLB entry only covers 64k which makes use of it for
dynamic per-cpu stuff out of the question. That's why it "doesn't
scale"
I was asking around, and was told that on IA64 *hardware* at least, in addition to
supporting multiple page sizes (up to a GB IIRC), one can pin up to 8 or perhaps
1/2 the TLB entries.

So, in theory if one were so inclined the special pinned per-CPU entry could
either be more than one 64K entry, or a single, rather larger entry.

As a sanity check, I've cc'd linux-ia64.

rick jones
Luck, Tony
2009-01-28 00:18:35 UTC
Permalink
Post by Rick Jones
I was asking around, and was told that on IA64 *harware* at least, in addition to
supporting multiple page sizes (up to a GB IIRC), one can pin up to 8 or perhaps
1/2 the TLB entries.
The number of TLB entries that can be pinned is model specific (but the
minimum allowed is 8 each for code & data). Page sizes supported also
vary by model, recent models go up to 4GB.

BUT ... we stopped pinning this entry in the TLB when benchmarks showed
that it was better to just insert this as a regular TLB entry which might
can be dropped to map something else. So now there is code in the Alt-DTLB
miss handler (ivt.S) to insert the per-cpu mapping on an as needed basis.
Post by Rick Jones
So, in theory if one were so inclined the special pinned per-CPU entry could
either be more than one 64K entry, or a single, rather larger entry.
Managing a larger space could be done ... but at the expense of making
the Alt-DTLB miss handler do a memory lookup to find the physical address
of the per-cpu page needed (assuming that we allocate a bunch of random
physical pages for use as per-cpu space rather than a single contiguous
block of physical memory).

When do we know the total amount of per-cpu memory needed?
1) CONFIG time?
2) Boot time?
3) Arbitrary run time?

-Tony
Luck, Tony
2009-01-28 17:15:51 UTC
Permalink
Post by Luck, Tony
Managing a larger space could be done ... but at the expense of making
the Alt-DTLB miss handler do a memory lookup to find the physical address
of the per-cpu page needed (assuming that we allocate a bunch of random
physical pages for use as per-cpu space rather than a single contiguous
block of physical memory).
We cannot resize the area by using a single larger TLB entry?
Yes we could ... the catch is that the supported TLB page sizes go
up by multiples of 4. So if 64K is not enough the next stop is
at 256K, then 1M, then 4M, and so on.

That's why I asked when we know what total amount of per-cpu memory
is needed. CONFIG time or boot time is good, because allocating higher
order pages is easy at boot time. Arbitrary run time is bad because
we might have difficulty allocating a high order page for every cpu.

-Tony
Christoph Lameter
2009-01-28 17:58:37 UTC
Permalink
Post by Luck, Tony
Managing a larger space could be done ... but at the expense of making
the Alt-DTLB miss handler do a memory lookup to find the physical address
of the per-cpu page needed (assuming that we allocate a bunch of random
physical pages for use as per-cpu space rather than a single contiguous
block of physical memory).
We cannot resize the area by using a single larger TLB entry?
Post by Luck, Tony
When do we know the total amount of per-cpu memory needed?
1) CONFIG time?
Would be easiest.
Post by Luck, Tony
2) Boot time?
We could make the TLB entry size configurable with a kernel parameter
Post by Luck, Tony
3) Arbitrary run time?
We could reserve a larger virtual space and switch TLB entries as needed?
We would need to get larger order pages to do this.

Christoph Lameter
2009-01-28 17:58:21 UTC
Permalink
Post by David Miller
Post by Christoph Lameter
Why won't it scale? This is a separate TLB entry for each processor.
The IA64 per-cpu TLB entry only covers 64k which makes use of it for
dynamic per-cpu stuff out of the question. That's why it "doesn't
scale"
IA64 supports varying page sizes. You can use a 128k TLB entry etc.

Rusty Russell
2009-01-28 10:39:27 UTC
Permalink
Post by Tejun Heo
Hello, Rusty.
Hi Tejun!
Post by Tejun Heo
There actually were quite some places where atomic add ops would be
useful, especially the places where statistics are collected. For
logical bitops, I don't think we'll have too many of them.
If the stats are only manipulated in one context, then an atomic requirement is overkill (and expensive on non-x86).
Post by Tejun Heo
If they are worth doing generically, should the ops be atomic? To
extrapolate from x86 usages again, it seems to be happy with
non-atomic (tho of course it is atomic on x86).
If atomic rw/add/sub are implementible on most archs (and judging from
local_t, I suppose it is), I think it should. So that it can replace
local_t and we won't need something else again in the future.
This is more like Christoph's CPU_OPS: they were special operators on normal per-cpu vars/ptrs. Generic version was irqsave+op+irqrestore.

I actually like this idea, but Mathieu insists that the ops be NMI-safe, for ftrace. Hence local_t needing to be atomic_t for generic code.

AFAICT we'll need a hybrid: HAVE_NMISAFE_CPUOPS, and if not, use atomic_t
in ftrace (which isn't NMI safe on parisc or sparc/32 anyway, but I don't think we care).

Other than the shouting, I liked Christoph's system (a rough generic sketch follows the list):
- CPU_INC = always safe (eg. local_irq_save/per_cpu(i)++/local_irq_restore)
- _CPU_INC = not safe against interrupts (eg. get_cpu/per_cpu(i)++/put_cpu)
- __CPU_INC = not safe against anything (eg. per_cpu(i)++)
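
A rough sketch of what the three generic fallbacks could look like
(illustrative only, not Christoph's actual patch; assumes <linux/percpu.h>
and a DEFINE_PER_CPU counter):

	/* CPU_INC: usable from any context; generic fallback masks interrupts */
	#define CPU_INC(var)				\
	do {						\
		unsigned long flags__;			\
		local_irq_save(flags__);		\
		__get_cpu_var(var)++;			\
		local_irq_restore(flags__);		\
	} while (0)

	/* _CPU_INC: safe against preemption, not against interrupts */
	#define _CPU_INC(var)				\
	do {						\
		get_cpu_var(var)++;			\
		put_cpu_var(var);			\
	} while (0)

	/* __CPU_INC: caller provides whatever protection is needed */
	#define __CPU_INC(var)	(__get_cpu_var(var)++)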

I prefer the name 'local' to the name 'cpu', but I'm not hugely fussed.
Post by Tejun Heo
Post by Tejun Heo
Another question to ask is whether to keep using separate
interfaces for static and dynamic percpu variables or migrate to
something which can take both.
Well, IA64 can do stuff with static percpus that it can't do with
dynamic (assuming we get expanding dynamic percpu areas
later). That's because they use TLB tricks for a static 64k per-cpu
area, but this doesn't scale. That might not be vital: abandoning
that trick will mean they can't optimise read_percpu/read_percpu_var
etc as much.
Isn't something like the following possible?
#define pcpu_read(ptr) \
({ \
if (__builtin_constant_p(ptr) && \
ptr >= PCPU_STATIC_START && ptr < PCPU_STATIC_END) \
do 64k TLB trick for static pcpu; \
else \
do generic stuff; \
})
No, that will be "do generic stuff", since it's a link-time constant. I don't know that this is a huge worry, to be honest. We can leave the __ia64_per_cpu_var for their arch-specific code (I feel the same way about x86 to be honest).
Post by Tejun Heo
Tejun, any chance of you updating the tj-percpu tree? My current
patches are against Linus's tree, and rebasing them on yours
involves some icky merging.
If Ingo is okay with it, I'm fine with it too. Unless Ingo objects,
I'll do it tomorrow-ish (still big holiday here).
Ah, I did not realize that you celebrated Australia day :)

Cheers!
Rusty.
Tejun Heo
2009-01-28 10:57:31 UTC
Permalink
Hello,
Post by Rusty Russell
Post by Tejun Heo
Hello, Rusty.
Hi Tejun!
Post by Tejun Heo
There actually were quite a few places where atomic add ops would be
useful, especially the places where statistics are collected. For
logical bitops, I don't think we'll have too many of them.
If the stats are only manipulated in one context, then an atomic
requirement is overkill (and expensive on non-x86).
Yes, it is. I was hoping it would not be more expensive on most archs.
It isn't on x86 at the very least but I don't know much about other
archs.
Post by Rusty Russell
Post by Tejun Heo
If they are worth doing generically, should the ops be atomic? To
extrapolate from x86 usages again, it seems to be happy with
non-atomic (tho of course it is atomic on x86).
If atomic rw/add/sub are implementable on most archs (and judging from
local_t, I suppose it is), I think it should. So that it can replace
local_t and we won't need something else again in the future.
This is more like Christoph's CPU_OPS: they were special operators
on normal per-cpu vars/ptrs. Generic version was
irqsave+op+irqrestore.
I actually like this idea, but Mathieu insists that the ops be
NMI-safe, for ftrace. Hence local_t needing to be atomic_t for
generic code.
AFAICT we'll need a hybrid: HAVE_NMISAFE_CPUOPS, and if not, use
atomic_t in ftrace (which isn't NMI safe on parisc or sparc/32
anyway, but I don't think we care).
Requiring NMI-safeness is quite an exception, I suppose. I don't
think we should design around it. If it can be worked around one way
or the other, it should be fine.
Post by Rusty Russell
- CPU_INC = always safe (eg. local_irq_save/per_cpu(i)++/local_irq_restore)
- _CPU_INC = not safe against interrupts (eg. get_cpu/per_cpu(i)++/put_cpu)
- __CPU_INC = not safe against anything (eg. per_cpu(i)++)
I prefer the name 'local' to the name 'cpu', but I'm not hugely fussed.
I like local better too but no biggies one way or the other.
Post by Rusty Russell
Post by Tejun Heo
Post by Tejun Heo
Another question to ask is whether to keep using separate
interfaces for static and dynamic percpu variables or migrate to
something which can take both.
Well, IA64 can do stuff with static percpus that it can't do with
dynamic (assuming we get expanding dynamic percpu areas
later). That's because they use TLB tricks for a static 64k per-cpu
area, but this doesn't scale. That might not be vital: abandoning
that trick will mean they can't optimise read_percpu/read_percpu_var
etc as much.
Isn't something like the following possible?
#define pcpu_read(ptr) \
({ \
if (__builtin_constant_p(ptr) && \
ptr >= PCPU_STATIC_START && ptr < PCPU_STATIC_END) \
do 64k TLB trick for static pcpu; \
else \
do generic stuff; \
})
No, that will be "do generic stuff", since it's a link-time
constant. I don't know that this is a huge worry, to be honest. We
can leave the __ia64_per_cpu_var for their arch-specific code (I
feel the same way about x86 to be honest).
Yes, right. Got confused there. Hmmm... looks like what would work
there is an "is it an lvalue?" test. Well, anyway, if it isn't
necessary.
Post by Rusty Russell
Post by Tejun Heo
Tejun, any chance of you updating the tj-percpu tree? My current
patches are against Linus's tree, and rebasing them on yours
involves some icky merging.
If Ingo is okay with it, I'm fine with it too. Unless Ingo objects,
I'll do it tomorrow-ish (still big holiday here).
Ah, I did not realize that you celebrated Australia day :)
Hey, didn't know Australia was founded on lunar New Year's day.
Nice. :-)

Thanks.
--
tejun
Christoph Lameter
2009-01-28 17:28:19 UTC
Permalink
Post by Rusty Russell
AFAICT we'll need a hybrid: HAVE_NMISAFE_CPUOPS, and if not, use atomic_t
in ftrace (which isn't NMI safe on parisc or sparc/32 anyway, but I don't think we care).
Right.
Post by Rusty Russell
- CPU_INC = always safe (eg. local_irq_save/per_cpu(i)++/local_irq_restore)
- _CPU_INC = not safe against interrupts (eg. get_cpu/per_cpu(i)++/put_cpu)
- __CPU_INC = not safe against anything (eg. per_cpu(i)++)
I prefer the name 'local' to the name 'cpu', but I'm not hugely fussed.
The term cpu means multiple things at this point. So yes, it may be
better to go with the glibc naming of thread-local space.

Mathieu Desnoyers
2009-01-28 18:13:31 UTC
Permalink
Post by Christoph Lameter
Post by Rusty Russell
AFAICT we'll need a hybrid: HAVE_NMISAFE_CPUOPS, and if not, use atomic_t
in ftrace (which isn't NMI safe on parisc or sparc/32 anyway, but I don't think we care).
Right.
Ideally this should be done transparently so we don't have to #ifdef
code around HAVE_NMISAFE_CPUOPS everywhere in the tracer. We might
consider declaring an intermediate type with this kind of #ifdef in the
tracer code if we are the only user, though.
Post by Christoph Lameter
Post by Rusty Russell
- CPU_INC = always safe (eg. local_irq_save/per_cpu(i)++/local_irq_restore)
- _CPU_INC = not safe against interrupts (eg. get_cpu/per_cpu(i)++/put_cpu)
- __CPU_INC = not safe against anything (eg. per_cpu(i)++)
I prefer the name 'local' to the name 'cpu', but I'm not hugely fussed.
The term cpu is meaning multiple things at this point. So yes it may be
better to go with glibc naming of thread local space.
However using "local" for "per-cpu" could be confusing with the glibc
naming of thread local space, because "per-thread" and "per-cpu"
variables are different from a synchronization POV and we can end up
needing both (e.g. a thread local variable can never be accessed by
another thread, but a cpu local variable could be accessed by a
different CPU due to scheduling).

I'm currently thinking about implementing user-space per-cpu tracing buffers,
and the last thing I'd like is to have a "local" semantic clash between
the kernel and glibc. Ideally, we could have something like :

Atomic safe primitives (emulated with irq disable if the architecture
does not have atomic primitives) :
- atomic_thread_inc()
* current mainline local_t local_inc().
- atomic_cpu_inc()
* Your proposed CPU_INC.

Non-safe against interrupts, but safe against preemption :
- thread_inc()
* no preempt_disable involved, because this deals with per-thread
variables :
* Simple var++
- cpu_inc()
* disables preemption, per_cpu(i)++, enables preemption

Not safe against preemption nor interrupts :
- _thread_inc()
* maps to thread_inc()
- _cpu_inc()
* simple per_cpu(i)++

So maybe we don't really need thread_inc(), _thread_inc() and _cpu_inc,
because they map to standard C operations, but we may find that on some
architectures the atomic_cpu_inc() is faster than the per_cpu(i)++,
and in those cases it would make sense to map e.g. _cpu_inc() to
atomic_cpu_inc().

Also note that I don't think adding _ and __ prefixes to the operations
makes it clear to the programmer and reviewer what is safe against
what. IMHO, it will just make the code more obscure. One level of
underscore is about the limit I think people can "know" what this
"unsafe" version of the primitive does.

Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Ingo Molnar
2009-01-20 10:41:25 UTC
Permalink
Post by Ingo Molnar
How do difficulties of dynamic percpu-alloc make my above suggestion
unsuitable for SNMP stats in networking? Most of those stats are not
dynamically allocated - they are plain straightforward percpu variables.
No they're not. [...]
hm, i see this got changed recently - part of the networking stats went
over to the lib/percpu_counter API recently.

The larger point still remains: the kernel dominantly uses static percpu
variables by a margin of 10 to 1, so we cannot just brush away the static
percpu variables and must concentrate on optimizing that side with
priority. It's nice if the dynamic percpu-alloc side improves as well, of
course.

Ingo
Tejun Heo
2009-01-21 05:53:50 UTC
Permalink
Post by Ingo Molnar
The larger point still remains: the kernel dominantly uses static percpu
variables by a margin of 10 to 1, so we cannot just brush away the static
percpu variables and must concentrate on optimizing that side with
priority. It's nice if the dynamic percpu-alloc side improves as well, of
course.
Well, the infrequent usage of dynamic percpu allocation is in some
part due to the poor implementation, so it's sort of a chicken-and-egg
problem. I got into this percpu thing because I wanted a percpu
reference count which can be dynamically allocated and it sucked.

Thanks.
--
tejun
Ingo Molnar
2009-01-21 10:06:41 UTC
Permalink
Post by Ingo Molnar
The larger point still remains: the kernel dominantly uses static percpu
variables by a margin of 10 to 1, so we cannot just brush away the static
percpu variables and must concentrate on optimizing that side with
priority. It's nice if the dynamic percpu-alloc side improves as well, of
course.
Well, the infrequent usage of dynamic percpu allocation is in some part
due to the poor implementation, so it's sort of a chicken-and-egg problem.
I got into this percpu thing because I wanted a percpu reference count
which can be dynamically allocated and it sucked.
Sure, but even static percpu sucked very much (it expanded to half a dozen
or more instructions), and dynamic is _more_ complex. Anyway, it's getting
fixed now :-)

Ingo
Eric W. Biederman
2009-01-21 11:22:08 UTC
Permalink
Post by Tejun Heo
Post by Ingo Molnar
The larger point still remains: the kernel dominantly uses static percpu
variables by a margin of 10 to 1, so we cannot just brush away the static
percpu variables and must concentrate on optimizing that side with
priority. It's nice if the dynamic percpu-alloc side improves as well, of
course.
Well, the infrequent usage of dynamic percpu allocation is in some
part due to the poor implementation, so it's sort of chicken and egg
problem. I got into this percpu thing because I wanted a percpu
reference count which can be dynamically allocated and it sucked.
Counters are our other special case, and counters are interesting
because they are individually very small. I just looked and the vast
majority of the alloc_percpu users are counters.

I just did a rough count in include/linux/snmp.h and I came
up with 171*2 counters. At 8 bytes per counter that is roughly
2.7K. Or about two thirds of a 4K page.

What makes this is a challenge is that those counters are per network
namespace, and there are no static limits on the number of network
namespaces.

If we push the system and allocate 1024 network namespaces we wind up
needing 2.7MB per cpu, just for the SNMP counters.

Which nicely illustrates the principle that typically each individual
per cpu allocation is small, but with dynamic allocation we have the
challenge that the number of allocations becomes unbounded and in some cases
could be large, while the typical per cpu size is likely to be very small.

I wonder if for the specific case of counters it might make sense to
simply optimize the per cpu allocator for machine word size allocations
and allocate each counter individually freeing us from the burden of
worrying about fragmentation.


The pain with the current alloc_percpu implementation when working
with counters is that it has to allocate an array with one entry
for each cpu to point at the per cpu data. Which isn't especially efficient.
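
Roughly, the bookkeeping has this shape (a simplified sketch of the
2.6.28-era allocator from memory, not the literal source):

	/* one pointer per possible cpu just to locate the real objects */
	struct percpu_data {
		void *ptrs[NR_CPUS];
	};

	/* each ptrs[cpu] is a separate, cacheline-padded allocation, so an
	 * 8-byte counter costs a pointer array plus NR_CPUS small objects */
	#define sketch_old_per_cpu_ptr(pdata, cpu) \
		(((struct percpu_data *)(pdata))->ptrs[(cpu)])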


Eric



Stephen Hemminger
2009-01-21 12:46:22 UTC
Permalink
On Wed, 21 Jan 2009 03:21:23 -0800
Post by Eric W. Biederman
Post by Tejun Heo
Post by Ingo Molnar
The larger point still remains: the kernel dominantly uses static percpu
variables by a margin of 10 to 1, so we cannot just brush away the static
percpu variables and must concentrate on optimizing that side with
priority. It's nice if the dynamic percpu-alloc side improves as well, of
course.
Well, the infrequent usage of dynamic percpu allocation is in some
part due to the poor implementation, so it's sort of chicken and egg
problem. I got into this percpu thing because I wanted a percpu
reference count which can be dynamically allocated and it sucked.
Counters are our other special case, and counters are interesting
because they are individually very small. I just looked and the vast
majority of the alloc_percpu users are counters.
I just did a rough count in include/linux/snmp.h and I came
up with 171*2 counters. At 8 bytes per counter that is roughly
2.7K. Or about two thirds of a 4K page.
This is crap. only a small fraction of these SNMP counters are
close enough to the hot path to deserve per-cpu treatment.
Eric W. Biederman
2009-01-21 14:14:15 UTC
Permalink
Post by Stephen Hemminger
This is crap. only a small fraction of these SNMP counters are
close enough to the hot path to deserve per-cpu treatment.
Interesting. I was just reading af_inet.c:ipv4_mib_init_net()
to get a feel for what a large consumer of percpu counters looks
like.

I expect the patterns I saw will hold even if the base size is much
smaller.


Your statement reinforces my point that our allocations in a per cpu
area are small and our dynamic per cpu allocator is not really
optimized for small allocations.

In most places what we want are 1-5 counters, which is max 40 bytes.
And our minimum allocation is a full cache line (64-128 bytes) per
allocation.



I'm wondering if dynamic per cpu allocations should work more like
slab. Have a set of percpu base pointers for an area, and return an
area + offset in some convenient form.

Ideally we would just have one area (allowing us to keep the base
pointer in a register), but I'm not convinced that doesn't fall down
in the face of dynamic allocation.
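
Very roughly, the "area + offset" idea could look like this (hypothetical
names, just to show the shape):

	/* hypothetical: one per-cpu area base per possible cpu */
	void *pcpu_base_addr[NR_CPUS];

	/* a dynamically allocated percpu object is identified by an offset
	 * that is valid in every cpu's area, so no per-object pointer array
	 * needs to be allocated or chased */
	static inline void *sketch_per_cpu_ptr(unsigned long offset, int cpu)
	{
		return (char *)pcpu_base_addr[cpu] + offset;
	}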

Eric

David Miller
2009-01-21 20:34:34 UTC
Permalink
From: Stephen Hemminger <***@vyatta.com>
Date: Wed, 21 Jan 2009 23:45:01 +1100
Post by Stephen Hemminger
This is crap. Only a small fraction of these SNMP counters are
close enough to the hot path to deserve per-cpu treatment.
Only if your router/firewall/webserver isn't hitting the code paths
that bump the counters you think aren't hot path.

It's a micro-DoS waiting to happen if we start trying to split
counters up into groups which matter for hot path processing
(and thus use per-cpu stuff) and those which don't (and thus
use atomics or whatever your idea is).
H. Peter Anvin
2009-01-16 01:10:29 UTC
Permalink
Post by Ingo Molnar
So if you could design the API such that we have a variant of add/inc
that automatically disables/enables preemption then we can optimise that
away on x86.
Yeah. percpu_add(var, 1) does exactly that on x86.
And even on architectures which can only do percpu inc/dec we can make
the right thing happen for a constant here, so we really don't need a
new interface. This is a good thing.
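
On x86 that works because the per-cpu base is carried in a segment
register, so the whole operation is one instruction and needs no
preempt_disable() bracket. Schematically (simplified from the x86
percpu_to_op() machinery in this series; per_cpu__var stands for the
variable's linker symbol):

        /* percpu_add(var, 1) ends up as roughly: */
        asm("addq %1, %%gs:%P0"
            : "+m" (per_cpu__var)
            : "re" (1UL));

A single memory add cannot be split by preemption or an interrupt on the
local cpu, and an architecture that only has inc/dec can still pick the
right instruction when the constant is 1.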

-hpa
Ingo Molnar
2009-01-16 22:00:50 UTC
Permalink
Post by Herbert Xu
Post by Ingo Molnar
Post by Andrew Morton
Of course. But do any architectures actually _need_ that for a single
read?
not for a read i guess - but for the other ops like add/and/or/xor.
One of the things I'd like to see happen with this work is for
us to have a cheap per-cpu atomic counter that we can use for
SNMP stats.
If we can make the inc/add variants into a single instruction,
then it won't need to disable preemption or interrupts.
So if you could design the API such that we have a variant of
add/inc that automatically disables/enables preemption then we
can optimise that away on x86.
Yep, already on it. It's called local_t; that's what it was originally
designed for.
Unfortunately, to use it efficiently, we need large per-cpu areas.
Do you mean constructs like:

local_inc(&__get_cpu_var(var));

?

If yes, then I think you are missing the point here.

Yes, local_t can be useful when something is in an object and we know only
a local IRQ context can update it and we don't want to disable irqs or use
heavy atomics.

But percpu_read()/write()/add()/sub() ops are about optimizing _percpu_
variables. local_t alone does not solve that problem - because to use
local_t as a percpu variable you have to get to the address of that
variable - and that alone is not well optimized.

Or do you propose some new API that would allow that?
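
Concretely, the difference at issue is roughly this (schematic, with a
hypothetical DEFINE_PER_CPU counter; the generated code is illustrative,
not exact):

        /* local_t route: first materialize this cpu's address, then operate */
        local_inc(&__get_cpu_var(nr_events));
        /* ~ load the per-cpu offset, form the address, then inc (%reg) */

        /* percpu op route: the address computation folds into the operand */
        percpu_add(nr_events, 1);
        /* ~ addq $1, %gs:per_cpu__nr_events */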

Ingo
Ingo Molnar
2009-01-16 22:11:27 UTC
Permalink
Post by Herbert Xu
Post by Ingo Molnar
Post by Andrew Morton
Of course. But do any architectures actually _need_ that for a single
read?
not for a read i guess - but for the other ops like add/and/or/xor.
One of the things I'd like to see happen with this work is for
us to have a cheap per-cpu atomic counter that we can use for
SNMP stats.
If we can make the inc/add variants into a single instruction, then it
won't need to disable preemption or interrupts.
So if you could design the API such that we have a variant of add/inc
that automatically disables/enables preemption then we can optimise
that away on x86.
Yep, already on it. It's called local_t; that's what it was originally
designed for.
Unfortunately, to use it efficiently, we need large per-cpu areas.
This makes no sense to me at all. Care to explain?

Ingo
Tejun Heo
2009-01-15 21:54:27 UTC
Permalink
Post by Ingo Molnar
Post by Andrew Morton
Post by Ingo Molnar
and "this is preempt safe" semantics.
Of course. But do any architectures actually _need_ that for a single
read?
not for a read i guess - but for the other ops like add/and/or/xor.
FWIW, it prevents cross-cpu cacheline contamination. :-)
Post by Ingo Molnar
Post by Andrew Morton
Maybe. And if so, they can interpose their arch-specific
implementation. But if the generic version is optimal for them, they
wouldn't need to..
we cannot turn the generic ops into a single instruction so arch methods
are preferable no matter how thick or thin the generic version is. But i
agree that the optimization you suggest could be done.
I think the preemption protection is good to have there for, if
nothing else, documentation.

Thanks.
--
tejun
Mark Lord
2009-01-16 14:10:59 UTC
Permalink
.
Post by Ingo Molnar
I wonder if the preempt_disable()/preempt_enable() in here actually
does anything useful on any architecture.
Provides "this is IRQ safe"
?
Post by Ingo Molnar
and "this is preempt safe" semantics.
Of course. But do any architectures actually _need_ that for a single read?
.

If the target is unaligned, then RISC architectures will need protection there.
If we can guarantee correct memory alignment of the target, then no / none.

Cheers
Andrew Morton
2009-01-15 17:30:56 UTC
Permalink
Post by Tejun Heo
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
__typeof__(type) per_cpu_var(name)
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it and its size should
+ * equal char, int or long. percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var(). Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
I wonder if the preempt_disable()/preempt_enable() in here actually
does anything useful on any architecture.
Post by Tejun Heo
+#define __percpu_generic_to_op(var, val, op) \
+do { \
+ get_cpu_var(var) op val; \
+ put_cpu_var(var); \
+} while (0)
We'll need it here.
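
The op case is a read-modify-write, which is why: get_cpu_var() and
put_cpu_var() essentially wrap it as below, and without the
preempt_disable() window a task could be migrated between the load and
the store and corrupt another cpu's copy (expansion sketch, not the
literal generic code):

#define __percpu_generic_to_op(var, val, op)                            \
do {                                                                    \
        preempt_disable();              /* from get_cpu_var() */        \
        __get_cpu_var(var) op val;      /* RMW of this cpu's copy */    \
        preempt_enable();               /* from put_cpu_var() */        \
} while (0)

A plain read has no such window: whichever cpu's copy the load hits, the
result is a coherent value, which is what the question above is getting at.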
Ingo Molnar
2009-01-15 18:47:47 UTC
Permalink
Post by Ingo Molnar
It is an optimization and a cleanup, and adds the following new
percpu_read()
percpu_write()
percpu_add()
percpu_sub()
percpu_and()
percpu_or()
percpu_xor()
On second thought ... I just started using this construct in code, and
promptly typoed it: I typed "per_cpu_read()".

Which is really the more logical name for this. Mind if I do a
s/percpu/per_cpu/ rename of all of these new APIs?

Ingo
roel kluin
2009-01-15 14:07:53 UTC
Permalink
Post by Ingo Molnar
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
I'm sorry for your eyes, but since var occurs twice, isn't it better to do:

# define percpu_read(var) \
({ \
typeof(var) __pcpu_read_var__ = var; \
typeof(per_cpu_var(__pcpu_read_var__)) __tmp_var__; \
__tmp_var__ = get_cpu_var(__pcpu_read_var__); \
put_cpu_var(__pcpu_read_var__); \
__tmp_var__; \
})
Post by Ingo Molnar
+
+#define __percpu_generic_to_op(var, val, op) \
+do { \
+ get_cpu_var(var) op val; \
+ put_cpu_var(var); \
+} while (0)
and:

#define __percpu_generic_to_op(var, val, op) \
do { \
typeof(var) __pcpu_gto_var__ = var; \
get_cpu_var(__pcpu_gto_var__) op val; \
put_cpu_var(__pcpu_gto_var__); \
} while (0)
Tejun Heo
2009-01-15 21:52:07 UTC
Permalink
Hello, Roel.
Post by Ingo Molnar
Post by Ingo Molnar
+#ifndef percpu_read
+# define percpu_read(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp_var__; \
+ __tmp_var__ = get_cpu_var(var); \
+ put_cpu_var(var); \
+ __tmp_var__; \
+ })
+#endif
# define percpu_read(var) \
({ \
typeof(var) __pcpu_read_var__ = var; \
typeof(per_cpu_var(__pcpu_read_var__)) __tmp_var__; \
__tmp_var__ = get_cpu_var(__pcpu_read_var__); \
put_cpu_var(__pcpu_read_var__); \
__tmp_var__; \
})
Post by Ingo Molnar
+
+#define __percpu_generic_to_op(var, val, op) \
+do { \
+ get_cpu_var(var) op val; \
+ put_cpu_var(var); \
+} while (0)
#define __percpu_generic_to_op(var, val, op) \
do { \
typeof(var) __pcpu_gto_var__ = var; \
get_cpu_var(__pcpu_gto_var__) op val; \
put_cpu_var(__pcpu_gto_var__); \
} while (0)
@var has to be a simple identifier as it ends up getting concatenated
into the per-cpu symbol name. There's even a check for it in the
get_cpu_var() macro. Please also note the lack of any protecting ()'s
around @var, for the same reason. So, basically, typeof(var) just
doesn't work here.
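
The constraint comes from the token pasting in the generic headers,
roughly:

/* DEFINE_PER_CPU(int, foo) creates a symbol named per_cpu__foo ... */
#define per_cpu_var(var)        per_cpu__##var

/* ... so percpu_read(foo) must pass the bare token "foo".  There is no
 * object called plain "foo" for typeof() to look at, and wrapping var
 * in parentheses would break the concatenation. */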

Thanks.
--
tejun
Ingo Molnar
2009-01-16 21:25:25 UTC
Permalink
Post by Tejun Heo
Hello,
The new ops are a pretty nice and clean solution i think.
Firstly, accessing the current CPU is the only safe shortcut anyway (there
is where we can do %fs/%gs / rip-relative addressing modes), and the
generic per_cpu() APIs dont really provide that guarantee for us. We might
be able to hook into __get_cpu_var() but those both require to be an
lvalue and are also relatively rarely used.
So introducing the new, rather straightforward APIs and using them
wherever they matter for performance is good. Your patchset already shaved
off an instruction from ordinary per_cpu() accesses, so it's all moving
rather close to the most-optimal situation already.
Yeah, I don't think we can do much better than those ops. I have two
issues tho.
1. percpu_and() is missing. I added it for completeness's sake.
2. The generic percpu_op() should be local to the cpu, so it should
expand to...
do { get_cpu_var(var) OP (val); put_cpu_var(var) } while (0)
as the original x86_OP_percpu() did. Right?
Thanks.
No no no no. This is a crapload of infrastructure noone will use.
Please just start by adding read_percpu like so (this won't apply since
there's lots of other per-cpu things going on, but you get the idea).
We don't need a whole set of operators for a handful of
non-arch-specific cases. Reading a var is fairly common, the other ops
are diminishing returns and we already have local_t for some of these
cases (and we're reviewing that, too).
Actually, the percpu_add()/sub() ops are useful for statistics (they can
be done without preempt disable/enable). percpu_write() is also obviously
useful. The others are common arithmetic operators, included for
completeness of the API.
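
For example, a statistics counter could look like the following
(hypothetical names; the point is that the fast path needs no preempt or
irq bracket on an arch where percpu_add() is a single instruction, while
the generic fallback is still preempt-safe):

DEFINE_PER_CPU(unsigned long, rx_packets);      /* hypothetical counter */

static void count_rx_packet(void)
{
        percpu_add(rx_packets, 1);
}

static unsigned long total_rx_packets(void)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += per_cpu(rx_packets, cpu);        /* readers tolerate slight skew */
        return sum;
}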

Ingo
Tejun Heo
2009-01-15 10:29:04 UTC
Permalink
Post by Ingo Molnar
percpu_read()
I already have this in my patch series.
Frankly, it starts to get marginal in generic code after percpu_read(),
so I didn't do them (and the per-cpu interfaces are already pretty wide).
Tejun, is now a good time to rebase my alloc_percpu series on top of
yours? I'm more than happy to hand them over to someone with more cycles...
Sure, send them my way.

Thanks.
--
tejun