Discussion:
[RFC][PATCH 07/13] sched: Reorder task_struct
Peter Zijlstra
2015-06-22 12:24:44 UTC
Fill some 4-byte holes by slightly reordering some variables.

Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/sched.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1363,17 +1363,16 @@ struct task_struct {
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
+ int on_rq;

#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
+ int wake_cpu;
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts;
-
- int wake_cpu;
#endif
- int on_rq;

int prio, static_prio, normal_prio;
unsigned int rt_priority;
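
For context, a stand-alone illustration (not kernel code; sizes assume an LP64 target such as x86-64) of why grouping same-sized members fills these holes: a 4-byte member sandwiched between 8-byte ones costs 4 bytes of padding, and moving it next to another 4-byte member gets that space back.

#include <stdio.h>

struct before {			/* 4-byte holes after b and after d */
	long a;			/* offset  0, 8 bytes */
	int  b;			/* offset  8, 4 bytes + 4 bytes padding */
	long c;			/* offset 16, 8 bytes */
	int  d;			/* offset 24, 4 bytes + 4 bytes padding */
};				/* sizeof == 32 */

struct after {			/* same members, reordered, no holes */
	long a;			/* offset  0 */
	long c;			/* offset  8 */
	int  b;			/* offset 16 */
	int  d;			/* offset 20 */
};				/* sizeof == 24 */

int main(void)
{
	printf("%zu %zu\n", sizeof(struct before), sizeof(struct after));
	return 0;
}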


--
Peter Zijlstra
2015-06-22 12:24:54 UTC
Provide a static initializer, DEFINE_STATIC_PERCPU_RWSEM(), for percpu_rw_semaphore, along with a lockdep assertion helper.

Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/percpu-rwsem.h | 13 +++++++++++++
1 file changed, 13 insertions(+)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,19 @@ struct percpu_rw_semaphore {
struct rw_semaphore rw_sem;
};

+#define DEFINE_STATIC_PERCPU_RWSEM(name) \
+static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name); \
+static struct percpu_rw_semaphore name = { \
+ .refcount = &__percpu_rwsem_refcount_##name, \
+ .state = 0, \
+ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
+ .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
+ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
+}
+
+#define lockdep_assert_held_percpu_rwsem(sem) \
+ lockdep_assert_held(&(sem)->rw_sem)
+
extern void __percpu_down_read(struct percpu_rw_semaphore *);
extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
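
A minimal usage sketch (not part of the patch; the semaphore name and the surrounding functions are made up) of what the static initializer buys: a file-scope percpu-rwsem that needs no runtime percpu_init_rwsem() call, plus the new lockdep assertion on the write side.

#include <linux/percpu-rwsem.h>

DEFINE_STATIC_PERCPU_RWSEM(my_rwsem);		/* hypothetical lock */

static void my_read_side(void)
{
	percpu_down_read(&my_rwsem);
	/* read-side critical section */
	percpu_up_read(&my_rwsem);
}

static void my_write_side(void)
{
	percpu_down_write(&my_rwsem);
	lockdep_assert_held_percpu_rwsem(&my_rwsem);	/* new helper above */
	/* write-side critical section */
	percpu_up_write(&my_rwsem);
}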


--
Peter Zijlstra
2015-06-22 12:25:14 UTC
It would be nice to validate that the caller of rcu_sync_is_idle()
holds the corresponding type of RCU read-side lock. Add the new
rcu_sync_ops->held() method and change rcu_sync_is_idle() to
WARN() if it returns false.

This obviously penalizes the readers (fast-path), but only if
CONFIG_PROVE_RCU.

Reviewed-by: Paul E. McKenney <***@linux.vnet.ibm.com>
Suggested-by: "Paul E. McKenney" <***@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/rcusync.h | 6 ++++++
kernel/rcu/sync.c | 21 +++++++++++++++++++++
2 files changed, 27 insertions(+)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -17,9 +17,15 @@ struct rcu_sync_struct {
enum rcu_sync_type gp_type;
};

+extern bool __rcu_sync_is_idle(struct rcu_sync_struct *);
+
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
+#ifdef CONFIG_PROVE_RCU
+ return __rcu_sync_is_idle(rss);
+#else
return !rss->gp_state; /* GP_IDLE */
+#endif
}

extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,21 +1,33 @@
#include <linux/rcusync.h>
#include <linux/sched.h>

+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func) .held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+#ifdef CONFIG_PROVE_RCU
+ int (*held)(void);
+#endif
} gp_ops[] = {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
+ __INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
+ __INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
+ __INIT_HELD(rcu_read_lock_bh_held)
},
};

@@ -24,6 +36,15 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLA

#define rss_lock gp_wait.lock

+#ifdef CONFIG_PROVE_RCU
+bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+ WARN_ON(!gp_ops[rss->gp_type].held());
+ return rss->gp_state == GP_IDLE;
+}
+EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
+#endif
+
void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
{
memset(rss, 0, sizeof(*rss));
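
A hedged illustration (mine, not from the patch) of what the new check catches when CONFIG_PROVE_RCU=y; the rss argument is assumed to be an RCU_SCHED_SYNC instance:

static bool check_idle(struct rcu_sync_struct *rss)
{
	bool idle;

	rcu_read_lock_sched();
	idle = rcu_sync_is_idle(rss);	/* ok: rcu_read_lock_sched_held() is true */
	rcu_read_unlock_sched();

	idle = rcu_sync_is_idle(rss);	/* triggers the new WARN_ON(): no sched read lock */
	return idle;
}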


--
Peter Zijlstra
2015-06-22 12:25:19 UTC
It is functionally equivalent to

struct rcu_sync_struct {
atomic_t counter;
};

static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
return atomic_read(&rss->counter) == 0;
}

static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
{
atomic_inc(&rss->counter);
synchronize_sched();
}

static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
{
synchronize_sched();
atomic_dec(&rss->counter);
}

except: it records the state and synchronize_sched() is only called by
rcu_sync_enter() and only if necessary.
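
To make the intended usage concrete, here is a short sketch (illustration only; fast_path(), slow_path() and the instance name are made up). Readers stay on a cheap state check while no writer is active, and a writer flips them onto the slow path for its duration:

static DEFINE_RCU_SCHED_SYNC(my_rss);

static void my_read_side(void)
{
	rcu_read_lock_sched();
	if (likely(rcu_sync_is_idle(&my_rss)))
		fast_path();		/* hypothetical */
	else
		slow_path();		/* hypothetical */
	rcu_read_unlock_sched();
}

static void my_write_side(void)
{
	rcu_sync_enter(&my_rss);	/* calls synchronize_sched() only if GP_IDLE */
	/* every reader now sees !rcu_sync_is_idle() and takes slow_path() */
	rcu_sync_exit(&my_rss);		/* readers return to fast_path() after a later GP */
}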

Reviewed-by: Paul E. McKenney <***@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/rcusync.h | 64 ++++++++++++++++++++++++++++
kernel/rcu/Makefile | 2
kernel/rcu/sync.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 173 insertions(+), 1 deletion(-)

--- /dev/null
+++ b/include/linux/rcusync.h
@@ -0,0 +1,64 @@
+#ifndef _LINUX_RCUSYNC_H_
+#define _LINUX_RCUSYNC_H_
+
+#include <linux/wait.h>
+#include <linux/rcupdate.h>
+
+struct rcu_sync_struct {
+ int gp_state;
+ int gp_count;
+ wait_queue_head_t gp_wait;
+
+ int cb_state;
+ struct rcu_head cb_head;
+
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+};
+
+#define ___RCU_SYNC_INIT(name) \
+ .gp_state = 0, \
+ .gp_count = 0, \
+ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .cb_state = 0
+
+#define __RCU_SCHED_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_sched, \
+ .call = call_rcu_sched, \
+}
+
+#define __RCU_BH_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_rcu_bh, \
+ .call = call_rcu_bh, \
+}
+
+#define __RCU_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_rcu, \
+ .call = call_rcu, \
+}
+
+#define DEFINE_RCU_SCHED_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
+
+#define DEFINE_RCU_BH_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
+
+#define DEFINE_RCU_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
+
+static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+ return !rss->gp_state; /* GP_IDLE */
+}
+
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
+extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_enter(struct rcu_sync_struct *);
+extern void rcu_sync_exit(struct rcu_sync_struct *);
+
+#endif /* _LINUX_RCUSYNC_H_ */
+
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,108 @@
+
+#include <linux/rcusync.h>
+#include <linux/sched.h>
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define rss_lock gp_wait.lock
+
+void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+{
+ memset(rss, 0, sizeof(*rss));
+ init_waitqueue_head(&rss->gp_wait);
+
+ switch (type) {
+ case RCU_SYNC:
+ rss->sync = synchronize_rcu;
+ rss->call = call_rcu;
+ break;
+
+ case RCU_SCHED_SYNC:
+ rss->sync = synchronize_sched;
+ rss->call = call_rcu_sched;
+ break;
+
+ case RCU_BH_SYNC:
+ rss->sync = synchronize_rcu_bh;
+ rss->call = call_rcu_bh;
+ break;
+ }
+}
+
+void rcu_sync_enter(struct rcu_sync_struct *rss)
+{
+ bool need_wait, need_sync;
+
+ spin_lock_irq(&rss->rss_lock);
+ need_wait = rss->gp_count++;
+ need_sync = rss->gp_state == GP_IDLE;
+ if (need_sync)
+ rss->gp_state = GP_PENDING;
+ spin_unlock_irq(&rss->rss_lock);
+
+ BUG_ON(need_wait && need_sync);
+
+ if (need_sync) {
+ rss->sync();
+ rss->gp_state = GP_PASSED;
+ wake_up_all(&rss->gp_wait);
+ } else if (need_wait) {
+ wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ } else {
+ /*
+ * Possible when there's a pending CB from a rcu_sync_exit().
+ * Nobody has yet been allowed the 'fast' path and thus we can
+ * avoid doing any sync(). The callback will get 'dropped'.
+ */
+ BUG_ON(rss->gp_state != GP_PASSED);
+ }
+}
+
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+ struct rcu_sync_struct *rss =
+ container_of(rcu, struct rcu_sync_struct, cb_head);
+ unsigned long flags;
+
+
+ BUG_ON(rss->gp_state != GP_PASSED);
+ BUG_ON(rss->cb_state == CB_IDLE);
+
+ spin_lock_irqsave(&rss->rss_lock, flags);
+ if (rss->gp_count) {
+ /*
+ * A new rcu_sync_begin() has happened; drop the callback.
+ */
+ rss->cb_state = CB_IDLE;
+ } else if (rss->cb_state == CB_REPLAY) {
+ /*
+ * A new rcu_sync_exit() has happened; requeue the callback
+ * to catch a later GP.
+ */
+ rss->cb_state = CB_PENDING;
+ rss->call(&rss->cb_head, rcu_sync_func);
+ } else {
+ /*
+ * We're at least a GP after rcu_sync_exit(); everybody will now
+ * have observed the write side critical section. Let 'em rip!
+ */
+ rss->cb_state = CB_IDLE;
+ rss->gp_state = GP_IDLE;
+ }
+ spin_unlock_irqrestore(&rss->rss_lock, flags);
+}
+
+void rcu_sync_exit(struct rcu_sync_struct *rss)
+{
+ spin_lock_irq(&rss->rss_lock);
+ if (!--rss->gp_count) {
+ if (rss->cb_state == CB_IDLE) {
+ rss->cb_state = CB_PENDING;
+ rss->call(&rss->cb_head, rcu_sync_func);
+ } else if (rss->cb_state == CB_PENDING) {
+ rss->cb_state = CB_REPLAY;
+ }
+ }
+ spin_unlock_irq(&rss->rss_lock);
+}


--
Peter Zijlstra
2015-06-22 12:25:27 UTC
Add the new struct rcu_sync_ops which holds sync/call methods, and
turn the function pointers in rcu_sync_struct into an array of struct
rcu_sync_ops.

This simplifies the "init" helpers, and this way it is simpler to add
the new methods we need, especially ifdef'ed.

Reviewed-by: Paul E. McKenney <***@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/rcusync.h | 60 ++++++++++++++++++------------------------------
kernel/rcu/sync.c | 43 +++++++++++++++++-----------------
2 files changed, 45 insertions(+), 58 deletions(-)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -4,6 +4,8 @@
#include <linux/wait.h>
#include <linux/rcupdate.h>

+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
struct rcu_sync_struct {
int gp_state;
int gp_count;
@@ -12,53 +14,37 @@ struct rcu_sync_struct {
int cb_state;
struct rcu_head cb_head;

- void (*sync)(void);
- void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ enum rcu_sync_type gp_type;
};

-#define ___RCU_SYNC_INIT(name) \
- .gp_state = 0, \
- .gp_count = 0, \
- .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
- .cb_state = 0
-
-#define __RCU_SCHED_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_sched, \
- .call = call_rcu_sched, \
-}
-
-#define __RCU_BH_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_rcu_bh, \
- .call = call_rcu_bh, \
-}
-
-#define __RCU_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_rcu, \
- .call = call_rcu, \
-}
-
-#define DEFINE_RCU_SCHED_SYNC(name) \
- struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
-
-#define DEFINE_RCU_BH_SYNC(name) \
- struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
-
-#define DEFINE_RCU_SYNC(name) \
- struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
-
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
return !rss->gp_state; /* GP_IDLE */
}

-enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
-
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);

+#define __RCU_SYNC_INITIALIZER(name, type) { \
+ .gp_state = 0, \
+ .gp_count = 0, \
+ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .cb_state = 0, \
+ .gp_type = type, \
+ }
+
+#define __DEFINE_RCU_SYNC(name, type) \
+ struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+
+#define DEFINE_RCU_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_SYNC)
+
+#define DEFINE_RCU_SCHED_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+
+#define DEFINE_RCU_BH_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+
#endif /* _LINUX_RCUSYNC_H_ */

--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,7 +1,24 @@
-
#include <linux/rcusync.h>
#include <linux/sched.h>

+static const struct {
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+} gp_ops[] = {
+ [RCU_SYNC] = {
+ .sync = synchronize_rcu,
+ .call = call_rcu,
+ },
+ [RCU_SCHED_SYNC] = {
+ .sync = synchronize_sched,
+ .call = call_rcu_sched,
+ },
+ [RCU_BH_SYNC] = {
+ .sync = synchronize_rcu_bh,
+ .call = call_rcu_bh,
+ },
+};
+
enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };

@@ -11,23 +28,7 @@ void rcu_sync_init(struct rcu_sync_struc
{
memset(rss, 0, sizeof(*rss));
init_waitqueue_head(&rss->gp_wait);
-
- switch (type) {
- case RCU_SYNC:
- rss->sync = synchronize_rcu;
- rss->call = call_rcu;
- break;
-
- case RCU_SCHED_SYNC:
- rss->sync = synchronize_sched;
- rss->call = call_rcu_sched;
- break;
-
- case RCU_BH_SYNC:
- rss->sync = synchronize_rcu_bh;
- rss->call = call_rcu_bh;
- break;
- }
+ rss->gp_type = type;
}

void rcu_sync_enter(struct rcu_sync_struct *rss)
@@ -44,7 +45,7 @@ void rcu_sync_enter(struct rcu_sync_stru
BUG_ON(need_wait && need_sync);

if (need_sync) {
- rss->sync();
+ gp_ops[rss->gp_type].sync();
rss->gp_state = GP_PASSED;
wake_up_all(&rss->gp_wait);
} else if (need_wait) {
@@ -81,7 +82,7 @@ static void rcu_sync_func(struct rcu_hea
* to catch a later GP.
*/
rss->cb_state = CB_PENDING;
- rss->call(&rss->cb_head, rcu_sync_func);
+ gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
} else {
/*
* We're at least a GP after rcu_sync_exit(); everybody will now
@@ -99,7 +100,7 @@ void rcu_sync_exit(struct rcu_sync_struc
if (!--rss->gp_count) {
if (rss->cb_state == CB_IDLE) {
rss->cb_state = CB_PENDING;
- rss->call(&rss->cb_head, rcu_sync_func);
+ gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
} else if (rss->cb_state == CB_PENDING) {
rss->cb_state = CB_REPLAY;
}


--
Peter Zijlstra
2015-06-22 12:25:33 UTC
We can replace both the global and local part of the lglock by better
usage of cpu_stopper::lock.

By having stop_two_cpus() acquire two cpu_stopper::locks we gain full
order against the global stop_machine which takes each of these locks
in order.

Cc: Rik van Riel <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
kernel/stop_machine.c | 52 ++++++++++++++++++++++++++++----------------------
lib/Kconfig | 5 ++++
2 files changed, 35 insertions(+), 22 deletions(-)

--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,6 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
-#include <linux/lglock.h>

/*
* Structure to determine completion condition and record errors. May
@@ -44,14 +43,6 @@ static DEFINE_PER_CPU(struct cpu_stopper
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;

-/*
- * Avoids a race between stop_two_cpus and global stop_cpus, where
- * the stoppers could get queued up in reverse order, leading to
- * system deadlock. Using an lglock means stop_two_cpus remains
- * relatively cheap.
- */
-DEFINE_STATIC_LGLOCK(stop_cpus_lock);
-
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
@@ -71,21 +62,26 @@ static void cpu_stop_signal_done(struct
}

/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+static void __cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct task_struct *p = per_cpu(cpu_stopper_task, cpu);

- unsigned long flags;
-
- spin_lock_irqsave(&stopper->lock, flags);
-
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
wake_up_process(p);
- } else
+ } else {
cpu_stop_signal_done(work->done, false);
+ }
+}

+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ __cpu_stop_queue_work(cpu, work);
spin_unlock_irqrestore(&stopper->lock, flags);
}

@@ -224,9 +220,14 @@ static int multi_cpu_stop(void *data)
*/
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
- struct cpu_stop_done done;
+ struct cpu_stopper *stopper1, *stopper2;
struct cpu_stop_work work1, work2;
struct multi_stop_data msdata;
+ struct cpu_stop_done done;
+ unsigned long flags;
+
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);

preempt_disable();
msdata = (struct multi_stop_data){
@@ -258,10 +259,17 @@ int stop_two_cpus(unsigned int cpu1, uns
return -ENOENT;
}

- lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
- cpu_stop_queue_work(cpu1, &work1);
- cpu_stop_queue_work(cpu2, &work2);
- lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+ stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+ stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+
+ spin_lock_irqsave(&stopper1->lock, flags);
+ spin_lock(&stopper2->lock);
+
+ __cpu_stop_queue_work(cpu1, &work1);
+ __cpu_stop_queue_work(cpu2, &work2);
+
+ spin_unlock(&stopper2->lock);
+ spin_unlock_irqrestore(&stopper1->lock, flags);

preempt_enable();

@@ -315,10 +323,10 @@ static void queue_stop_cpus_work(const s
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- lg_global_lock(&stop_cpus_lock);
+ preempt_disable();
for_each_cpu(cpu, cpumask)
cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
- lg_global_unlock(&stop_cpus_lock);
+ preempt_enable();
}

static int __stop_cpus(const struct cpumask *cpumask,
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -61,6 +61,11 @@ config PERCPU_RWSEM_HOTPLUG
depends on HOTPLUG_CPU
select PERCPU_RWSEM

+config PERCPU_RWSEM_SMP
+ def_bool y
+ depends on SMP
+ select PERCPU_RWSEM
+
config ARCH_USE_CMPXCHG_LOCKREF
bool



--
Oleg Nesterov
2015-06-22 22:23:29 UTC
Post by Peter Zijlstra
By having stop_two_cpus() acquire two cpu_stopper::locks we gain full
order against the global stop_machine which takes each of these locks
in order.
Yes, but stop_machine() locks/unlocks cpu_stopper->lock sequentially; it
never holds more than 1 ->lock, so
Post by Peter Zijlstra
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ __cpu_stop_queue_work(cpu, work);
spin_unlock_irqrestore(&stopper->lock, flags);
}
..
Post by Peter Zijlstra
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
- struct cpu_stop_done done;
+ struct cpu_stopper *stopper1, *stopper2;
struct cpu_stop_work work1, work2;
struct multi_stop_data msdata;
+ struct cpu_stop_done done;
+ unsigned long flags;
+
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
..
Post by Peter Zijlstra
+ stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+ stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+
+ spin_lock_irqsave(&stopper1->lock, flags);
+ spin_lock(&stopper2->lock);
+
+ __cpu_stop_queue_work(cpu1, &work1);
+ __cpu_stop_queue_work(cpu2, &work2);
Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().

- stop_machine takes the lock on CPU 0, adds the work
and drops the lock

- cpu_stop_queue_work() queues both works

- stop_machine takes the lock on CPU 1, etc

In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
use different multi_stop_data's, so they will wait for each other
forever?

Oleg.

--
Peter Zijlstra
2015-06-23 10:09:56 UTC
Post by Oleg Nesterov
Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
- stop_machine takes the lock on CPU 0, adds the work
and drops the lock
- cpu_stop_queue_work() queues both works
cpu_stop_queue_work() only ever queues _1_ work.
Post by Oleg Nesterov
- stop_machine takes the lock on CPU 1, etc
In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
use different multi_stop_data's, so they will wait for each other
forever?
So what you're saying is:

  queue_stop_cpus_work()                  stop_two_cpus()

  cpu_stop_queue_work(0,..);
                                          spin_lock(0);
                                          spin_lock(1);

                                          __cpu_stop_queue_work(0,..);
                                          __cpu_stop_queue_work(1,..);

                                          spin_unlock(1);
                                          spin_unlock(0);
  cpu_stop_queue_work(1,..);

Indeed, I don't know what I was thinking...

We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.
--
Peter Zijlstra
2015-06-23 10:56:12 UTC
Post by Peter Zijlstra
We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.
Urgh, we cannot use percpu-rwsem here, because that would require
percpu_down_write_trylock(), and I'm not sure we can get around the
sync_sched() for that.

Now try_stop_cpus(), which would require the down_write_trylock(), is used to
implement synchronize_sched_expedited().

Using sync_sched() to implement sync_sched_expedited would make me
happy, but it does somewhat defeat the purpose.



Also, I think _expedited is used too eagerly, look at this:

+void dm_sync_table(struct mapped_device *md)
+{
+ synchronize_srcu(&md->io_barrier);
+ synchronize_rcu_expedited();
+}

sync_srcu() is slow already, why then bother with a
sync_rcu_expedited() :/
--
Peter Zijlstra
2015-06-23 11:21:02 UTC
Post by Peter Zijlstra
Post by Peter Zijlstra
We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.
Urgh, we cannot use percpu-rwsem here, because that would require
percpu_down_write_trylock(), and I'm not sure we can get around the
sync_sched() for that.
Now try_stop_cpus(), which would require the down_write_trylock(), is used to
implement synchronize_sched_expedited().
Using sync_sched() to implement sync_sched_expedited would make me
happy, but it does somewhat defeat the purpose.
Paul, why does this use stop_machine anyway? I seemed to remember you
sending resched IPIs around.

The rcu_sched_qs() thing would set passed_quiesce, which you can then
collect to gauge progress.

Shooting IPIs around is bad enough, but running a full blown
stop_machine is really blunt and heavy.


Also, OMFG @ 74b51ee152b6 ("ACPI / osl: speedup grace period in
acpi_os_map_cleanup"), that's an expedited use to help the nVidiot
binary blob. WTF!!

--
Peter Zijlstra
2015-06-23 13:08:57 UTC
Post by Peter Zijlstra
Paul, why does this use stop_machine anyway? I seemed to remember you
sending resched IPIs around.
The rcu_sched_qs() thing would set passed_quiesce, which you can then
collect to gauge progress.
Shooting IPIs around is bad enough, but running a full blown
stop_machine is really blunt and heavy.
Is there anything obviously amiss with the below? It does stop_one_cpu()
in a loop instead of the multi-CPU stop_machine and is therefore much
friendlier (albeit still heavier than bare resched IPIs) since the CPUs
do not have to go and sync up.

After all, all we're really interested in is that each CPU has
scheduled at least once; we do not care about the cross-CPU sync-up.

---
include/linux/stop_machine.h | 7 ----
kernel/rcu/tree.c | 99 +++++---------------------------------------
kernel/stop_machine.c | 30 --------------
3 files changed, 10 insertions(+), 126 deletions(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index d2abbdb8c6aa..f992da7ee492 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -32,7 +32,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf);
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);

#else /* CONFIG_SMP */

@@ -83,12 +82,6 @@ static inline int stop_cpus(const struct cpumask *cpumask,
return -ENOENT;
}

-static inline int try_stop_cpus(const struct cpumask *cpumask,
- cpu_stop_fn_t fn, void *arg)
-{
- return stop_cpus(cpumask, fn, arg);
-}
-
#endif /* CONFIG_SMP */

/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..4a8cde155dce 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3257,7 +3257,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
* There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
+ * between the time that stop_one_cpu() is called and the
* time that it returns.
*
* In the current initial implementation of cpu_stop, the
@@ -3291,25 +3291,12 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* grace period. We are then done, so we use atomic_cmpxchg() to
* update sync_sched_expedited_done to match our snapshot -- but
* only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;

/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3319,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3327,17 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;

- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);

-all_cpus_idle:
- free_cpumask_var(cm);
+ atomic_long_inc(&rsp->expedited_stoppedcpus);

/*
* Everyone up to our most recent fetch is covered by our grace
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..b1329a213503 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -371,36 +371,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}

-/**
- * try_stop_cpus - try to stop multiple cpus
- * @cpumask: cpus to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * @fn(@arg) was not executed at all because all cpus in @cpumask were
- * offline; otherwise, 0 if all executions of @fn returned 0, any non
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
- int ret;
-
- /* static works are used, process one request at a time */
- if (!mutex_trylock(&stop_cpus_mutex))
- return -EAGAIN;
- ret = __stop_cpus(cpumask, fn, arg);
- mutex_unlock(&stop_cpus_mutex);
- return ret;
-}
-
static int cpu_stop_should_run(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
--
Oleg Nesterov
2015-06-23 16:37:53 UTC
Post by Peter Zijlstra
void synchronize_sched_expedited(void)
{
..
Post by Peter Zijlstra
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
..
Post by Peter Zijlstra
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
..
Post by Peter Zijlstra
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
I too thought about something like this change ;)

Not sure I read this patch correctly, but it seems that you can then
remove all the rsp->expedited_* members/code?

Oleg.

--
Paul E. McKenney
2015-06-23 17:30:59 UTC
Post by Peter Zijlstra
Post by Peter Zijlstra
Paul, why does this use stop_machine anyway? I seemed to remember you
sending resched IPIs around.
It used to, but someone submitted a patch long ago that switched it
to try_stop_cpus(). At that time, RCU didn't unconditionally do the
dyntick-idle thing for CONFIG_NO_HZ=n kernels, so try_stop_cpus() was
quite a bit simpler.

That said, I do use your new-age resched-IPI API in other cases.
Post by Peter Zijlstra
Post by Peter Zijlstra
The rcu_sched_qs() thing would set passed_quiesce, which you can then
collect to gauge progress.
Shooting IPIs around is bad enough, but running a full blown
stop_machine is really blunt and heavy.
Is there anything obviously amiss with the below? It does stop_one_cpu()
in a loop instead of the multi-CPU stop_machine and is therefore much
friendlier (albeit still heavier than bare resched IPIs) since the CPUs
do not have to go and sync up.
After all, all we're really interested in is that each CPU has
scheduled at least once; we do not care about the cross-CPU sync-up.
This was on my list. I was thinking of using smp_call_function_single()
combined with polling in order to avoid the double context switch, but
the approach below is of course simpler. I was intending to fix
up the rest of RCU's relationship with CPU hotplug first, as this would
allow fully covering the incoming and outgoing code paths.

But perhaps a bit too simple. A few comments below...

Thanx, Paul
Post by Peter Zijlstra
---
include/linux/stop_machine.h | 7 ----
kernel/rcu/tree.c | 99 +++++---------------------------------------
kernel/stop_machine.c | 30 --------------
3 files changed, 10 insertions(+), 126 deletions(-)
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index d2abbdb8c6aa..f992da7ee492 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -32,7 +32,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf);
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
#else /* CONFIG_SMP */
@@ -83,12 +82,6 @@ static inline int stop_cpus(const struct cpumask *cpumask,
return -ENOENT;
}
-static inline int try_stop_cpus(const struct cpumask *cpumask,
- cpu_stop_fn_t fn, void *arg)
-{
- return stop_cpus(cpumask, fn, arg);
-}
-
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..4a8cde155dce 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3257,7 +3257,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
* There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
+ * between the time that stop_one_cpu() is called and the
* time that it returns.
*
* In the current initial implementation of cpu_stop, the
@@ -3291,25 +3291,12 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* grace period. We are then done, so we use atomic_cmpxchg() to
* update sync_sched_expedited_done to match our snapshot -- but
* only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3319,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
Hmmm...
Post by Peter Zijlstra
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3327,17 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.

If you had lots of CPUs coming and going, you could argue that tracking
them would help, but synchronize_sched_expedited() should run fast enough
that there isn't time for CPUs to come or go, at least in the common case.
Post by Peter Zijlstra
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
Here you lose batching. Yeah, I know that synchronize_sched_expedited()
is -supposed- to be used sparingly, but it is not cool for the kernel
to melt down just because some creative user found a way to heat up a
code path. Need a mutex_trylock() with a counter and checking for
others having already done the needed work.
Post by Peter Zijlstra
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
Post by Peter Zijlstra
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
Let's see... This does work for idle CPUs and for nohz_full CPUs running
in userspace.

It does not work for the current CPU, so the check needs an additional
check against raw_smp_processor_id(), which is easy enough to add.
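
A minimal sketch of the combined test (illustration only, not something posted in the thread; the helper name is invented):

/* sketch: true iff @cpu still needs to be forced through a quiescent state */
static bool cpu_needs_expedited_qs(int cpu)
{
	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

	if (cpu == raw_smp_processor_id())
		return false;	/* the CPU running this code is quiescent by definition */

	/* idle and nohz_full-userspace CPUs are already quiescent */
	return atomic_add_return(0, &rdtp->dynticks) & 0x1;
}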

There always has been a race window involving CPU hotplug. My recent
CPU_DYING_IDLE change allows things to be exact on the outgoing side,
and I need to make a similar change on the incoming side. There will
continue to be a window where RCU needs to pay attention to the CPU,
but neither IPIs nor scheduling works, and I guess I just do a timed
wait in that case. Rare race anyway, so should be fine.
Post by Peter Zijlstra
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so. This would result in a single pass through schedule() instead
of stop_one_cpu()'s double context switch. It would likely also require
some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
the need for.
Post by Peter Zijlstra
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
- free_cpumask_var(cm);
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
/*
* Everyone up to our most recent fetch is covered by our grace
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..b1329a213503 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -371,36 +371,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-/**
- * try_stop_cpus - try to stop multiple cpus
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * Might sleep.
- *
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
- int ret;
-
- /* static works are used, process one request at a time */
- if (!mutex_trylock(&stop_cpus_mutex))
- return -EAGAIN;
- ret = __stop_cpus(cpumask, fn, arg);
- mutex_unlock(&stop_cpus_mutex);
- return ret;
-}
-
static int cpu_stop_should_run(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
--
Peter Zijlstra
2015-06-23 18:04:35 UTC
Post by Paul E. McKenney
Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.
get_online_cpus()
for_each_online_cpus() {
...
}

is what the new code does.
Post by Paul E. McKenney
Post by Peter Zijlstra
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
Here you lose batching. Yeah, I know that synchronize_sched_expedited()
is -supposed- to be used sparingly, but it is not cool for the kernel
to melt down just because some creative user found a way to heat up a
code path. Need a mutex_trylock() with a counter and checking for
others having already done the needed work.
I really think you're making that expedited nonsense far too accessible.

But it was exactly that trylock I was trying to get rid of.
Post by Paul E. McKenney
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
No, a long-running RCU-sched read-side is a bug and we should fix that,
its called a preemption-latency, we don't like those.
Post by Paul E. McKenney
Post by Peter Zijlstra
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
Let's see... This does work for idle CPUs and for nohz_full CPUs running
in userspace.
It does not work for the current CPU, so the check needs an additional
check against raw_smp_processor_id(), which is easy enough to add.
Right, realized after I sent it out, but it _should_ work for the
current cpu too. Just pointless doing it.
Post by Paul E. McKenney
There always has been a race window involving CPU hotplug.
There is no hotplug race, the entire thing has get_online_cpus() held
across it.
Post by Paul E. McKenney
Post by Peter Zijlstra
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so.
set_tsk_need_resched() is buggy and should not be used.
Post by Paul E. McKenney
This would result in a single pass through schedule() instead
of stop_one_cpu()'s double context switch. It would likely also require
some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
the need for.
_IF_ you're going to touch rcu_note_context_switch(), you might as well
use a completion, set it for the number of CPUs that need a resched,
spray resched-IPI and have rcu_note_context_switch() do a complete().
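
Roughly, that completion variant might look like the sketch below (illustration only; the names are invented and it glosses over the per-CPU bookkeeping needed so each kicked CPU decrements exactly once):

static atomic_t exp_pending;			/* CPUs still owing a context switch */
static DECLARE_COMPLETION(exp_done);

static void exp_kick_and_wait(const struct cpumask *cpus)
{
	int cpu;

	atomic_set(&exp_pending, cpumask_weight(cpus));
	reinit_completion(&exp_done);

	for_each_cpu(cpu, cpus)
		smp_send_reschedule(cpu);	/* force a pass through schedule() */

	wait_for_completion(&exp_done);
}

/* would be hooked into rcu_note_context_switch() on the kicked CPUs */
static void exp_note_context_switch(void)
{
	if (atomic_dec_and_test(&exp_pending))
		complete(&exp_done);
}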

But I would really like to avoid adding code to
rcu_note_context_switch(), because we run that on _every_ single context
switch.
--
Paul E. McKenney
2015-06-23 18:26:46 UTC
Post by Peter Zijlstra
Post by Paul E. McKenney
Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.
get_online_cpus()
for_each_online_cpus() {
...
}
is what the new code does.
Ah, I missed that this was not deleted.
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
Here you lose batching. Yeah, I know that synchronize_sched_expedited()
is -supposed- to be used sparingly, but it is not cool for the kernel
to melt down just because some creative user found a way to heat up a
code path. Need a mutex_trylock() with a counter and checking for
others having already done the needed work.
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many
non-RCU bugs.
Post by Peter Zijlstra
But it was exactly that trylock I was trying to get rid of.
OK. Why, exactly?
Post by Peter Zijlstra
Post by Paul E. McKenney
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
No, a long-running RCU-sched read-side is a bug and we should fix that,
its called a preemption-latency, we don't like those.
Yes, we should fix them. No, they absolutely must not result in a
meltdown of some unrelated portion of the kernel (like RCU), particularly
if this situation occurs on some system running a production workload
that doesn't happen to care about preemption latency.
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
Let's see... This does work for idle CPUs and for nohz_full CPUs running
in userspace.
It does not work for the current CPU, so the check needs an additional
check against raw_smp_processor_id(), which is easy enough to add.
Right, realized after I sent it out, but it _should_ work for the
current cpu too. Just pointless doing it.
OK, and easily fixed up in any case.
Post by Peter Zijlstra
Post by Paul E. McKenney
There always has been a race window involving CPU hotplug.
There is no hotplug race, the entire thing has get_online_cpus() held
across it.
Which I would like to get rid of, but not urgent.
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so.
set_tsk_need_resched() is buggy and should not be used.
OK, what API is used for this purpose?
Post by Peter Zijlstra
Post by Paul E. McKenney
This would result in a single pass through schedule() instead
of stop_one_cpu()'s double context switch. It would likely also require
some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
the need for.
_IF_ you're going to touch rcu_note_context_switch(), you might as well
use a completion, set it for the number of CPUs that need a resched,
spray resched-IPI and have rcu_note_context_switch() do a complete().
But I would really like to avoid adding code to
rcu_note_context_switch(), because we run that on _every_ single context
switch.
I believe that I can rework the current code to get the effect without
increased overhead, given that I have no intention of adding the
complete(). Adding the complete -would- add overhead to that fastpath.

Thanx, Paul

--
Paul E. McKenney
2015-06-23 19:05:22 UTC
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.
get_online_cpus()
for_each_online_cpus() {
...
}
is what the new code does.
Ah, I missed that this was not deleted.
But get_online_cpus() will re-introduce a deadlock.

Thanx, Paul
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
Here you lose batching. Yeah, I know that synchronize_sched_expedited()
is -supposed- to be used sparingly, but it is not cool for the kernel
to melt down just because some creative user found a way to heat up a
code path. Need a mutex_trylock() with a counter and checking for
others having already done the needed work.
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many
non-RCU bugs.
Post by Peter Zijlstra
But it was exactly that trylock I was trying to get rid of.
OK. Why, exactly?
Post by Peter Zijlstra
Post by Paul E. McKenney
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
No, a long-running RCU-sched read-side is a bug and we should fix that,
its called a preemption-latency, we don't like those.
Yes, we should fix them. No, they absolutely must not result in a
meltdown of some unrelated portion of the kernel (like RCU), particularly
if this situation occurs on some system running a production workload
that doesn't happen to care about preemption latency.
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
Let's see... This does work for idle CPUs and for nohz_full CPUs running
in userspace.
It does not work for the current CPU, so the check needs an additional
check against raw_smp_processor_id(), which is easy enough to add.
Right, realized after I sent it out, but it _should_ work for the
current cpu too. Just pointless doing it.
OK, and easily fixed up in any case.
Post by Peter Zijlstra
Post by Paul E. McKenney
There always has been a race window involving CPU hotplug.
There is no hotplug race, the entire thing has get_online_cpus() held
across it.
Which I would like to get rid of, but not urgent.
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so.
set_tsk_need_resched() is buggy and should not be used.
OK, what API is used for this purpose?
Post by Peter Zijlstra
Post by Paul E. McKenney
This would result in a single pass through schedule() instead
of stop_one_cpu()'s double context switch. It would likely also require
some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
the need for.
_IF_ you're going to touch rcu_note_context_switch(), you might as well
use a completion, set it for the number of CPUs that need a resched,
spray resched-IPI and have rcu_note_context_switch() do a complete().
But I would really like to avoid adding code to
rcu_note_context_switch(), because we run that on _every_ single context
switch.
I believe that I can rework the current code to get the effect without
increased overhead, given that I have no intention of adding the
complete(). Adding the complete -would- add overhead to that fastpath.
Thanx, Paul
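
For concreteness, here is a rough, untested sketch of the completion-based variant Peter floats above (and Paul declines): the per-CPU exp_need_qs flag, the helper names, and the hook into rcu_note_context_switch() are all assumptions for illustration, not part of any posted patch.

static DEFINE_PER_CPU(bool, exp_need_qs);	/* this CPU still owes a switch */
static atomic_t exp_outstanding;		/* CPUs still owing a switch */
static DECLARE_COMPLETION(exp_all_switched);

/* Hypothetical hook for rcu_note_context_switch(); this per-switch test is
 * exactly the fast-path overhead being objected to above. */
static inline void exp_note_context_switch(void)
{
	if (likely(!__this_cpu_read(exp_need_qs)))
		return;
	__this_cpu_write(exp_need_qs, false);
	if (atomic_dec_and_test(&exp_outstanding))
		complete(&exp_all_switched);
}

/* Waiter side; assumes the caller holds get_online_cpus() and serializes
 * rounds.  A real version would also skip idle/nohz CPUs via the dynticks
 * test used in the patches above. */
static void exp_force_context_switches(void)
{
	int cpu, nr = 0;

	for_each_online_cpu(cpu)		/* first pass: how many CPUs? */
		if (cpu != raw_smp_processor_id())
			nr++;
	if (!nr)
		return;

	reinit_completion(&exp_all_switched);
	atomic_set(&exp_outstanding, nr);
	smp_mb();	/* count visible before any flag is set below */

	for_each_online_cpu(cpu) {		/* second pass: mark and kick */
		if (cpu == raw_smp_processor_id())
			continue;
		per_cpu(exp_need_qs, cpu) = true;
		resched_cpu(cpu);		/* spray resched-IPIs */
	}
	wait_for_completion(&exp_all_switched);
}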
Paul E. McKenney
2015-06-24 02:24:11 UTC
Permalink
Post by Paul E. McKenney
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.
get_online_cpus()
for_each_online_cpu(cpu) {
...
}
is what the new code does.
Ah, I missed that this was not deleted.
But get_online_cpus() will re-introduce a deadlock.
And here is an untested patch that applies the gist of your approach,
the series of stop_one_cpu() calls, but without undoing the rest.
I forged your Signed-off-by, please let me know if that doesn't work
for you. There are a number of simplifications that can be made, but
the basic approach gets a good testing first.

And I just noticed that I forgot to get rid of try_stop_cpus().
Well, there will probably be a test failure or two to handle, so
I can add that in the next version. ;-)

Thanx, Paul

------------------------------------------------------------------------

commit 1de96c34b39d840c5fe2689640345ed26f78b8f8
Author: Peter Zijlstra <***@infradead.org>
Date: Tue Jun 23 19:03:45 2015 -0700

rcu: Switch synchronize_sched_expedited() to stop_one_cpu()

The synchronize_sched_expedited() currently invokes try_stop_cpus(),
which schedules the stopper kthreads on each online non-idle CPU,
and waits until all those kthreads are running before letting any
of them stop. This is disastrous for real-time workloads, which
get hit with a preemption that is as long as the longest scheduling
latency on any CPU, including any non-realtime housekeeping CPUs.
This commit therefore switches to using stop_one_cpu() on each CPU
in turn. This avoids inflicting the worst-case scheduling latency
on the worst-case CPU onto all other CPUs, and also simplifies the
code a little bit.

Follow-up commits will simplify the counter-snapshotting algorithm
and convert a number of the counters that are now protected by the
new ->expedited_mutex to non-atomic.

Signed-off-by: Peter Zijlstra <***@infradead.org>
[ paulmck: Kept stop_one_cpu(), dropped disabling of "guardrails". ]
Signed-off-by: Paul E. McKenney <***@linux.vnet.ibm.com>

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78d0a87ff354..a30971474134 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3357,8 +3358,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
long firstsnap, s, snap;
int trycount = 0;
@@ -3394,28 +3393,11 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
+ while (!mutex_trylock(&rsp->expedited_mutex)) {
put_online_cpus();
atomic_long_inc(&rsp->expedited_tryfail);

@@ -3425,7 +3407,6 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
return;
}

@@ -3435,7 +3416,6 @@ void synchronize_sched_expedited(void)
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
return;
}

@@ -3445,7 +3425,6 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
return;
}

@@ -3460,16 +3439,23 @@ void synchronize_sched_expedited(void)
/* CPU hotplug operation in flight, use normal GP. */
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
return;
}
snap = atomic_long_read(&rsp->expedited_start);
smp_mb(); /* ensure read is before try_stop_cpus(). */
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);

-all_cpus_idle:
- free_cpumask_var(cm);
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
+ }
+ atomic_long_inc(&rsp->expedited_stoppedcpus);

/*
* Everyone up to our most recent fetch is covered by our grace
@@ -3488,6 +3474,7 @@ all_cpus_idle:
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+ mutex_unlock(&rsp->expedited_mutex);

put_online_cpus();
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de22d6d06bf9..b04ffa0dea58 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -478,6 +478,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */

+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */

Peter Zijlstra
2015-06-24 08:33:22 UTC
Permalink
Post by Paul E. McKenney
And here is an untested patch that applies the gist of your approach,
the series of stop_one_cpu() calls, but without undoing the rest.
I forged your Signed-off-by, please let me know if that doesn't work
for you. There are a number of simplifications that can be made, but
the basic approach gets a good testing first.
So I really do not get the point of the trylock. It doesn't make sense.

Why would you poll the mutex instead of just wait for it and then
recheck if someone did the work while you were waiting for it?

What's wrong with the below?

---
kernel/rcu/tree.c | 100 +++++++++++++++---------------------------------------
kernel/rcu/tree.h | 1 +
2 files changed, 29 insertions(+), 72 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..b39a5672a7ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3304,12 +3305,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;

/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3330,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,83 +3338,40 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);

- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }

- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;

- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
atomic_long_inc(&rsp->expedited_stoppedcpus);

-all_cpus_idle:
- free_cpumask_var(cm);
-
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3435,6 +3389,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ mutex_unlock(&rsp->expedited_mutex);

put_online_cpus();
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..10348c081e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */

+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
Peter Zijlstra
2015-06-24 09:31:28 UTC
Permalink
Post by Peter Zijlstra
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
FWIW isn't that guaranteed by the control dep?
Post by Peter Zijlstra
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
Paul E. McKenney
2015-06-24 13:49:33 UTC
Permalink
Post by Peter Zijlstra
Post by Peter Zijlstra
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
FWIW isn't that guaranteed by the control dep?
For trailing stores, yes, but not for trailing loads. Of course,
trailing loads don't matter in the pure kfree case, but do matter in
other situations. And this isn't anywhere near a fastpath, so I
am not all that worried about the extra memory barrier.
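
For illustration only (this helper is hypothetical, not from any patch in this thread), the asymmetry being described is:

static int expedited_done_check(struct rcu_state *rsp, unsigned long snap,
				int *freed, int *other)
{
	int r1 = 0;

	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_done),
			 (ulong)snap)) {
		/* A store here is ordered after the ->expedited_done load by
		 * the control dependency alone. */
		WRITE_ONCE(*freed, 1);
		/* A load here is NOT ordered by the control dependency and may
		 * be satisfied early, hence the full barrier kept in the patch. */
		r1 = READ_ONCE(*other);
	}
	return r1;
}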

Thanx, Paul
Post by Peter Zijlstra
Post by Peter Zijlstra
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
Paul E. McKenney
2015-06-24 15:20:25 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
And here is an untested patch that applies the gist of your approach,
the series of stop_one_cpu() calls, but without undoing the rest.
I forged your Signed-off-by, please let me know if that doesn't work
for you. There are a number of simplifications that can be made, but
the basic approach gets a good testing first.
So I really do not get the point of the trylock. It doesn't make sense.
Why would you poll the mutex instead of just wait for it and then
recheck if someone did the work while you were waiting for it?
What's wrong with the below?
Various delays can cause tasks to queue on the mutex out of order.
This can cause a given task not only to have been delayed between
sampling ->expedited_start and the mutex_lock(), but be further delayed
because tasks granted the mutex earlier will wait on grace periods that
the delayed task doesn't need to wait on. These extra waits are simply
not consistent with the "expedited" in synchronize_sched_expedited().

That said, my polling code can most definitely be improved -- as I
mentioned earlier, it is from 2008 or so, back when a lot of things
worked differently. My first thought is to apply something sort of
like force_quiescent_state()'s funnel locking, but with unconditional
mutex_lock() instead of the raw_spin_trylock(). That way, when a given
task is awakened, there is a high probability that a grace period it
can use has already elapsed, allowing it to break out of the loop and go
on its way. This can probably be further improved, but it is a decent
place for me to start.
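
Roughly, that funnel-locking shape might look like the sketch below; it is untested, and the per-rcu_node ->exp_funnel_mutex field is an assumption here rather than something in the posted patches:

/*
 * Walk up the rcu_node tree, taking one mutex per level and dropping the
 * previous one.  At each level, bail if the done ticket shows that some
 * earlier holder's grace period already covers our snapshot.  Returns
 * true with the root's mutex held if we still need to drive a GP.
 */
static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long snap)
{
	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
	struct rcu_node *rnp = rdp->mynode;
	struct rcu_node *held = NULL;

	for (; rnp != NULL; rnp = rnp->parent) {
		mutex_lock(&rnp->exp_funnel_mutex);	/* hypothetical field */
		if (held)
			mutex_unlock(&held->exp_funnel_mutex);
		if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_done),
				 (ulong)snap)) {
			/* Someone else did our work while we slept. */
			mutex_unlock(&rnp->exp_funnel_mutex);
			return false;
		}
		held = rnp;
	}
	/* Caller now holds rcu_get_root(rsp)->exp_funnel_mutex. */
	return true;
}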

Thanx, Paul
Post by Peter Zijlstra
---
kernel/rcu/tree.c | 100 +++++++++++++++---------------------------------------
kernel/rcu/tree.h | 1 +
2 files changed, 29 insertions(+), 72 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..b39a5672a7ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3304,12 +3305,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3330,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,83 +3338,40 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
atomic_long_inc(&rsp->expedited_stoppedcpus);
- free_cpumask_var(cm);
-
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3435,6 +3389,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+ mutex_unlock(&rsp->expedited_mutex);
put_online_cpus();
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..10348c081e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
Peter Zijlstra
2015-06-24 15:35:11 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
And here is an untested patch that applies the gist of your approach,
the series of stop_one_cpu() calls, but without undoing the rest.
I forged your Signed-off-by, please let me know if that doesn't work
for you. There are a number of simplifications that can be made, but
the basic approach gets a good testing first.
So I really do not get the point of the trylock. It doesn't make sense.
Why would you poll the mutex instead of just wait for it and then
recheck if someone did the work while you were waiting for it?
What's wrong with the below?
Various delays can cause tasks to queue on the mutex out of order.
If the mutex owner sleeps, mutexes are FIFO, otherwise things can get
iffy indeed.
Post by Paul E. McKenney
This can cause a given task not only to have been delayed between
sampling ->expedited_start and the mutex_lock(), but be further delayed
because tasks granted the mutex earlier will wait on grace periods that
the delayed task doesn't need to wait on. These extra waits are simply
not consistent with the "expedited" in synchronize_sched_expedited().
Feh, I really do not know if it's worth optimizing the concurrent
expedited case, but we could just make it an open-coded mutex that's
strictly FIFO. A waitqueue on the done variable might be sufficient.

That's still tons better than polling.
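
As a sketch, "a waitqueue on the done variable" could look like this; expedited_wq and the helper names are assumptions, this only shows the waiting side, and the real ->expedited_done update is a cmpxchg loop rather than the plain publish below:

static DECLARE_WAIT_QUEUE_HEAD(expedited_wq);	/* hypothetical */

static bool exp_done(struct rcu_state *rsp, unsigned long snap)
{
	return ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_done),
			    (ulong)snap);
}

/* Caller side: no mutex queue at all, just sleep until covered. */
static void exp_wait_done(struct rcu_state *rsp, unsigned long snap)
{
	wait_event(expedited_wq, exp_done(rsp, snap));
}

/* Whoever drove the grace period publishes the new ticket and wakes all. */
static void exp_publish_done(struct rcu_state *rsp, long s)
{
	atomic_long_set(&rsp->expedited_done, s);
	wake_up_all(&expedited_wq);
}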
Peter Zijlstra
2015-06-24 07:35:25 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many
non-RCU bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
No, a long-running RCU-sched read-side is a bug and we should fix that;
it's called a preemption latency, and we don't like those.
Yes, we should fix them. No, they absolutely must not result in a
meltdown of some unrelated portion of the kernel (like RCU), particularly
if this situation occurs on some system running a production workload
that doesn't happen to care about preemption latency.
I still don't see a problem here though; the stop_one_cpu() invocation
for the CPU that's suffering its preemption latency will take longer,
but so what?

How does polling and dropping back to sync_rcu() generate better
behaviour than simply waiting for the completion?
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so.
set_tsk_need_resched() is buggy and should not be used.
OK, what API is used for this purpose?
As per exception you (rcu) already have access to resched_cpu(), use
that -- if it doesn't do what you need it to, we'll fix it, you're the
only consumer of it.
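
A minimal sketch of that resched_cpu()-based kick, reusing the dynticks test from the patches above to skip idle CPUs; note that it only forces the context switches and does not by itself detect when they have all happened:

static void synchronize_sched_expedited_kick(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

		/* Skip ourselves; we will pass through schedule() soon enough. */
		if (cpu == raw_smp_processor_id())
			continue;
		/* Skip idle/nohz CPUs; they are already quiescent. */
		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
			continue;
		resched_cpu(cpu);	/* force a pass through schedule() */
	}
}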
Ingo Molnar
2015-06-24 08:43:07 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many non-RCU
bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
Btw., being a 'triage center' is the bane of APIs that are overly successful,
so we should take that burden with pride! :-)

Lockdep (and the scheduler APIs as well) frequently got into such situations as
well, and we mostly solved it by being more informative with debug splats.

I don't think a kernel API should (ever!) stay artificially silent, just for fear
of flagging too many problems in other code.

Thanks,

Ingo
Paul E. McKenney
2015-06-24 13:40:08 UTC
Permalink
Post by Ingo Molnar
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many non-RCU
bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
Btw., being a 'triage center' is the bane of APIs that are overly successful,
so we should take that burden with pride! :-)
I will gladly accept that compliment.

And the burden. But, lazy as I am, I intend to automate it. ;-)
Post by Ingo Molnar
Lockdep (and the scheduler APIs as well) frequently got into such situations as
well, and we mostly solved it by being more informative with debug splats.
I don't think a kernel API should (ever!) stay artificially silent, just for fear
of flagging too many problems in other code.
I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
RCU checks, and the object-debug-based checks for double call_rcu().
That said, in all of these cases, including your example of lockdep,
the diagnostic is a debug splat rather than a mutex-contention meltdown.
And it is the mutex-contention meltdown that I will continue making
synchronize_sched_expedited() avoid.

But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
IPIs, it would not be hard to splat if a given CPU didn't come back fast
enough. The latency tracer would of course provide better information,
but synchronize_sched_expedited() could do a coarse-grained job with
less setup required.

My first guess for the timeout would be something like 500 milliseconds.
Thoughts?

Thanx, Paul

Ingo Molnar
2015-06-24 13:43:55 UTC
Permalink
Post by Paul E. McKenney
Post by Ingo Molnar
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many non-RCU
bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
Btw., being a 'triage center' is the bane of APIs that are overly successful,
so we should take that burden with pride! :-)
I will gladly accept that compliment.
And the burden. But, lazy as I am, I intend to automate it. ;-)
lol :)
Post by Paul E. McKenney
Post by Ingo Molnar
Lockdep (and the scheduler APIs as well) frequently got into such situations as
well, and we mostly solved it by being more informative with debug splats.
I don't think a kernel API should (ever!) stay artificially silent, just for fear
of flagging too many problems in other code.
I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
RCU checks, and the object-debug-based checks for double call_rcu().
That said, in all of these cases, including your example of lockdep,
the diagnostic is a debug splat rather than a mutex-contention meltdown.
And it is the mutex-contention meltdown that I will continue making
synchronize_sched_expedited() avoid.
But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
IPIs, it would not be hard to splat if a given CPU didn't come back fast
enough. The latency tracer would of course provide better information,
but synchronize_sched_expedited() could do a coarse-grained job with
less setup required.
My first guess for the timeout would be something like 500 milliseconds.
Thoughts?
So I'd start with 5,000 milliseconds and observe the results first ...

Thanks,

Ingo
Paul E. McKenney
2015-06-24 14:07:19 UTC
Permalink
Post by Ingo Molnar
Post by Paul E. McKenney
Post by Ingo Molnar
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many non-RCU
bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
Btw., being a 'triage center' is the bane of APIs that are overly successful,
so we should take that burden with pride! :-)
I will gladly accept that compliment.
And the burden. But, lazy as I am, I intend to automate it. ;-)
lol :)
Post by Paul E. McKenney
Post by Ingo Molnar
Lockdep (and the scheduler APIs as well) frequently got into such situations as
well, and we mostly solved it by being more informative with debug splats.
I don't think a kernel API should (ever!) stay artificially silent, just for fear
of flagging too many problems in other code.
I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
RCU checks, and the object-debug-based checks for double call_rcu().
That said, in all of these cases, including your example of lockdep,
the diagnostic is a debug splat rather than a mutex-contention meltdown.
And it is the mutex-contention meltdown that I will continue making
synchronize_sched_expedited() avoid.
But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
IPIs, it would not be hard to splat if a given CPU didn't come back fast
enough. The latency tracer would of course provide better information,
but synchronize_sched_expedited() could do a coarse-grained job with
less setup required.
My first guess for the timeout would be something like 500 milliseconds.
Thoughts?
So I'd start with 5,000 milliseconds and observe the results first ...
Sounds good, especially when I recall that the default RCU CPU stall
warning timeout is 21,000 milliseconds... ;-)

Thanx, Paul

Paul E. McKenney
2015-06-24 14:51:46 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many
non-RCU bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
As discussed in the thread with Ingo, I will do both.

Alternatively, RCU -is- abuse. Anyone who tries to tell you
otherwise simply lacks proper respect for and adoration of traditional
synchronization mechanisms. ;-)
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
No, a long-running RCU-sched read-side is a bug and we should fix that;
it's called a preemption latency, and we don't like those.
Yes, we should fix them. No, they absolutely must not result in a
meltdown of some unrelated portion of the kernel (like RCU), particularly
if this situation occurs on some system running a production workload
that doesn't happen to care about preemption latency.
I still don't see a problem here though; the stop_one_cpu() invocation
for the CPU that's suffering its preemption latency will take longer,
but so what?
How does polling and dropping back to sync_rcu() generate better
behaviour than simply waiting for the completion?
Because if there is too much delay, synchronize_rcu() is no slower
than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
more efficient.

That said, it appears that I have not given any particular thought to the
polling code since about 2008 or so, and it could use quite an upgrade...
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so.
set_tsk_need_resched() is buggy and should not be used.
OK, what API is used for this purpose?
As per exception you (rcu) already have access to resched_cpu(), use
that -- if it doesn't do what you need it to, we'll fix it, you're the
only consumer of it.
Color me slow and stupid!

And it looks like resched_cpu() does just fine on the local CPU, so it
should be just fine as is. Thank you for the reminder.

Thanx, Paul

Peter Zijlstra
2015-06-24 15:02:30 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
I still don't see a problem here though; the stop_one_cpu() invocation
for the CPU that's suffering its preemption latency will take longer,
but so what?
How does polling and dropping back to sync_rcu() generate better
behaviour than simply waiting for the completion?
Because if there is too much delay, synchronize_rcu() is no slower
than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
more efficient.
Still confused. How is polling and then blocking more efficient than
just blocking in the first place? I'm seeing the polling as a waste of
cpu time.

The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
is equally stalled. The sync_rcu() cannot wait more efficiently than we're
already waiting either.


Paul E. McKenney
2015-06-24 15:32:12 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I still don't see a problem here though; the stop_one_cpu() invocation
for the CPU that's suffering its preemption latency will take longer,
but so what?
How does polling and dropping back to sync_rcu() generate better
behaviour than simply waiting for the completion?
Because if there is too much delay, synchronize_rcu() is no slower
than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
more efficient.
Still confused. How is polling and then blocking more efficient than
just blocking in the first place? I'm seeing the polling as a waste of
cpu time.
As I said, the current code is quite old and will get a facelift.
Post by Peter Zijlstra
The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
is equally stalled. The sync_rcu() cannot wait more efficiently than we're
already waiting either.
Ah, but synchronize_rcu() doesn't force waiting on more than one extra
grace period. With strictly queued mutex, you can end up waiting on
several.

Thanx, Paul

Peter Zijlstra
2015-06-24 15:40:28 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
is equally stalled. The sync_rcu() cannot wait more efficiently than we're
already waiting either.
Ah, but synchronize_rcu() doesn't force waiting on more than one extra
grace period. With strictly queued mutex, you can end up waiting on
several.
But you could fix that by replacing/augmenting the expedited ticket with
gpnum/completed as used in get_state_synchronize_rcu()/cond_synchronize_rcu().


Paul E. McKenney
2015-06-24 16:09:24 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
is equally stalled. The sync_rcu() cannot wait more efficiently than we're
already waiting either.
Ah, but synchronize_rcu() doesn't force waiting on more than one extra
grace period. With strictly queued mutex, you can end up waiting on
several.
But you could fix that by replacing/augmenting the expedited ticket with
gpnum/completed as used in get_state_synchronize_rcu()/cond_synchronize_rcu().
Yes, good point, that would be a way of speeding the existing polling
loop up in the case where the polling loop took longer than a normal
grace period. Might also be a way to speed up the new "polling" regime,
but I am still beating up the counters. ;-)

But if the mutex serializes everything unconditionally, then you have
already potentially waited for several grace periods worth of time
before you get a chance to check the ticket, so the check doesn't help.
Or am I missing something subtle here?

It looks like I do need to use smp_call_function_single() and your
resched_cpu() because calling stop_one_cpu() sequentially is about
twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
But either way, your point about not stopping all the CPUs does hold.

Thanx, Paul

Peter Zijlstra
2015-06-24 16:42:25 UTC
Permalink
Post by Paul E. McKenney
Yes, good point, that would be a way of speeding the existing polling
loop up in the case where the polling loop took longer than a normal
grace period. Might also be a way to speed up the new "polling" regime,
but I am still beating up the counters. ;-)
But if the mutex serializes everything unconditionally, then you have
already potentially waited for several grace periods worth of time
before you get a chance to check the ticket, so the check doesn't help.
Or am I missing something subtle here?
Observe gpnum before you acquire the mutex, once you get it, check it
against completed, if you've waited long enough, bail.

The thing is, once you start bailing on this condition your 'queue'
drains very fast and this is around the same time sync_rcu() would've
released the waiters too.
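
In code, that check might look like the sketch below. It mirrors the get_state_synchronize_rcu()/cond_synchronize_rcu() logic, but reaches into the tree.c-internal ->gpnum/->completed fields, so treat it purely as an illustration:

/* Returns true, with the mutex dropped, if a full grace period that began
 * after we were called has already completed while we slept on the mutex. */
static bool exp_bail_if_covered(struct rcu_state *rsp)
{
	unsigned long gp_snap;

	smp_mb();	/* Order the caller's prior accesses before the snapshot. */
	gp_snap = smp_load_acquire(&rsp->gpnum);

	mutex_lock(&rsp->expedited_mutex);

	if (ULONG_CMP_LT(gp_snap, smp_load_acquire(&rsp->completed))) {
		mutex_unlock(&rsp->expedited_mutex);
		return true;	/* someone's grace period covers us */
	}
	return false;		/* still holding the mutex; do the expedited pass */
}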

Furthermore, until this point we can have 'slow' progress by kicking the
CPUs.

That said, the all-CPUs-concurrent sync_rcu_expedited scenario is
absolutely horrid; it's everyone spraying everyone else.
Post by Paul E. McKenney
It looks like I do need to use smp_call_function_single() and your
resched_cpu() because calling stop_one_cpu() sequentially is about
twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
But either way, your point about not stopping all the CPUs does hold.
Bah, I was afraid of that, the problem is that we wait for the
individual stop_work to complete before sending another.

The below is getting a little out of hand, but should avoid the problem
and might be easier than getting the IPI thing going, but who knows.

---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);

+struct exp_stop_state {
+ wait_queue_head_t *wq;
+ atomic_t count;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct exp_stop_state *ess = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ess->count))
+ wake_up(ess->wq);
+
return 0;
}

+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+ struct exp_stop_state ess = { .wq = &stop_wait, };
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;

/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);

- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }

- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;

- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&ess.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
+ &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);

-all_cpus_idle:
- free_cpumask_var(cm);
+ wait_event(stop_wait, !atomic_read(&ess.count));
+
+ atomic_long_inc(&rsp->expedited_stoppedcpus);

/*
* Everyone up to our most recent fetch is covered by our grace
@@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ mutex_unlock(&rsp->expedited_mutex);

put_online_cpus();
}
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */

+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
Paul E. McKenney
2015-06-24 17:10:45 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Yes, good point, that would be a way of speeding the existing polling
loop up in the case where the polling loop took longer than a normal
grace period. Might also be a way to speed up the new "polling" regime,
but I am still beating up the counters. ;-)
But if the mutex serializes everything unconditionally, then you have
already potentially waited for several grace periods worth of time
before you get a chance to check the ticket, so the check doesn't help.
Or am I missing something subtle here?
Observe gpnum before you acquire the mutex, once you get it, check it
against completed, if you've waited long enough, bail.
The thing is, once you start bailing on this condition your 'queue'
drains very fast and this is around the same time sync_rcu() would've
released the waiters too.
In my experience, this sort of thing simply melts down on large systems.
I am reworking this with multiple locks so as to keep the large-system
contention down to a dull roar.
Post by Peter Zijlstra
Furthermore, until this point we can have 'slow' progress by kicking the
CPUs.
That said, the all-CPUs-concurrent sync_rcu_expedited scenario is
absolutely horrid; it's everyone spraying everyone else.
Agreed, but we really need a system in this state to remain responsive
enough to allow reasonable debugging to proceed rather than just silently
hanging. Ergo, I will be providing multiple locks to keep contention
within the realm of reason. It really isn't complex enough to be worth
arguing about. Maybe 20 lines of straightforward code. (Yeah, yeah,
Murphy says otherwise, but he will have to prove it.)
Post by Peter Zijlstra
Post by Paul E. McKenney
It looks like I do need to use smp_call_function_single() and your
resched_cpu() because calling stop_one_cpu() sequentially is about
twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
But either way, your point about not stopping all the CPUs does hold.
Bah, I was afraid of that, the problem is that we wait for the
individual stop_work to complete before sending another.
The below is getting a little out of hand, but should avoid the problem
and might be easier than getting the IPI thing going, but who knows.
OK, I will give this a try. Of course, the counter needs to be
initialized to 1 rather than zero, and it needs to be atomically
decremented after all stop_one_cpu_nowait() invocations, otherwise you
can get an early wakeup due to the usual race conditions.
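
Concretely, the fix described here amounts to something like the following untested adjustment of the dispatch loop from the patch above (it reuses ess, stop_wait, and exp_stop_work from that patch; the bias reference is the only change):

	atomic_set(&ess.count, 1);	/* bias: hold one reference ourselves */

	for_each_online_cpu(cpu) {
		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

		/* Skip ourselves and idle CPUs, as before. */
		if (raw_smp_processor_id() == cpu ||
		    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
			continue;
		atomic_inc(&ess.count);
		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
				    &ess, &per_cpu(exp_stop_work, cpu));
	}

	/* Drop the bias only after every stopper has been queued. */
	if (atomic_dec_and_test(&ess.count))
		wake_up(&stop_wait);
	wait_event(stop_wait, !atomic_read(&ess.count));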

Thanx, Paul
Post by Peter Zijlstra
---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+struct exp_stop_state {
+ wait_queue_head_t *wq;
+ atomic_t count;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct exp_stop_state *ess = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ess->count))
+ wake_up(ess->wq);
+
return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+ struct exp_stop_state ess = { .wq = &stop_wait, };
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&ess.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
+ &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
- free_cpumask_var(cm);
+ wait_event(stop_wait, !atomic_read(&ess.count));
+
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
/*
* Everyone up to our most recent fetch is covered by our grace
@@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+ mutex_unlock(&rsp->expedited_mutex);
put_online_cpus();
}
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
Paul E. McKenney
2015-06-24 17:20:36 UTC
Permalink
[ . . . ]
Post by Paul E. McKenney
Post by Peter Zijlstra
Post by Paul E. McKenney
It looks like I do need to use smp_call_function_single() and your
resched_cpu() because calling stop_one_cpu() sequentially is about
twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
But either way, your point about not stopping all the CPUs does hold.
Bah, I was afraid of that; the problem is that we wait for the
individual stop_work to complete before sending another.
The below is getting a little out of hand, but should avoid the problem
and might be easier than getting the IPI thing going, but who knows.
OK, I will give this a try. Of course, the counter needs to be
initialized to 1 rather than zero, and it needs to be atomically
decremented after all stop_one_cpu_nowait() invocations, otherwise you
can get an early wakeup due to the usual race conditions.
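For reference, a minimal sketch of that init-to-1 / decrement-after-dispatch
pattern (this is essentially what Peter's later version downthread ends up
doing, not the patch as quoted below; stop_wait, cpu, rcu_dynticks and
exp_stop_work are as declared in that patch):

	struct exp_stop_state ess = {
		.wq = &stop_wait,
		.count = ATOMIC_INIT(1),	/* hold off the wakeup until dispatch is done */
	};

	for_each_online_cpu(cpu) {
		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

		if (cpu == raw_smp_processor_id())
			continue;	/* the caller is already quiescent */
		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
			continue;	/* idle CPUs are quiescent too */

		atomic_inc(&ess.count);
		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
				    &ess, &per_cpu(exp_stop_work, cpu));
	}
	atomic_dec(&ess.count);		/* drop the initial count; wakeups may fire now */
	wait_event(stop_wait, !atomic_read(&ess.count));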
Except that I promised Ingo I would check for CPUs failing to schedule
quickly enough, which means that I must track them individually rather
than via a single counter...

You did have me going for a bit, though! ;-)

Thanx, Paul
Post by Paul E. McKenney
Post by Peter Zijlstra
---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+struct exp_stop_state {
+ wait_queue_head_t *wq;
+ atomic_t count;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct exp_stop_state *ess = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ess->count))
+ wake_up(ess->wq);
+
return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+ struct exp_stop_state ess = { .wq = &stop_wait, };
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&ess.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
+ &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
- free_cpumask_var(cm);
+ wait_event(stop_wait, !atomic_read(&ess.count));
+
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
/*
* Everyone up to our most recent fetch is covered by our grace
@@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+ mutex_unlock(&rsp->expedited_mutex);
put_online_cpus();
}
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
Peter Zijlstra
2015-06-24 17:29:46 UTC
Permalink
Post by Paul E. McKenney
Except that I promised Ingo I would check for CPUs failing to schedule
quickly enough, which means that I must track them individually rather
than via a single counter...
You can track individual CPUs' timestamps by extending the per-cpu
storage we use for the exp_stop_work.
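For example, something like the following -- the struct layout and the
names exp_stop_percpu/start_jiffies are only illustrative here, not taken
from any posted patch:

	struct exp_stop_percpu {
		struct cpu_stop_work work;	/* passed to stop_one_cpu_nowait() */
		unsigned long start_jiffies;	/* when the stop work was queued */
		int done;			/* set by the stopper callback */
	};
	static DEFINE_PER_CPU(struct exp_stop_percpu, exp_stop_percpu);

The dispatcher would record per_cpu(exp_stop_percpu, cpu).start_jiffies =
jiffies just before stop_one_cpu_nowait(), and a stall check can then flag
any CPU whose ->done is still zero after too many jiffies have elapsed.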
Peter Zijlstra
2015-06-24 17:28:32 UTC
Permalink
Post by Paul E. McKenney
OK, I will give this a try. Of course, the counter needs to be
initialized to 1 rather than zero, and it needs to be atomically
decremented after all stop_one_cpu_nowait() invocations, otherwise you
can get an early wakeup due to the usual race conditions.
Clever that.

How about something like this: it replaces the mutex and start/done ticket
thing with an MCS-style lockless FIFO queue.

It further uses the gpnum/completed thing to short-circuit things if
we've waited long enough.
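Distilled, the queue discipline in the patch below looks like this (same
names as the patch, comments added; the wake_up_process() that only gets
spotted further downthread is included here):

	/* enqueue: atomically make ourselves the new tail */
	prev = xchg(&rsp->expedited_queue, &entry);
	if (prev) {
		/* someone is ahead of us: link in behind them and sleep */
		WRITE_ONCE(prev->next, &entry);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (smp_load_acquire(&entry.done))
				break;		/* predecessor handed off to us */
			schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	/* ... check for / run the expedited grace period ... */

	/* dequeue: hand off to the successor, if any */
	next = READ_ONCE(entry.next);
	if (!next) {
		/* we look like the tail: try to reset the queue to empty */
		if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
			goto done;	/* nobody to hand off to */
		/* lost that race: a successor is linking in, wait for the pointer */
		while (!(next = READ_ONCE(entry.next)))
			cpu_relax();
	}
	smp_store_release(&next->done, 1);
	wake_up_process(next->task);
done:
	put_online_cpus();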

---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,23 +3253,28 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);

+struct exp_stop_state {
+ wait_queue_head_t *wq;
+ atomic_t count;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct exp_stop_state *ess = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ess->count))
+ wake_up(ess->wq);
+
return 0;
}

+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,138 +3309,84 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+ struct exp_stop_state ess = {
+ .wq = &stop_wait,
+ .count = ATOMIC_INIT(1),
+ };
struct rcu_state *rsp = &rcu_sched_state;
+ struct expedited_queue_task {
+ struct expedited_queue_task *next;
+ struct task_struct *task;
+ int done;
+ } *prev, *next, entry = {
+ .task = current,
+ };
+ long gpnum;
+ int cpu;

- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
-
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
return;
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ smp_mb();
+ gpnum = smp_load_acquire(&rsp->gpnum);

- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
+ /* MCS style queue 'lock' */
+ prev = xchg(&rsp->expedited_queue, &entry);
+ if (prev) {
+ WRITE_ONCE(prev->next, &entry);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (smp_load_acquire(&entry.done))
+ break;
+ schedule();
}
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ __set_current_state(TASK_RUNNING);
}

/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * Check to see if someone else did our work for us, while we were
+ * waiting on the queue.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
-
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ if (ULONG_CMP_LT(gpnum, smp_load_acquire(&rsp->completed)))
+ goto unlock;

- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
+
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&ess.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ &ess, &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
+ atomic_dec(&ess.count);

-all_cpus_idle:
- free_cpumask_var(cm);
+ wait_event(stop_wait, !atomic_read(&ess.count));

- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ /* MCS style queue 'unlock' */
+ next = READ_ONCE(entry.next);
+ if (!next) {
+ if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+ goto done;
+ while (!(next = READ_ONCE(entry.next)))
+ cpu_relax();
+ }
+ smp_store_release(&next->done, 1);

+done:
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,17 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */

- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ void *expedited_queue;

unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
Peter Zijlstra
2015-06-24 17:32:22 UTC
Permalink
Post by Peter Zijlstra
+ /* MCS style queue 'unlock' */
+ next = READ_ONCE(entry.next);
+ if (!next) {
+ if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+ goto done;
+ while (!(next = READ_ONCE(entry.next)))
+ cpu_relax();
+ }
+ smp_store_release(&next->done, 1);
Do you suppose:

wake_up_process(next->task);

would help? :-)
Post by Peter Zijlstra
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
Peter Zijlstra
2015-06-24 18:15:01 UTC
Permalink
Post by Peter Zijlstra
How about something like this: it replaces the mutex and start/done ticket
thing with an MCS-style lockless FIFO queue.
It further uses the gpnum/completed thing to short-circuit things if
we've waited long enough.
Prettier version

--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,23 +3253,41 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);

+struct expedited_task_state {
+ struct expedited_task_state *next;
+ struct task_struct *task;
+ atomic_t count;
+ int done;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct expedited_task_state *ets = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ets->count))
+ wake_up_process(ets->task);
+
return 0;
}

+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
+#define current_wait(cond) \
+do { \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ schedule(); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,138 +3322,71 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ struct expedited_task_state *prev, *next, entry = {
+ .task = current,
+ .count = ATOMIC_INIT(1), /* avoid spurious wakeups */
+ };
+ long gpnum;
+ int cpu;

- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
-
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
return;
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ smp_mb();
+ gpnum = smp_load_acquire(&rsp->gpnum);
+
+ /* MCS style queue 'lock' */
+ prev = xchg(&rsp->expedited_queue, &entry);
+ if (prev) {
+ WRITE_ONCE(prev->next, &entry);
+ current_wait(smp_load_acquire(&entry.done));
}

/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * Check to see if someone else did our work for us, while we were
+ * waiting on the queue.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
-
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
-
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ if (ULONG_CMP_LT(gpnum, smp_load_acquire(&rsp->completed)))
+ goto unlock;
+
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
+
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&entry.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ &entry, &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);

-all_cpus_idle:
- free_cpumask_var(cm);
+ atomic_dec(&entry.count); /* let the wakeups in */
+ current_wait(!atomic_read(&entry.count));

- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ /* MCS style queue 'unlock' */
+ next = READ_ONCE(entry.next);
+ if (!next) {
+ if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+ goto done;
+ while (!(next = READ_ONCE(entry.next)))
+ cpu_relax();
+ }
+ smp_store_release(&next->done, 1);
+ wake_up_process(next->task);

+done:
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,17 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */

- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ void *expedited_queue;

unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
Peter Zijlstra
2015-06-24 17:58:49 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
The thing is, once you start bailing on this condition your 'queue'
drains very fast and this is around the same time sync_rcu() would've
released the waiters too.
In my experience, this sort of thing simply melts down on large systems.
I am reworking this with multiple locks so as to keep the large-system
contention down to a dull roar.
So with the MCS queue we've got less global thrashing than you had with
the start/done tickets. Only the queue head on enqueue.

Paul E. McKenney
2015-06-25 03:23:49 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
The thing is, once you start bailing on this condition your 'queue'
drains very fast and this is around the same time sync_rcu() would've
released the waiters too.
In my experience, this sort of thing simply melts down on large systems.
I am reworking this with multiple locks so as to keep the large-system
contention down to a dull roar.
So with the MCS queue we've got less global thrashing than you had with
the start/done tickets. Only the queue head on enqueue.
Here is what I had in mind, where you don't have any global thrashing
except when the ->expedited_sequence gets updated. Passes mild rcutorture
testing.

Still needs asynchronous CPU stoppage and stall warnings and trace
documentation updates. Plus fixes for whatever bugs show up.

Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78d0a87ff354..887370b7e52a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");

static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];

/*
* In order to export the rcu_state name to the tracing tools, it
@@ -3323,6 +3324,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
return 0;
}

+/* Common code for synchronize_sched_expedited() work-done checking. */
+static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp,
+ atomic_long_t *stat, unsigned long s)
+{
+ if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ put_online_cpus();
+ return true;
+ }
+ return false;
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3334,58 +3351,24 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ long s;
struct rcu_state *rsp = &rcu_sched_state;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;

- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
+ /* Take a snapshot of the sequence number. */
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */

- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3394,100 +3377,47 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
+ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
return;
- }
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
+ if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
+ return;

- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
+ smp_mb(); /* Ensure expedited GP seen after counter increment. */
+ WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1));

- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-
-all_cpus_idle:
- free_cpumask_var(cm);

- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ smp_mb(); /* Ensure expedited GP seen before counter increment. */
+ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
+ WARN_ON_ONCE(rsp->expedited_sequence & 0x1);
+ mutex_unlock(&rnp0->exp_funnel_mutex);
+ smp_mb(); /* ensure subsequent action seen after grace period. */

put_online_cpus();
}
@@ -4043,6 +3973,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static const char * const exp[] = RCU_EXP_NAME_INIT;
static u8 fl_mask = 0x1;

int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4101,6 +4032,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ mutex_init(&rnp->exp_funnel_mutex);
+ lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+ &rcu_exp_class[i], exp[i]);
}
}

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de22d6d06bf9..f0f4dd96dd73 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -68,6 +68,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
@@ -76,6 +77,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
@@ -85,6 +87,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
@@ -95,6 +98,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -237,6 +241,8 @@ struct rcu_node {
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;

/*
@@ -478,17 +484,11 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */

- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
+ unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_tryfail; /* # acquisition failures. */
atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */

unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..d2aab8dcd58e 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,18 +185,13 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;

- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
+ seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
+ rsp->expedited_sequence,
atomic_long_read(&rsp->expedited_tryfail),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
+ rsp->expedited_sequence / 2);
return 0;
}


Peter Zijlstra
2015-06-25 11:07:57 UTC
Permalink
Post by Paul E. McKenney
Here is what I had in mind, where you don't have any global thrashing
except when the ->expedited_sequence gets updated. Passes mild rcutorture
testing.
/*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
*/
+ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
return;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
+ if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
+ return;
I'm still somewhat confused by the whole strict order sequence vs this
non ordered 'polling' of global state.

This funnel thing basically waits random times depending on the
contention of these mutexes and tries again. Ultimately serializing on
the root funnel thing.

So on the one hand you have to strictly order these expedited callers,
but then you don't want to actually process them in order. If 'by magic'
you manage to process the 3rd in queue, you can drop the 2nd because it
will have waited long enough. OTOH the 2nd will have waited too long.

You also do not take the actual RCU state machine into account -- this
is a parallel state.

Can't we integrate the force quiescent state machinery with the
expedited machinery -- that is instead of building a parallel state, use
the expedited thing to push the regular machine forward?

We can use the stop_machine calls to force the local RCU state forward,
after all, we _know_ we just made a context switch into the stopper
thread. All we need to do is disable interrupts to hold off the tick
(which normally drives the state machine) and just unconditionally
advance our state.

If we use the regular GP machinery, you also don't have to strongly
order the callers, just stick them on whatever GP was active when they
came in and let them roll, this allows much better (and more natural)
concurrent processing.
Paul E. McKenney
2015-06-25 13:48:14 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Here is what I had in mind, where you don't have any global thrashing
except when the ->expedited_sequence gets updated. Passes mild rcutorture
testing.
/*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
*/
+ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
return;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
+ if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
+ return;
I'm still somewhat confused by the whole strict order sequence vs this
non ordered 'polling' of global state.
This funnel thing basically waits random times depending on the
contention of these mutexes and tries again. Ultimately serializing on
the root funnel thing.
Not random at all!

The whole funnel is controlled by the root ->exp_funnel_mutex holder,
who is going to hold the lock for a single expedited grace period, then
release it. This means that any time a task acquires a lock, there is
very likely to have been a recent state change. Hence the checks after
each lock acquisition.

So in the heavy-use case, what tends to happen is that there are one
or two expedited grace periods, and then the entire queue of waiters
acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
the expedited grace period whose completion resulted in their acquisition
completing and thus them being awakened. No fuss, no muss, no unnecessary
contention or cache thrashing.
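Concretely, that is this loop from the patch upthread (s is the sequence
snapshot taken earlier; comments added here for illustration):

	rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
		/* An expedited GP covering our snapshot may have completed. */
		if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
			return;		/* it did: rnp1's mutex is dropped for us */
		mutex_lock(&rnp0->exp_funnel_mutex);	/* funnel up one level */
		if (rnp1)
			mutex_unlock(&rnp1->exp_funnel_mutex);	/* release the level below */
		rnp1 = rnp0;
	}
	rnp0 = rnp1;	/* now holding the root ->exp_funnel_mutex */
	if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
		return;
	/* otherwise this task runs the expedited grace period itself */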
Post by Peter Zijlstra
So on the one hand you have to strictly order these expedited callers,
but then you don't want to actually process them in order. If 'by magic'
you manage to process the 3rd in queue, you can drop the 2nd because it
will have waited long enough. OTOH the 2nd will have waited too long.
Let's take the example of a 4096-CPU system with default configuration of
CONFIG_RCU_FANOUT=64 and CONFIG_RCU_FANOUT_LEAF=16. There will then be
256 leaf rcu_node structures, each of which is subordinate to one of four
internal rcu_node structures, each of which is subordinate to the root
rcu_node structure. There can then be up to 260 tasks waiting on non-root
rcu_node ->exp_funnel_mutex, with an additional task holding the root
rcu_node ->exp_funnel_mutex and carrying out an expedited grace period.
Once that grace period completes, one of the tasks holding an internal
->exp_funnel_mutex acquires the root ->exp_funnel_mutex. If it can use
the just-completed grace period, it releases its ->exp_funnel_mutex,
and the cycle repeats, until the queue drains. If not, then it will
carry out another grace period, perhaps making some of the queue wait
unnecessarily -- but that can happen in the strictly queued case as well,
due to delays between snapshotting the counter and getting on the queue.
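Spelling out the arithmetic behind those numbers:

	4096 CPUs / 16 (CONFIG_RCU_FANOUT_LEAF)  = 256 leaf rcu_node structures
	 256 leaves / 64 (CONFIG_RCU_FANOUT)     =   4 internal rcu_node structures
	   4 internal nodes                      ->  1 root rcu_node structure

	up to 256 + 4 = 260 tasks queued on non-root ->exp_funnel_mutex,
	plus the one task holding the root ->exp_funnel_mutex and carrying
	out the expedited grace period.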

The key advantage of the funnel approach is that many tasks can be
concurrently discovering that the grace period they need has already
happened.

Of course, if there are more than 260 tasks queued, the excess tasks will
queue on the leaf ->exp_funnel_mutex mutexes. But they will eventually
start draining 256 at a time, in parallel.

And nothing comes for free. In an idle system, the single task wanting
an expedited grace period must work its way up the rcu_node tree. In
the 4096-CPU case with default configuration, it must acquire three
uncontended mutexes. But this is way down in the noise compared to
the 4095 cache misses required to determine that all the rest of the
CPUs are idle. So the funnel approach is a good tradeoff.
Post by Peter Zijlstra
You also do not take the actual RCU state machine into account -- this
is a parallel state.
Can't we integrate the force quiescent state machinery with the
expedited machinery -- that is instead of building a parallel state, use
the expedited thing to push the regular machine forward?
We can use the stop_machine calls to force the local RCU state forward,
after all, we _know_ we just made a context switch into the stopper
thread. All we need to do is disable interrupts to hold off the tick
(which normally drives the state machine) and just unconditionally
advance our state.
If we use the regular GP machinery, you also don't have to strongly
order the callers, just stick them on whatever GP was active when they
came in and let them roll, this allows much better (and more natural)
concurrent processing.
That gets quite complex, actually. Lots of races with the normal grace
periods doing one thing or another.

However, it should be quite easy to go the other way and make the normal
grace-period processing take advantage of expedited grace periods that
happened to occur at the right time. I will look into this, thank you
for the nudge!

Thanx, Paul

Peter Zijlstra
2015-06-25 14:20:34 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
I'm still somewhat confused by the whole strict order sequence vs this
non ordered 'polling' of global state.
This funnel thing basically waits random times depending on the
contention of these mutexes and tries again. Ultimately serializing on
the root funnel thing.
Not random at all!
No, they are random per definition: it depends on the amount of
contention, and since that's random, the rest is too.
Post by Paul E. McKenney
The whole funnel is controlled by the root ->exp_funnel_mutex holder,
who is going to hold the lock for a single expedited grace period, then
release it. This means that any time a task acquires a lock, there is
very likely to have been a recent state change. Hence the checks after
each lock acquisition.
So in the heavy-use case, what tends to happen is that there are one
or two expedited grace periods, and then the entire queue of waiters
acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
the expedited grace period whose completion resulted in their acquisition
completing and thus them being awakened. No fuss, no muss, no unnecessary
contention or cache thrashing.
Plenty of cache thrashing, since your 'tree' is not at all cache aligned
or even remotely coherent with the actual machine topology -- I'll keep
reminding you :-)

But I must admit that the workings of the sequence thing eluded me this
morning. Yes that's much better than the strict ticket order of before.
Post by Paul E. McKenney
Post by Peter Zijlstra
You also do not take the actual RCU state machine into account -- this
is a parallel state.
Can't we integrate the force quiescent state machinery with the
expedited machinery -- that is instead of building a parallel state, use
the expedited thing to push the regular machine forward?
We can use the stop_machine calls to force the local RCU state forward,
after all, we _know_ we just made a context switch into the stopper
thread. All we need to do is disable interrupts to hold off the tick
(which normally drives the state machine) and just unconditionally
advance our state.
If we use the regular GP machinery, you also don't have to strongly
order the callers, just stick them on whatever GP was active when they
came in and let them roll, this allows much better (and more natural)
concurrent processing.
That gets quite complex, actually. Lots of races with the normal grace
periods doing one thing or another.
How so? I'm probably missing several years of RCU trickery and detail
again, but since we can advance from the tick, we should be able to
advance from the stop work with IRQs disabled with equal ease.

And since the stop work and the tick are fully serialized, there cannot
be any races there.

And the stop work against other CPUs is the exact same races you already
had with tick vs tick.

So please humour me and explain how all this is far more complicated ;-)
Post by Paul E. McKenney
However, it should be quite easy to go the other way and make the normal
grace-period processing take advantage of expedited grace periods that
happened to occur at the right time. I will look into this, thank you
for the nudge!
That should already be happening, right? Since we force context
switches, the tick driven RCU state machine will observe those and make
progress -- assuming it was trying to make progress at all of course.
Paul E. McKenney
2015-06-25 14:52:06 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I'm still somewhat confused by the whole strict order sequence vs this
non ordered 'polling' of global state.
This funnel thing basically waits random times depending on the
contention of these mutexes and tries again. Ultimately serializing on
the root funnel thing.
Not random at all!
No, they are random per definition: it depends on the amount of
contention, and since that's random, the rest is too.
Not sure how to parse this one. ;-)
Post by Peter Zijlstra
Post by Paul E. McKenney
The whole funnel is controlled by the root ->exp_funnel_mutex holder,
who is going to hold the lock for a single expedited grace period, then
release it. This means that any time a task acquires a lock, there is
very likely to have been a recent state change. Hence the checks after
each lock acquisition.
So in the heavy-use case, what tends to happen is that there are one
or two expedited grace periods, and then the entire queue of waiters
acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
the expedited grace period whose completion resulted in their acquisition
completing and thus them being awakened. No fuss, no muss, no unnecessary
contention or cache thrashing.
Plenty of cache thrashing, since your 'tree' is not at all cache aligned
or even remotely coherent with the actual machine topology -- I'll keep
reminding you :-)
And, as I keep reminding you, if you actually show me system-level data
demonstrating that this is a real problem, I might consider taking some
action. And also reminding you that in the meantime, you can experiment
by setting the fanout sizes to match a given system and see if it makes
any visible difference. (Yes, I do understand the odd numbering of
hyperthreads, but you can still run a reasonable experiment.)
Post by Peter Zijlstra
But I must admit that the workings of the sequence thing eluded me this
morning. Yes that's much better than the strict ticket order of before.
OK, good!
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
You also do not take the actual RCU state machine into account -- this
is a parallel state.
Can't we integrate the force quiescent state machinery with the
expedited machinery -- that is instead of building a parallel state, use
the expedited thing to push the regular machine forward?
We can use the stop_machine calls to force the local RCU state forward,
after all, we _know_ we just made a context switch into the stopper
thread. All we need to do is disable interrupts to hold off the tick
(which normally drives the state machine) and just unconditionally
advance our state.
If we use the regular GP machinery, you also don't have to strongly
order the callers, just stick them on whatever GP was active when they
came in and let them roll, this allows much better (and more natural)
concurrent processing.
That gets quite complex, actually. Lots of races with the normal grace
periods doing one thing or another.
How so? I'm probably missing several years of RCU trickery and detail
again, but since we can advance from the tick, we should be able to
advance from the stop work with IRQs disabled with equal ease.
And since the stop work and the tick are fully serialized, there cannot
be any races there.
And the stop work against other CPUs is the exact same races you already
had with tick vs tick.
So please humour me and explain how all this is far more complicated ;-)
Yeah, I do need to get RCU design/implementation documentation put together.

In the meantime, RCU's normal grace-period machinery is designed to be
quite loosely coupled. The idea is that almost all actions occur locally,
reducing contention and cache thrashing. But an expedited grace period
needs tight coupling in order to be able to complete quickly. Making
something that switches between loose and tight coupling in short order
is not at all simple.
Post by Peter Zijlstra
Post by Paul E. McKenney
However, it should be quite easy to go the other way and make the normal
grace-period processing take advantage of expedited grace periods that
happened to occur at the right time. I will look into this, thank you
for the nudge!
That should already be happening, right? Since we force context
switches, the tick driven RCU state machine will observe those and make
progress -- assuming it was trying to make progress at all of course.
It is to an extent, but I believe that I can do better. On the other hand,
it is quite possible that this is a 6AM delusion on my part. ;-)

If it is not a delusion, the eventual solution will likely be a much more
satisfying answer to your "why not merge into the normal RCU grace period
machinery" question. But I need to complete reworking the expedited
machinery first!

Thanx, Paul

Peter Zijlstra
2015-06-26 12:32:35 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
So please humour me and explain how all this is far more complicated ;-)
Yeah, I do need to get RCU design/implementation documentation put together.
In the meantime, RCU's normal grace-period machinery is designed to be
quite loosely coupled. The idea is that almost all actions occur locally,
reducing contention and cache thrashing. But an expedited grace period
needs tight coupling in order to be able to complete quickly. Making
something that switches between loose and tight coupling in short order
is not at all simple.
But expedited just means faster, we never promised that
sync_rcu_expedited is the absolute fastest primitive ever.

So I really should go read the RCU code I suppose, but I don't get
what's wrong with starting a forced quiescent state, then doing the
stop_work spray, where each work will run the regular RCU tick thing to
push it forwards.

From my feeble memories, what I remember is that the last cpu to
complete a GP on a leaf node will push the completion up to the next
level, until at last we've reached the root of your tree and we can
complete the GP globally.

To me it just makes more sense to have a single RCU state machine. With
expedited we'll push it as fast as we can, but no faster.
Paul E. McKenney
2015-06-26 16:14:46 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
So please humour me and explain how all this is far more complicated ;-)
Yeah, I do need to get RCU design/implementation documentation put together.
In the meantime, RCU's normal grace-period machinery is designed to be
quite loosely coupled. The idea is that almost all actions occur locally,
reducing contention and cache thrashing. But an expedited grace period
needs tight coupling in order to be able to complete quickly. Making
something that switches between loose and tight coupling in short order
is not at all simple.
But expedited just means faster, we never promised that
sync_rcu_expedited is the absolute fastest primitive ever.
Which is good, because given that it is doing something to each and
every CPU, it most assuredly won't in any way resemble the absolute
fastest primitive ever. ;-)
Post by Peter Zijlstra
So I really should go read the RCU code I suppose, but I don't get
what's wrong with starting a forced quiescent state, then doing the
stop_work spray, where each work will run the regular RCU tick thing to
push it forwards.
From my feeble memories, what I remember is that the last cpu to
complete a GP on a leaf node will push the completion up to the next
level, until at last we've reached the root of your tree and we can
complete the GP globally.
That is true, the task that notices the last required quiescent state
will push up the tree and notice that the grace period has ended.
If that task is not the grace-period kthread, it will then awaken
the grace-period kthread.
Post by Peter Zijlstra
To me it just makes more sense to have a single RCU state machine. With
expedited we'll push it as fast as we can, but no faster.
Suppose that someone invokes synchronize_sched_expedited(), but there
is no normal grace period in flight. Then each CPU will note its own
quiescent state, but when it later might have tried to push it up the
tree, it will see that there is no grace period in effect, and will
therefore not bother.

OK, we could have synchronize_sched_expedited() tell the grace-period
kthread to start a grace period if one was not already in progress.
But that still isn't good enough, because the grace-period kthread will
take some time to initialize the new grace period, and if we hammer all
the CPUs before the initialization is complete, the resulting quiescent
states cannot be counted against the new grace period. (The reason for
this is that there is some delay between the actual quiescent state
and the time that it is reported, so we have to be very careful not
to incorrectly report a quiescent state from an earlier grace period
against the current grace period.)
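
To make that ordering hazard concrete, here is a hedged sketch (the names
cpu_gp_info, gp_noticed and report_qs() are illustrative only, not the
kernel's actual fields or helpers) of why a quiescent state has to be tagged
with the grace period the CPU had noticed when the QS happened:

	/* Illustrative only: a QS may only be counted against the GP the CPU had noticed. */
	struct cpu_gp_info {
		unsigned long	gp_noticed;	/* GP number this CPU has seen start */
		bool		qs_pending;	/* QS still owed for that GP */
	};

	static void report_qs(struct cpu_gp_info *ci, unsigned long current_gp)
	{
		if (ci->gp_noticed != current_gp)
			return;		/* QS belongs to an older GP: must not be reported */
		if (ci->qs_pending) {
			ci->qs_pending = false;
			/* ... push the report up the combining tree ... */
		}
	}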

OK, the grace-period kthread could tell synchronize_sched_expedited()
when it has finished initializing the grace period, though this is
starting to get a bit on the Rube Goldberg side. But this -still- is
not good enough, because even though the grace-period kthread has fully
initialized the new grace period, the individual CPUs are unaware of it.
And they will therefore continue to ignore any quiescent state that they
encounter, because they cannot prove that it actually happened after
the start of the current grace period.

OK, we could have some sort of indication when all CPUs become aware
of the new grace period by having them atomically manipulate a global
counter. Presumably we have some flag indicating when this is and is
not needed so that we avoid the killer memory contention in the common
case where it is not needed. But this -still- isn't good enough, because
idle CPUs never will become aware of the new grace period -- by design,
as they are supposed to be able to sleep through an arbitrary number of
grace periods.

OK, so we could have some sort of indication when all non-idle CPUs
become aware of the new grace period. But there could be races where
an idle CPU suddenly becomes non-idle just after it was reported that
all non-idle CPUs were aware of the grace period. This would result
in a hang, because this newly non-idle CPU might not have noticed
the new grace period at the time that synchronize_sched_expedited()
hammers it, which would mean that this newly non-idle CPU would refuse
to report the resulting quiescent state.

OK, so the grace-period kthread could track and report the set of CPUs
that had ever been idle since synchronize_sched_expedited() contacted it.
But holy overhead Batman!!!

And that is just one of the possible interactions with the grace-period
kthread. It might be in the middle of setting up a new grace period.
It might be in the middle of cleaning up after the last grace period.
It might be waiting for a grace period to complete, and the last quiescent
state was just reported, but hasn't propagated all the way up yet. All
of these would need to be handled correctly, and a number of them would
be as messy as the above scenario. Some might be even more messy.

I feel like there is a much easier way, but cannot yet articulate it.
I came across a couple of complications and a blind alley with it thus
far, but it still looks promising. I expect to be able to generate
actual code for it within a few days, but right now it is just weird
abstract shapes in my head. (Sorry, if I knew how to describe them,
I could just write the code! When I do write the code, it will probably
seem obvious and trivial, that being the usual outcome...)

Thanx, Paul

Peter Zijlstra
2015-06-29 07:57:10 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
To me it just makes more sense to have a single RCU state machine. With
expedited we'll push it as fast as we can, but no faster.
Suppose that someone invokes synchronize_sched_expedited(), but there
is no normal grace period in flight. Then each CPU will note its own
quiescent state, but when it later might have tried to push it up the
tree, it will see that there is no grace period in effect, and will
therefore not bother.
Right, I did mention the force grace period machinery to make sure we
start one before poking :-)
Post by Paul E. McKenney
OK, we could have synchronize_sched_expedited() tell the grace-period
kthread to start a grace period if one was not already in progress.
I had indeed forgotten that got farmed out to the kthread; on which, my
poor desktop seems to have spent ~140 minutes of its (most recent)
existence poking RCU things.

7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7

Which is almost as much time as my konsole:

2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole

Which seems somewhat excessive. But who knows.
Post by Paul E. McKenney
OK, the grace-period kthread could tell synchronize_sched_expedited()
when it has finished initializing the grace period, though this is
starting to get a bit on the Rube Goldberg side. But this -still- is
not good enough, because even though the grace-period kthread has fully
initialized the new grace period, the individual CPUs are unaware of it.
Right, so over the weekend -- I had postponed reading this rather long
email for I was knackered -- I had figured that because we trickle the
GP completion up, you probably equally trickle the GP start down of
sorts and there might be 'interesting' things there.
Post by Paul E. McKenney
And they will therefore continue to ignore any quiescent state that they
encounter, because they cannot prove that it actually happened after
the start of the current grace period.
Right, badness :-)

Although here I'll once again go ahead and say something ignorant; how
come that's a problem? Surely if we know the kthread thing has finished
starting a GP, any one CPU issuing a full memory barrier (as would be
implied by switching to the stop worker) must then indeed observe that
global state? due to that transitivity thing.

That is, I'm having a wee bit of bother for seeing how you'd need
manipulation of global variables as you allude to below.
Post by Paul E. McKenney
But this -still- isn't good enough, because
idle CPUs never will become aware of the new grace period -- by design,
as they are supposed to be able to sleep through an arbitrary number of
grace periods.
Yes, I'm sure. Waking up seems like a serializing experience though; but
I suppose that's not good enough if we wake up right before we force
start the GP.
Post by Paul E. McKenney
I feel like there is a much easier way, but cannot yet articulate it.
I came across a couple of complications and a blind alley with it thus
far, but it still looks promising. I expect to be able to generate
actual code for it within a few days, but right now it is just weird
abstract shapes in my head. (Sorry, if I knew how to describe them,
I could just write the code! When I do write the code, it will probably
seem obvious and trivial, that being the usual outcome...)
Hehe, glad to have been of help :-)
Paul E. McKenney
2015-06-30 21:33:20 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
To me it just makes more sense to have a single RCU state machine. With
expedited we'll push it as fast as we can, but no faster.
Suppose that someone invokes synchronize_sched_expedited(), but there
is no normal grace period in flight. Then each CPU will note its own
quiescent state, but when it later might have tried to push it up the
tree, it will see that there is no grace period in effect, and will
therefore not bother.
Right, I did mention the force grace period machinery to make sure we
start one before poking :-)
Fair enough...
Post by Peter Zijlstra
Post by Paul E. McKenney
OK, we could have synchronize_sched_expedited() tell the grace-period
kthread to start a grace period if one was not already in progress.
I had indeed forgotten that got farmed out to the kthread; on which, my
poor desktop seems to have spent ~140 minutes of its (most recent)
existence poking RCU things.
7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
Which seems somewhat excessive. But who knows.
No idea. How long has that system been up? What has it been doing?

The rcu_sched overhead is expected behavior if the system has run between
ten and one hundred million grace periods, give or take an order of
magnitude depending on the number of idle CPUs and so on.

The overhead for the RCU offload kthreads is what it is. A kfree() takes
as much time as a kfree does, and they are all nicely counted up for you.
Post by Peter Zijlstra
Post by Paul E. McKenney
OK, the grace-period kthread could tell synchronize_sched_expedited()
when it has finished initializing the grace period, though this is
starting to get a bit on the Rube Goldberg side. But this -still- is
not good enough, because even though the grace-period kthread has fully
initialized the new grace period, the individual CPUs are unaware of it.
Right, so over the weekend -- I had postponed reading this rather long
email for I was knackered -- I had figured that because we trickle the
GP completion up, you probably equally trickle the GP start down of
sorts and there might be 'interesting' things there.
The GP completion trickles both up and down, though the down part shouldn't
matter in this case.
Post by Peter Zijlstra
Post by Paul E. McKenney
And they will therefore continue to ignore any quiescent state that they
encounter, because they cannot prove that it actually happened after
the start of the current grace period.
Right, badness :-)
Although here I'll once again go ahead and say something ignorant; how
come that's a problem? Surely if we know the kthread thing has finished
starting a GP, any one CPU issuing a full memory barrier (as would be
implied by switching to the stop worker) must then indeed observe that
global state? due to that transitivity thing.
That is, I'm having a wee bit of bother for seeing how you'd need
manipulation of global variables as you allude to below.
Well, I thought that you wanted to leverage the combining tree to
determine when the grace period had completed. If a given CPU isn't
pushing its quiescent states up the combining tree, then the combining
tree can't do much for you.
Post by Peter Zijlstra
Post by Paul E. McKenney
But this -still- isn't good enough, because
idle CPUs never will become aware of the new grace period -- by design,
as they are supposed to be able to sleep through an arbitrary number of
grace periods.
Yes, I'm sure. Waking up seems like a serializing experience though; but
I suppose that's not good enough if we wake up right before we force
start the GP.
That would indeed be one of the problems that could occur. ;-)
Post by Peter Zijlstra
Post by Paul E. McKenney
I feel like there is a much easier way, but cannot yet articulate it.
I came across a couple of complications and a blind alley with it thus
far, but it still looks promising. I expect to be able to generate
actual code for it within a few days, but right now it is just weird
abstract shapes in my head. (Sorry, if I knew how to describe them,
I could just write the code! When I do write the code, it will probably
seem obvious and trivial, that being the usual outcome...)
Hehe, glad to have been of help :-)
Well, I do have something that seems reasonably straightforward. Sending
the patches along separately. Not sure that it is worth its weight.

The idea is that we keep the expedited grace periods working as they do
now, independently of the normal grace period. The normal grace period
takes a sequence number just after initialization, and checks to see
if an expedited grace period happened in the meantime at the beginning
of each quiescent-state forcing episode. This saves the last one or
two quiescent-state forcing scans in the case where an expedited grace
period really did happen.
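
A rough sketch of that check, with made-up names throughout (rcu_exp_seq,
gp_state, gp_take_snapshot() and do_force_qs_scan() stand in for whatever the
actual patches use):

	#include <linux/atomic.h>

	/* Hedged sketch, not the posted patches: a counter is bumped once per
	 * completed expedited GP; the normal GP snapshots it right after
	 * initialization and rechecks it at each quiescent-state-forcing episode. */
	static atomic_long_t rcu_exp_seq;

	struct gp_state {
		long	exp_snap;	/* snapshot taken just after GP init */
	};

	static bool do_force_qs_scan(struct gp_state *gp);	/* the usual scan (hypothetical) */

	static void gp_take_snapshot(struct gp_state *gp)
	{
		gp->exp_snap = atomic_long_read(&rcu_exp_seq);
	}

	static bool fqs_episode(struct gp_state *gp)
	{
		/*
		 * If an expedited GP both started and completed since the
		 * snapshot, every CPU passed through a quiescent state after
		 * this normal GP began, so the remaining forcing scans can be
		 * skipped.  The real code needs an even/odd sequence scheme to
		 * exclude an expedited GP already in flight at snapshot time.
		 */
		if (atomic_long_read(&rcu_exp_seq) != gp->exp_snap)
			return true;

		return do_force_qs_scan(gp);
	}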

It is possible for the expedited grace period to help things along by
waking up the grace-period kthread, but of course doing this too much
further increases the time consumed by your rcu_sched kthread. It is
possible to compromise by only doing the wakeup every so many grace
periods or only once per a given period of time, which is the approach
the last patch in the series takes.
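
For the "once per a given period of time" variant, the throttling could be as
simple as the sketch below (invented names; the real patch may well do it
differently):

	#include <linux/jiffies.h>
	#include <linux/sched.h>

	/* Hedged sketch: the expedited path wakes the GP kthread at most every
	 * HZ/10, so it cannot add much to rcu_sched's CPU consumption. */
	static unsigned long last_exp_wakeup;

	static void exp_maybe_wake_gp_kthread(struct task_struct *gp_kthread)
	{
		unsigned long now = jiffies;

		/* Races on last_exp_wakeup are benign: at worst one extra wakeup. */
		if (time_before(now, last_exp_wakeup + HZ / 10))
			return;

		last_exp_wakeup = now;
		wake_up_process(gp_kthread);
	}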

I will be sending the series shortly, followed by a series for the
other portions of the expedited grace-period upgrade.

Thanx, Paul

Peter Zijlstra
2015-07-01 11:57:01 UTC
Permalink
Post by Paul E. McKenney
Post by Peter Zijlstra
I had indeed forgotten that got farmed out to the kthread; on which, my
poor desktop seems to have spent ~140 minutes of its (most recent)
existence poking RCU things.
7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
Which seems somewhat excessive. But who knows.
No idea. How long has that system been up? What has it been doing?
Some 40-odd days it seems. It's my desktop, I read email (in mutt in
Konsole), I type patches (in vim in Konsole), I compile kernels (in
Konsole) etc..

Now konsole is threaded and each new window/tab is just another thread
in the same process so runtime should accumulate. However I just found
that for some obscure reason there's two konsole processes around, and
the other is the one that I'm using most, it also has significantly more
runtime.

3264 ? Sl 452:43 \_ /usr/bin/konsole

Must be some of that brain damaged desktop shite that confused things --
I see the one is started with some -session argument. Some day I'll
discover how to destroy all that nonsense and make things behave as they
should.
Post by Paul E. McKenney
The rcu_sched overhead is expected behavior if the system has run between
ten and one hundred million grace periods, give or take an order of
magnitude depending on the number of idle CPUs and so on.
The overhead for the RCU offload kthreads is what it is. A kfree() takes
as much time as a kfree does, and they are all nicely counted up for you.
Yah, if only we could account it back to whomever caused it :/
Post by Paul E. McKenney
Post by Peter Zijlstra
Although here I'll once again go ahead and say something ignorant; how
come that's a problem? Surely if we know the kthread thing has finished
starting a GP, any one CPU issuing a full memory barrier (as would be
implied by switching to the stop worker) must then indeed observe that
global state? due to that transitivity thing.
That is, I'm having a wee bit of bother for seeing how you'd need
manipulation of global variables as you allude to below.
Well, I thought that you wanted to leverage the combining tree to
determine when the grace period had completed. If a given CPU isn't
pushing its quiescent states up the combining tree, then the combining
tree can't do much for you.
Right that is what I wanted, and sure the combining thing needs to
happen with atomics, but that's not new, it already does that.

What I was talking about was the interaction between the force-quiescent-state
machinery and the poking detecting that a QS had indeed been started.
Post by Paul E. McKenney
Well, I do have something that seems reasonably straightforward. Sending
the patches along separately. Not sure that it is worth its weight.
The idea is that we keep the expedited grace periods working as they do
now, independently of the normal grace period. The normal grace period
takes a sequence number just after initialization, and checks to see
if an expedited grace period happened in the meantime at the beginning
of each quiescent-state forcing episode. This saves the last one or
two quiescent-state forcing scans in the case where an expedited grace
period really did happen.
It is possible for the expedited grace period to help things along by
waking up the grace-period kthread, but of course doing this too much
further increases the time consumed by your rcu_sched kthread.
Ah so that is the purpose of that patch. Still, I'm having trouble
seeing how you can do this too much, you would only be waking it if
there was a GP pending completion, right? At which point waking it is
the right thing.

If you wake it unconditionally, even if there's nothing to do, then yes
that'd be a waste of cycles.
Paul E. McKenney
2015-07-01 15:57:12 UTC
Permalink
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
I had indeed forgotten that got farmed out to the kthread; on which, my
poor desktop seems to have spent ~140 minutes of its (most recent)
existence poking RCU things.
7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
Which seems somewhat excessive. But who knows.
No idea. How long has that system been up? What has it been doing?
Some 40-odd days it seems. It's my desktop, I read email (in mutt in
Konsole), I type patches (in vim in Konsole), I compile kernels (in
Konsole) etc..
Now konsole is threaded and each new window/tab is just another thread
in the same process so runtime should accumulate. However I just found
that for some obscure reason there's two konsole processes around, and
the other is the one that I'm using most, it also has significantly more
runtime.
3264 ? Sl 452:43 \_ /usr/bin/konsole
Must be some of that brain damaged desktop shite that confused things --
I see the one is started with some -session argument. Some day I'll
discover how to destroy all that nonsense and make things behave as they
should.
Well, you appear to be using about 6% of a CPU, or 0.7% of the entire
8-CPU system for the RCU GP kthread. That is more than I would like to
see consumed.

Odd that you have four of eight of the rcuos CPUs with higher consumption
than the others. I would expect three of eight. Are you by chance running
an eight-core system with hyperthreading disabled in hardware, via boot
parameter, or via explicit offline? The real question I have is "is
nr_cpu_ids equal to 16 rather than to 8?"

A significant fraction of rcu_sched's CPU overhead is likely due to that
extra wakeup for the fourth leader rcuos kthread.

Also, do you have nohz_full set? Just wondering why callback offloading
is enabled. (If you want it enabled, fine, but from what I can see your
workload isn't being helped by it and it does have higher overhead.)

Even if you don't want offloading and do disable it, it would be good to
reduce the penalty. Is there something I can do to reduce the overhead
of waking several kthreads? Right now, I just do a series of wake_up()
calls, one for each leader rcuos kthread.

Oh, are you running v3.10 or some such? If so, there are some more
recent RCU changes that can help with this. They are called out here:

http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf
Post by Peter Zijlstra
Post by Paul E. McKenney
The rcu_sched overhead is expected behavior if the system has run between
ten and one hundred million grace periods, give or take an order of
magnitude depending on the number of idle CPUs and so on.
The overhead for the RCU offload kthreads is what it is. A kfree() takes
as much time as a kfree does, and they are all nicely counted up for you.
Yah, if only we could account it back to whomever caused it :/
It could be done, but would require increasing the size of rcu_head.
And would require costly fine-grained timing of callback execution.
Not something for production systems, I would guess.
Post by Peter Zijlstra
Post by Paul E. McKenney
Post by Peter Zijlstra
Although here I'll once again go ahead and say something ignorant; how
come that's a problem? Surely if we know the kthread thing has finished
starting a GP, any one CPU issuing a full memory barrier (as would be
implied by switching to the stop worker) must then indeed observe that
global state? due to that transitivity thing.
That is, I'm having a wee bit of bother for seeing how you'd need
manipulation of global variables as you allude to below.
Well, I thought that you wanted to leverage the combining tree to
determine when the grace period had completed. If a given CPU isn't
pushing its quiescent states up the combining tree, then the combining
tree can't do much for you.
Right that is what I wanted, and sure the combining thing needs to
happen with atomics, but that's not new, it already does that.
What I was talking about was the interaction between the force-quiescent-state
machinery and the poking detecting that a QS had indeed been started.
It gets worse.

Suppose that a grace period is already in progess. You cannot leverage
its use of the combining tree because some of the CPUs might have already
indicated a quiescent state, which means that the current grace period
won't necessarily wait for all of the CPUs that the concurrent expedited
grace period needs to wait on. So you need to kick the current grace
period, wait for it to complete, wait for the next one to start (with
all the fun and exciting issues called out earlier), do the expedited
grace period, then wait for completion.
Post by Peter Zijlstra
Post by Paul E. McKenney
Well, I do have something that seems reasonably straightforward. Sending
the patches along separately. Not sure that it is worth its weight.
The idea is that we keep the expedited grace periods working as they do
now, independently of the normal grace period. The normal grace period
takes a sequence number just after initialization, and checks to see
if an expedited grace period happened in the meantime at the beginning
of each quiescent-state forcing episode. This saves the last one or
two quiescent-state forcing scans in the case where an expedited grace
period really did happen.
It is possible for the expedited grace period to help things along by
waking up the grace-period kthread, but of course doing this too much
further increases the time consumed by your rcu_sched kthread.
Ah so that is the purpose of that patch. Still, I'm having trouble
seeing how you can do this too much, you would only be waking it if
there was a GP pending completion, right? At which point waking it is
the right thing.
If you wake it unconditionally, even if there's nothing to do, then yes
that'd be a waste of cycles.
Heh! You are already complaining about rcu_sched consuming 0.7%
of your system, and rightfully so. Increasing this overhead still
further therefore cannot be considered a good thing unless there is some
overwhelming benefit. And I am not seeing that benefit. Perhaps due
to a failure of imagination, but until someone enlightens me, I have to
throttle the wakeups -- or, perhaps better, omit the wakeups entirely.

Actually, I am not convinced that I should push any of the patches that
leverage expedited grace periods to help out normal grace periods.

Thanx, Paul

Peter Zijlstra
2015-07-01 16:16:59 UTC
Permalink
Post by Paul E. McKenney
Odd that you have four of eight of the rcuos CPUs with higher consumption
than the others. I would expect three of eight. Are you by chance running
an eight-core system with hyperthreading disabled in hardware, via boot
parameter, or via explicit offline? The real question I have is "is
nr_cpu_ids equal to 16 rather than to 8?"
It should not, but I'd have to instrument to be sure. It's a regular
4 core + ht part.
Post by Paul E. McKenney
Also, do you have nohz_full set?
Nope..
Post by Paul E. McKenney
Just wondering why callback offloading
is enabled. (If you want it enabled, fine, but from what I can see your
workload isn't being helped by it and it does have higher overhead.)
I think this is a distro .config; every time I strip the desktop kernel
I end up needing a driver I hadn't built. Clearly I've not really paid
attention to the RCU options.
Post by Paul E. McKenney
Even if you don't want offloading and do disable it, it would be good to
reduce the penalty. Is there something I can do to reduce the overhead
of waking several kthreads? Right now, I just do a series of wake_up()
calls, one for each leader rcuos kthread.
Oh, are you running v3.10 or some such? If so, there are some more
Not that old, but not something recent either. I'll upgrade and see if
it goes away. I really detest rebooting the desktop, but it needs to
happen every so often.
Post by Paul E. McKenney
Post by Peter Zijlstra
Yah, if only we could account it back to whomever caused it :/
It could be done, but would require increasing the size of rcu_head.
And would require costly fine-grained timing of callback execution.
Not something for production systems, I would guess.
Nope :/ I know.
Post by Paul E. McKenney
Post by Peter Zijlstra
What I was talking about was the interaction between the force-quiescent-state
machinery and the poking detecting that a QS had indeed been started.
It gets worse.
Suppose that a grace period is already in progess. You cannot leverage
its use of the combining tree because some of the CPUs might have already
indicated a quiescent state, which means that the current grace period
won't necessarily wait for all of the CPUs that the concurrent expedited
grace period needs to wait on. So you need to kick the current grace
period, wait for it to complete, wait for the next one to start (with
all the fun and exciting issues called out earlier), do the expedited
grace period, then wait for completion.
Ah yes. You do do find the fun cases :-)
Post by Paul E. McKenney
Post by Peter Zijlstra
If you wake it unconditionally, even if there's nothing to do, then yes
that'd be a waste of cycles.
Heh! You are already complaining about rcu_sched consuming 0.7%
of your system, and rightfully so. Increasing this overhead still
further therefore cannot be considered a good thing unless there is some
overwhelming benefit. And I am not seeing that benefit. Perhaps due
to a failure of imagination, but until someone enlightens me, I have to
throttle the wakeups -- or, perhaps better, omit the wakeups entirely.
Actually, I am not convinced that I should push any of the patches that
leverage expedited grace periods to help out normal grace periods.
It would seem a shame not to.. I've not yet had time to form a coherent
reply to that thread though.

Paul E. McKenney
2015-06-23 14:39:54 UTC
Permalink
Post by Peter Zijlstra
Post by Peter Zijlstra
We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.
Urgh, we cannot use percpu-rwsem here, because that would require
percpu_down_write_trylock(), and I'm not sure we can get around the
sync_sched() for that.
Now try_stop_cpus(), which would require the down_write_trylock(), is used
to implement synchronize_sched_expedited().
Using sync_sched() to implement sync_sched_expedited would make me
happy, but it does somewhat defeat the purpose.
+void dm_sync_table(struct mapped_device *md)
+{
+ synchronize_srcu(&md->io_barrier);
+ synchronize_rcu_expedited();
+}
sync_srcu() is slow already, why then bother with a
sync_rcu_expedited() :/
Actually, this code was added in 2013, which was after the new variant of
synchronize_srcu(), which last I checked is reasonably fast in the common
case (no readers and not having tons of concurrent synchronize_srcu()
calls on the same srcu_struct), especially on systems with a small number
of CPUs, courtesy of srcu_read_lock()'s and srcu_read_unlock()'s read-side
memory barriers.

So synchronize_rcu() really would be expected to have quite a bit higher
latency than synchronize_srcu().

Thanx, Paul

Oleg Nesterov
2015-06-23 16:21:51 UTC
Permalink
Post by Peter Zijlstra
Post by Oleg Nesterov
Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
- stop_machine takes the lock on CPU 0, adds the work
and drops the lock
- cpu_stop_queue_work() queues both works
cpu_stop_queue_work() only ever queues _1_ work.
Post by Oleg Nesterov
- stop_machine takes the lock on CPU 1, etc
In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
use different multi_stop_data's, so they will wait for each other
forever?
queue_stop_cpus_work()                  stop_two_cpus()
  cpu_stop_queue_work(0,..);
                                          spin_lock(0);
                                          spin_lock(1);
                                          __cpu_stop_queue_work(0,..);
                                          __cpu_stop_queue_work(1,..);
                                          spin_unlock(1);
                                          spin_unlock(0);
  cpu_stop_queue_work(1,..);
Yes, sorry for confusion.
Post by Peter Zijlstra
We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.
I am wondering too if we can make this multi_cpu_stop() more clever.
Or at least add some deadlock detection...

Until then you can probably just uglify queue_stop_cpus_work() and
avoid the race,

static void queue_stop_cpus_work(const struct cpumask *cpumask,
				 cpu_stop_fn_t fn, void *arg,
				 struct cpu_stop_done *done)
{
	struct cpu_stopper *stopper;
	struct cpu_stop_work *work;
	unsigned long flags;
	unsigned int cpu;

	local_irq_save(flags);
	for_each_cpu(cpu, cpumask) {
		stopper = &per_cpu(cpu_stopper, cpu);
		spin_lock(&stopper->lock);

		work = &per_cpu(stop_cpus_work, cpu);
		work->fn = fn;
		work->arg = arg;
		work->done = done;
	}

	for_each_cpu(cpu, cpumask)
		__cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));

	for_each_cpu(cpu, cpumask) {
		stopper = &per_cpu(cpu_stopper, cpu);
		spin_unlock(&stopper->lock);
	}
	local_irq_restore(flags);
}

ignoring lockdep problems.

It would be nice to remove stop_cpus_mutex, it actually protects
stop_cpus_work... Then probably stop_two_cpus() can just use
stop_cpus(). We could simply make stop_cpus_mutex per-cpu too,
but this doesn't look nice.

Oleg.

Oleg Nesterov
2015-06-23 17:25:42 UTC
Permalink
Post by Oleg Nesterov
It would be nice to remove stop_cpus_mutex, it actually protects
stop_cpus_work... Then probably stop_two_cpus() can just use
stop_cpus(). We could simply make stop_cpus_mutex per-cpu too,
but this doesn't look nice.
IOW. Suppose we add ->work_mutex into struct cpu_stopper. Btw,
I think we should move all per-cpu variables there...

Now,

lock_stop_cpus_works(cpumask)
{
	for_each_cpu(cpu, cpumask)
		mutex_lock(&per_cpu(cpu_stopper, cpu).work_mutex);
}

unlock_stop_cpus_works(cpumask)
{
	for_each_cpu(cpu, cpumask)
		mutex_unlock(&per_cpu(cpu_stopper, cpu).work_mutex);
}

which should be used instead of stop_cpus_mutex. After this change
stop_two_cpus() can just use stop_cpus().


Off-topic. Can't we make __stop_machine() static? The only caller,
_cpu_down() can safely call stop_machine(), get_online_cpus() is
fine under cpu_hotplug_begin().

Oleg.

Peter Zijlstra
2015-06-25 19:18:25 UTC
Permalink
Post by Oleg Nesterov
IOW. Suppose we add ->work_mutex into struct cpu_stopper. Btw,
I think we should move all per-cpu variables there...
Now,
lock_stop_cpus_works(cpumask)
{
	for_each_cpu(cpu, cpumask)
		mutex_lock(&per_cpu(cpu_stopper, cpu).work_mutex);
}
unlock_stop_cpus_works(cpumask)
{
	for_each_cpu(cpu, cpumask)
		mutex_unlock(&per_cpu(cpu_stopper, cpu).work_mutex);
}
which should be used instead of stop_cpus_mutex. After this change
stop_two_cpus() can just use stop_cpus().
Right, lockdep annotating that will be 'interesting' though. And
stop_two_cpus() then has the problem of allocating a cpumask. Simpler to
let it keep 'abusing' the queueing spinlock in there.
Post by Oleg Nesterov
Off-topic. Can't we make __stop_machine() static? The only caller,
_cpu_down() can safely call stop_machine(), get_online_cpus() is
fine under cpu_hotplug_begin().
Can do I think.
Peter Zijlstra
2015-06-22 12:25:40 UTC
Permalink
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/percpu-rwsem.h | 17 +++++++++++++++++
kernel/locking/percpu-rwsem.c | 12 ++++++++++++
2 files changed, 29 insertions(+)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -17,6 +17,7 @@ struct percpu_rw_semaphore {
};

extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void __percpu_up_read(struct percpu_rw_semaphore *);

static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -45,6 +46,22 @@ static inline void percpu_down_read(stru
*/
}

+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ bool ret = true;
+
+ preempt_disable();
+ __this_cpu_inc(*sem->refcount);
+ if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ if (ret)
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -80,6 +80,18 @@ void __percpu_down_read(struct percpu_rw
preempt_disable();
}

+bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* A matches D */
+
+ if (likely(smp_load_acquire(&sem->state) != readers_block))
+ return true;
+
+ __percpu_up_read(sem);
+
+ return false;
+}
+
void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
smp_mb(); /* B matches C */
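
For what it is worth, a caller of the new trylock would presumably look
something like the sketch below (purely illustrative; my_sem and the
critical-section body are made up, only the percpu_down_read_trylock()/
percpu_up_read() calls come from this series):

	DEFINE_STATIC_PERCPU_RWSEM(my_sem);	/* static initializer from earlier in the series */

	static bool try_fast_read_section(void)
	{
		if (!percpu_down_read_trylock(&my_sem))
			return false;		/* writer present or pending */

		/*
		 * Read-side critical section: on the fast path the trylock above
		 * was just a preempt-disabled per-cpu increment plus the
		 * rcu_sync_is_idle() check, no atomics.
		 */

		percpu_up_read(&my_sem);
		return true;
	}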


Oleg Nesterov
2015-06-22 23:10:40 UTC
Permalink
Post by Peter Zijlstra
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ bool ret = true;
+
+ preempt_disable();
+ __this_cpu_inc(*sem->refcount);
+ if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ if (ret)
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
..
Post by Peter Zijlstra
+bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* A matches D */
+
+ if (likely(smp_load_acquire(&sem->state) != readers_block))
+ return true;
+
+ __percpu_up_read(sem);
+
+ return false;
+}
Looks like we can slightly refactor this code to avoid the code
duplication. But this is minor too and we can do this later.

Reviewed-by: Oleg Nesterov <***@redhat.com>

Peter Zijlstra
2015-06-22 12:26:56 UTC
Permalink
The cpu hotplug lock is a rwsem with read-in-write and read-in-read
recursion. Implement it as such.

Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/cpu.h | 6 +
include/linux/percpu-rwsem.h | 10 ++-
include/linux/sched.h | 4 +
init/main.c | 1
kernel/cpu.c | 133 +++++++++++++------------------------------
kernel/fork.c | 2
lib/Kconfig | 5 +
7 files changed, 66 insertions(+), 95 deletions(-)

--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -224,6 +224,9 @@ extern struct bus_type cpu_subsys;
#ifdef CONFIG_HOTPLUG_CPU
/* Stop CPUs going up and down. */

+extern void cpu_hotplug_init(void);
+extern void cpu_hotplug_init_task(struct task_struct *p);
+
extern void cpu_hotplug_begin(void);
extern void cpu_hotplug_done(void);
extern void get_online_cpus(void);
@@ -242,6 +245,9 @@ int cpu_down(unsigned int cpu);

#else /* CONFIG_HOTPLUG_CPU */

+static inline void cpu_hotplug_init(void) {}
+static inline void cpu_hotplug_init_task(struct task_struct *p) {}
+
static inline void cpu_hotplug_begin(void) {}
static inline void cpu_hotplug_done(void) {}
#define get_online_cpus() do { } while (0)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -20,12 +20,10 @@ extern void __percpu_down_read(struct pe
extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void __percpu_up_read(struct percpu_rw_semaphore *);

-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
{
might_sleep();

- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
-
preempt_disable();
/*
* We are in an RCU-sched read-side critical section, so the writer
@@ -46,6 +44,12 @@ static inline void percpu_down_read(stru
*/
}

+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ _percpu_down_read(sem);
+}
+
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
bool ret = true;
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,10 @@ struct task_struct {
unsigned int btrace_seq;
#endif

+#ifdef CONFIG_HOTPLUG_CPU
+ int cpuhp_ref;
+#endif
+
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
--- a/init/main.c
+++ b/init/main.c
@@ -588,6 +588,7 @@ asmlinkage __visible void __init start_k
sched_clock_postinit();
perf_event_init();
profile_init();
+ cpu_hotplug_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,6 +22,7 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <trace/events/power.h>
+#include <linux/percpu-rwsem.h>

#include "smpboot.h"

@@ -50,7 +51,8 @@ EXPORT_SYMBOL(cpu_notifier_register_done

static RAW_NOTIFIER_HEAD(cpu_chain);

-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
@@ -58,126 +60,72 @@ static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU

static struct {
- struct task_struct *active_writer;
- /* wait queue to wake up the active_writer */
- wait_queue_head_t wq;
- /* verifies that no writer will get active while readers are active */
- struct mutex lock;
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- atomic_t refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
- .active_writer = NULL,
- .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
-#endif
-};
-
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
- lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+ struct percpu_rw_semaphore rwsem;
+ struct task_struct *writer;
+} cpu_hotplug = { .writer = &init_task, };
+
+void cpu_hotplug_init(void)
+{
+ percpu_init_rwsem(&cpu_hotplug.rwsem);
+ cpu_hotplug.writer = NULL;
+}

+void cpu_hotplug_init_task(struct task_struct *p)
+{
+ p->cpuhp_ref = 0;
+}

void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+
+ /* read in write recursion */
+ if (cpu_hotplug.writer == current)
+ return;
+
+ /* read in read recursion */
+ if (current->cpuhp_ref++)
return;
- cpuhp_lock_acquire_read();
- mutex_lock(&cpu_hotplug.lock);
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
EXPORT_SYMBOL_GPL(get_online_cpus);

bool try_get_online_cpus(void)
{
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug.writer == current)
return true;
- if (!mutex_trylock(&cpu_hotplug.lock))
- return false;
- cpuhp_lock_acquire_tryread();
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
- return true;
+
+ if (current->cpuhp_ref++)
+ return true;
+
+ return percpu_down_read_trylock(&cpu_hotplug.rwsem);
}
EXPORT_SYMBOL_GPL(try_get_online_cpus);

void put_online_cpus(void)
{
- int refcount;
-
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug.writer == current)
return;

- refcount = atomic_dec_return(&cpu_hotplug.refcount);
- if (WARN_ON(refcount < 0)) /* try to fix things up */
- atomic_inc(&cpu_hotplug.refcount);
-
- if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
- wake_up(&cpu_hotplug.wq);
-
- cpuhp_lock_release();
+ if (--current->cpuhp_ref)
+ return;

+ percpu_up_read(&cpu_hotplug.rwsem);
}
EXPORT_SYMBOL_GPL(put_online_cpus);

-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
void cpu_hotplug_begin(void)
{
- DEFINE_WAIT(wait);
-
- cpu_hotplug.active_writer = current;
- cpuhp_lock_acquire();
-
- for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
- if (likely(!atomic_read(&cpu_hotplug.refcount)))
- break;
- mutex_unlock(&cpu_hotplug.lock);
- schedule();
- }
- finish_wait(&cpu_hotplug.wq, &wait);
+ percpu_down_write(&cpu_hotplug.rwsem);
+ cpu_hotplug.writer = current;
}

void cpu_hotplug_done(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
- cpuhp_lock_release();
+ cpu_hotplug.writer = NULL;
+ percpu_up_write(&cpu_hotplug.rwsem);
}

/*
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
p->sequential_io_avg = 0;
#endif

+ cpu_hotplug_init_task(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -56,6 +56,11 @@ config STMP_DEVICE
config PERCPU_RWSEM
bool

+config PERCPU_RWSEM_HOTPLUG
+ def_bool y
+ depends on HOTPLUG_CPU
+ select PERCPU_RWSEM
+
config ARCH_USE_CMPXCHG_LOCKREF
bool



Oleg Nesterov
2015-06-22 22:59:07 UTC
Permalink
Post by Peter Zijlstra
The cpu hotplug lock is a rwsem with read-in-write and read-in-read
recursion. Implement it as such.
And this patch fixes the problem afaics. Currently cpu_hotplug_begin()
can livelock because it doesn't stop the new readers. With this patch
this is no longer possible.
Post by Peter Zijlstra
-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
{
might_sleep();
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
-
preempt_disable();
/*
* We are in an RCU-sched read-side critical section, so the writer
@@ -46,6 +44,12 @@ static inline void percpu_down_read(stru
*/
}
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ _percpu_down_read(sem);
+}
..
Post by Peter Zijlstra
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+
+ /* read in write recursion */
+ if (cpu_hotplug.writer == current)
+ return;
+
+ /* read in read recursion */
+ if (current->cpuhp_ref++)
return;
- cpuhp_lock_acquire_read();
- mutex_lock(&cpu_hotplug.lock);
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?

Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?
Post by Peter Zijlstra
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
p->sequential_io_avg = 0;
#endif
+ cpu_hotplug_init_task(p);
This is probably unnecessary, copy_process() should not be called under
get_online_cpus().

Oleg.

Peter Zijlstra
2015-06-23 07:17:12 UTC
Permalink
Post by Oleg Nesterov
Post by Peter Zijlstra
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?
Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?
While we will not call the actual lock, lockdep will still get confused
by the inconsistent locking order observed.

Change it and boot, you'll find lockdep output pretty quickly.
Post by Oleg Nesterov
Post by Peter Zijlstra
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
p->sequential_io_avg = 0;
#endif
+ cpu_hotplug_init_task(p);
This is probably unnecessary, copy_process() should not be called under
get_online_cpus().
Probably true, in which case we could still use the callback to insert a
WARN_ON_ONCE(p->cpuhp_ref) :-)
Oleg Nesterov
2015-06-23 17:03:03 UTC
Permalink
Post by Peter Zijlstra
Post by Oleg Nesterov
Post by Peter Zijlstra
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?
Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?
While we will not call the actual lock, lockdep will still get confused
by the inconsistent locking order observed.
Change it and boot, you'll find lockdep output pretty quickly.
Hmm. and I simply can't understand why...
Post by Peter Zijlstra
Post by Oleg Nesterov
Post by Peter Zijlstra
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
p->sequential_io_avg = 0;
#endif
+ cpu_hotplug_init_task(p);
This is probably unnecessary, copy_process() should not be called under
get_online_cpus().
Probably true, in which case we could still use the callback to insert a
WARN_ON_ONCE(p->cpuhp_ref) :-)
Yes, agreed.

And, perhaps, WARN_ON_ONCE(in_irq) in try_get_online_cpus() makes sense...
percpu_down_read_trylock() from irq is fine, but try_get_online_cpus()
can come right after get/put_online_cpus() updates ->cpuhp_ref.

And I forgot to say,
Post by Peter Zijlstra
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+
+ /* read in write recursion */
+ if (cpu_hotplug.writer == current)
+ return;
..
Post by Peter Zijlstra
void put_online_cpus(void)
{
- int refcount;
-
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug.writer == current)
return;
We do not need to check cpu_hotplug.writer in get/put_online_cpus().
cpu_hotplug_begin/end can just inc/dec current->cpuhp_ref.
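
A sketch of that simplification (hypothetical, not a posted patch;
might_sleep(), lockdep annotations and exports omitted):

	void cpu_hotplug_begin(void)
	{
		percpu_down_write(&cpu_hotplug.rwsem);
		current->cpuhp_ref++;	/* read-in-write recursion now hits the fast path */
	}

	void cpu_hotplug_done(void)
	{
		current->cpuhp_ref--;
		percpu_up_write(&cpu_hotplug.rwsem);
	}

	void get_online_cpus(void)
	{
		if (current->cpuhp_ref++)	/* covers both read-in-read and read-in-write */
			return;
		percpu_down_read(&cpu_hotplug.rwsem);
	}

	void put_online_cpus(void)
	{
		if (--current->cpuhp_ref)
			return;
		percpu_up_read(&cpu_hotplug.rwsem);
	}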

Oleg.

Peter Zijlstra
2015-06-23 17:53:33 UTC
Permalink
Post by Oleg Nesterov
Post by Peter Zijlstra
Post by Oleg Nesterov
Post by Peter Zijlstra
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?
Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?
While we will not call the actual lock, lockdep will still get confused
by the inconsistent locking order observed.
Change it and boot, you'll find lockdep output pretty quickly.
Hmm. and I simply can't understand why...
If in one callchain we do:

get_online_cpus();
lock(A);

in another we do:

lock(A);
get_online_cpus();

lockdep will complain about the inverted lock order, however this is not
a problem at all for recursive locks.

I think the example you get on boot is slightly more complicated, but
ends up like the above iirc.
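
Spelled out, the two chains might look like this (the mutex and both
functions are illustrative, not taken from an actual in-tree path):

	static DEFINE_MUTEX(A);

	void chain_one(void)
	{
		get_online_cpus();	/* read-acquires cpu_hotplug.rwsem */
		mutex_lock(&A);
		/* ... */
		mutex_unlock(&A);
		put_online_cpus();
	}

	void chain_two(void)
	{
		mutex_lock(&A);
		get_online_cpus();	/* opposite order: A, then the rwsem */
		/* ... */
		put_online_cpus();
		mutex_unlock(&A);
	}

lockdep records both the rwsem -> A and the A -> rwsem dependencies and
reports the inversion, which is why the quoted get_online_cpus() uses the
recursive-read annotation lock_map_acquire_read() instead of the normal
read annotation done by percpu_down_read().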
Oleg Nesterov
2015-06-24 13:52:17 UTC
Permalink
Post by Peter Zijlstra
Post by Oleg Nesterov
Post by Peter Zijlstra
Post by Oleg Nesterov
Post by Peter Zijlstra
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?
Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?
While we will not call the actual lock, lockdep will still get confused
by the inconsistent locking order observed.
Change it and boot, you'll find lockdep output pretty quickly.
Hmm. and I simply can't understand why...
get_online_cpus();
lock(A);
lock(A);
get_online_cpus();
lockdep will complain about the inverted lock order, however this is not
a problem at all for recursive locks.
Ah, but in this case lockdep is right. This is deadlockable because
with the new implementation percpu_down_write() blocks the new readers.
So this change just hides the valid warning.

Just suppose that the 3rd CPU does percpu_down_write()->down_write()
right after the 2nd CPU (above) takes lock(A).

I have to admit that I didn't realize that the code above is currently
correct... but it is.

So we need percpu_down_write_dont_block_readers(). I already thought
about this before, I'll try to make the patch tomorrow on top of your
changes.

This means that we do not need task_struct->cpuhp_ref, but we can't
avoid livelock we currently have: cpu_hotplug_begin() can never succeed
if the new readers come fast enough.

Oleg.

Peter Zijlstra
2015-06-24 14:14:25 UTC
Permalink
Post by Oleg Nesterov
Post by Peter Zijlstra
get_online_cpus();
lock(A);
lock(A);
get_online_cpus();
lockdep will complain about the inverted lock order, however this is not
a problem at all for recursive locks.
Ah, but in this case lockdep is right. This is deadlockable because
with the new implementation percpu_down_write() blocks the new readers.
So this change just hides the valid warning.
Just suppose that the 3rd CPU does percpu_down_write()->down_write()
right after the 2nd CPU (above) takes lock(A).
I have to admit that I didn't realize that the code above is currently
correct... but it is.
So we need percpu_down_write_dont_block_readers(). I already thought
about this before, I'll try to make the patch tomorrow on top of your
changes.
This means that we do not need task_struct->cpuhp_ref, but we can't
avoid livelock we currently have: cpu_hotplug_begin() can never succeed
if the new readers come fast enough.
I'm confused.. why isn't the read-in-read recursion good enough?
Oleg Nesterov
2015-06-24 15:14:58 UTC
Permalink
Post by Peter Zijlstra
Post by Oleg Nesterov
Post by Peter Zijlstra
get_online_cpus();
lock(A);
lock(A);
get_online_cpus();
lockdep will complain about the inverted lock order, however this is not
a problem at all for recursive locks.
Ah, but in this case lockdep is right. This is deadlockable because
with the new implementation percpu_down_write() blocks the new readers.
So this change just hides the valid warning.
Just suppose that the 3rd CPU does percpu_down_write()->down_write()
right after the 2nd CPU (above) takes lock(A).
I have to admit that I didn't realize that the code above is currently
correct... but it is.
So we need percpu_down_write_dont_block_readers(). I already thought
about this before, I'll try to make the patch tomorrow on top of your
changes.
This means that we do not need task_struct->cpuhp_ref, but we can't
avoid livelock we currently have: cpu_hotplug_begin() can never succeed
if the new readers come fast enough.
I'm confused.. why isn't the read-in-read recursion good enough?
Because the code above can actually deadlock if 2 CPUs do this at
the same time?

task_struct->cpuhp_ref only makes read-in-read work, but
percpu_down_write() blocks the new readers.

Suppose that ->cpuhp_ref == 0 on CPUs 0 and 1, and suppose that CPU 2
does percpu_down_write() and "sem->state = readers_block" is already
visible to CPU 1 when it calls get_online_cpus().

CPU_0                      CPU_1                      CPU_2

get_online_cpus();         lock(A);

// waits for CPU_1
lock(A)

                                                      // waits for CPU_0
                                                      percpu_down_write();

                           // waits for CPU_2
                           get_online_cpus();


Oleg.

Peter Zijlstra
2015-06-24 16:15:43 UTC
Permalink
Post by Oleg Nesterov
Post by Peter Zijlstra
I'm confused.. why isn't the read-in-read recursion good enough?
Because the code above can actually deadlock if 2 CPUs do this at
the same time?
Hmm yes.. this makes the hotplug locking worse than I feared it was, but
alas.

FYI, the actual splat.

---

[ 7.399737] ======================================================
[ 7.406640] [ INFO: possible circular locking dependency detected ]
[ 7.413643] 4.1.0-02756-ge3d06bd-dirty #185 Not tainted
[ 7.419481] -------------------------------------------------------
[ 7.426483] kworker/0:1/215 is trying to acquire lock:
[ 7.432221] (&cpu_hotplug.rwsem){++++++}, at: [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[ 7.442564]
[ 7.442564] but task is already holding lock:
[ 7.449079] (&item->mutex){+.+.+.}, at: [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[ 7.458455]
[ 7.458455] which lock already depends on the new lock.
[ 7.458455]
[ 7.467591]
[ 7.467591] the existing dependency chain (in reverse order) is:
[ 7.475949]
-> #3 (&item->mutex){+.+.+.}:
[ 7.480662] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.487280] [<ffffffff818ea777>] mutex_lock_nested+0x47/0x3c0
[ 7.494390] [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[ 7.501596] [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[ 7.508514] [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[ 7.515916] [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[ 7.522922] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 7.529840] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 7.536463] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 7.543283] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 7.550106] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 7.557214] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 7.564029] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 7.570166] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.576792]
-> #2 (drm_global_mutex){+.+.+.}:
[ 7.581891] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.588514] [<ffffffff818ea777>] mutex_lock_nested+0x47/0x3c0
[ 7.595622] [<ffffffff815b1406>] drm_dev_register+0x26/0x100
[ 7.602632] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 7.609547] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 7.616170] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 7.622987] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 7.629806] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 7.636913] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 7.643727] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 7.649866] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.656490]
-> #1 ((&wfc.work)){+.+.+.}:
[ 7.661104] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.667727] [<ffffffff810e737d>] flush_work+0x3d/0x260
[ 7.674155] [<ffffffff810e9822>] work_on_cpu+0x82/0x90
[ 7.680584] [<ffffffff814bf2a2>] pci_device_probe+0x112/0x120
[ 7.687692] [<ffffffff815e685f>] driver_probe_device+0x17f/0x2e0
[ 7.695094] [<ffffffff815e6a94>] __driver_attach+0x94/0xa0
[ 7.701910] [<ffffffff815e4786>] bus_for_each_dev+0x66/0xa0
[ 7.708824] [<ffffffff815e626e>] driver_attach+0x1e/0x20
[ 7.715447] [<ffffffff815e5ed8>] bus_add_driver+0x168/0x210
[ 7.722361] [<ffffffff815e7880>] driver_register+0x60/0xe0
[ 7.729180] [<ffffffff814bd754>] __pci_register_driver+0x64/0x70
[ 7.736580] [<ffffffff81f9a10d>] pcie_portdrv_init+0x66/0x79
[ 7.743593] [<ffffffff810002c8>] do_one_initcall+0x88/0x1c0
[ 7.750508] [<ffffffff81f5f169>] kernel_init_freeable+0x1f5/0x282
[ 7.758005] [<ffffffff818da36e>] kernel_init+0xe/0xe0
[ 7.764338] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.770961]
-> #0 (&cpu_hotplug.rwsem){++++++}:
[ 7.776255] [<ffffffff81122817>] __lock_acquire+0x2207/0x2240
[ 7.783363] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.789986] [<ffffffff810cb6e2>] get_online_cpus+0x62/0xb0
[ 7.796805] [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[ 7.804398] [<ffffffff810ed7bc>] __alloc_workqueue_key+0x2ec/0x560
[ 7.811992] [<ffffffff815cbefa>] ttm_mem_global_init+0x5a/0x310
[ 7.819295] [<ffffffff815dcbb2>] mgag200_ttm_mem_global_init+0x12/0x20
[ 7.827277] [<ffffffff815c4df5>] drm_global_item_ref+0x65/0xe0
[ 7.834481] [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[ 7.841395] [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[ 7.848793] [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[ 7.855804] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 7.862715] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 7.869338] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 7.876159] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 7.882979] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 7.890087] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 7.896907] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 7.903043] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.909673]
[ 7.909673] other info that might help us debug this:
[ 7.909673]
[ 7.918616] Chain exists of:
&cpu_hotplug.rwsem --> drm_global_mutex --> &item->mutex

[ 7.927907] Possible unsafe locking scenario:
[ 7.927907]
[ 7.934521]        CPU0                    CPU1
[ 7.939580]        ----                    ----
[ 7.944639]   lock(&item->mutex);
[ 7.948359]                                lock(drm_global_mutex);
[ 7.955292]                                lock(&item->mutex);
[ 7.961855]   lock(&cpu_hotplug.rwsem);
[ 7.966158]
[ 7.966158] *** DEADLOCK ***
[ 7.966158]
[ 7.972771] 4 locks held by kworker/0:1/215:
[ 7.977539] #0: ("events"){.+.+.+}, at: [<ffffffff810e9cc6>] process_one_work+0x156/0x7e0
[ 7.986929] #1: ((&wfc.work)){+.+.+.}, at: [<ffffffff810e9cc6>] process_one_work+0x156/0x7e0
[ 7.996600] #2: (drm_global_mutex){+.+.+.}, at: [<ffffffff815b1406>] drm_dev_register+0x26/0x100
[ 8.006690] #3: (&item->mutex){+.+.+.}, at: [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[ 8.016559]
[ 8.016559] stack backtrace:
[ 8.021427] CPU: 0 PID: 215 Comm: kworker/0:1 Not tainted 4.1.0-02756-ge3d06bd-dirty #185
[ 8.030565] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[ 8.042034] Workqueue: events work_for_cpu_fn
[ 8.046909] ffffffff82857e30 ffff88042b3437c8 ffffffff818e5189 0000000000000011
[ 8.055216] ffffffff8282aa40 ffff88042b343818 ffffffff8111ee76 0000000000000004
[ 8.063522] ffff88042b343888 ffff88042b33f040 0000000000000004 ffff88042b33f040
[ 8.071827] Call Trace:
[ 8.074559] [<ffffffff818e5189>] dump_stack+0x4c/0x6e
[ 8.080300] [<ffffffff8111ee76>] print_circular_bug+0x1c6/0x220
[ 8.087011] [<ffffffff81122817>] __lock_acquire+0x2207/0x2240
[ 8.093528] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 8.099559] [<ffffffff810ebd63>] ? apply_workqueue_attrs+0x183/0x4b0
[ 8.106755] [<ffffffff810cb6e2>] get_online_cpus+0x62/0xb0
[ 8.112981] [<ffffffff810ebd63>] ? apply_workqueue_attrs+0x183/0x4b0
[ 8.120176] [<ffffffff810ead27>] ? alloc_workqueue_attrs+0x27/0x80
[ 8.127178] [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[ 8.134182] [<ffffffff8111cc21>] ? debug_mutex_init+0x31/0x40
[ 8.140690] [<ffffffff810ed7bc>] __alloc_workqueue_key+0x2ec/0x560
[ 8.147691] [<ffffffff815cbefa>] ttm_mem_global_init+0x5a/0x310
[ 8.154405] [<ffffffff8122b050>] ? __kmalloc+0x5e0/0x630
[ 8.160435] [<ffffffff815c4de2>] ? drm_global_item_ref+0x52/0xe0
[ 8.167243] [<ffffffff815dcbb2>] mgag200_ttm_mem_global_init+0x12/0x20
[ 8.174631] [<ffffffff815c4df5>] drm_global_item_ref+0x65/0xe0
[ 8.181245] [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[ 8.187570] [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[ 8.194383] [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[ 8.200802] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 8.207125] [<ffffffff818ebf9e>] ? mutex_unlock+0xe/0x10
[ 8.213156] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 8.219187] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 8.225412] [<ffffffff8111db81>] ? __lock_is_held+0x51/0x80
[ 8.231736] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 8.237962] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 8.244477] [<ffffffff810e9cc6>] ? process_one_work+0x156/0x7e0
[ 8.251187] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 8.257410] [<ffffffff810ea350>] ? process_one_work+0x7e0/0x7e0
[ 8.264120] [<ffffffff810ea350>] ? process_one_work+0x7e0/0x7e0
[ 8.270829] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 8.276375] [<ffffffff818ee230>] ? _raw_spin_unlock_irq+0x30/0x60
[ 8.283282] [<ffffffff810f04c0>] ? kthread_create_on_node+0x220/0x220
[ 8.290566] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 8.296597] [<ffffffff810f04c0>] ? kthread_create_on_node+0x220/0x220
Oleg Nesterov
2015-06-28 23:57:44 UTC
Permalink
Post by Oleg Nesterov
So we need percpu_down_write_dont_block_readers(). I already thought
about this before, I'll try to make the patch tomorrow on top of your
changes.
Never say tomorrow...
Post by Oleg Nesterov
This means that we do not need task_struct->cpuhp_ref, but we can't
avoid livelock we currently have: cpu_hotplug_begin() can never succeed
if the new readers come fast enough.
Like with any other "recursive" lock.

Peter, I know you don't like the 1st patch. And yes, we could add another
mutex into percpu_rw_semaphore instead. But I think it would be better
to rely on rcu_sync_enter(). As for completion, we can remove it later.
Nevermind, the actual change is 3/3 and it looks simple.

Oleg.

Oleg Nesterov
2015-06-28 23:58:11 UTC
Permalink
Add an rcu_sync_struct->exclusive boolean, set by rcu_sync_init(); it
obviously controls the exclusiveness of rcu_sync_enter(). This is
what percpu_down_write() actually wants.

We turn ->gp_wait into "struct completion gp_comp", it is used as
a resource counter in "exclusive" mode. Otherwise we only use its
completion->wait member for wait_event/wake_up_all. We never mix
the completion/wait_queue_head_t operations.

TODO: we can cleanup this logic and avoid "struct completion", but
this needs a bit more changes.
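
Roughly, a writer is then expected to serialize itself through the rcu_sync
calls; a sketch of how the percpu-rwsem write side would use an "exclusive"
rcu_sync_struct (not a new API, just the intended call pattern):

	rcu_sync_enter(&sem->rss);	/* waits for the GP and for any earlier writer */
	/* exclusive section: new readers see !rcu_sync_is_idle() and take the slow path */
	rcu_sync_exit(&sem->rss);	/* wakes the next writer queued on gp_comp, if any */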

Signed-off-by: Oleg Nesterov <***@redhat.com>
---
include/linux/percpu-rwsem.h | 2 +-
include/linux/rcusync.h | 29 ++++++++++++++++-------------
kernel/locking/percpu-rwsem.c | 2 +-
kernel/rcu/sync.c | 25 ++++++++++++++++++++-----
4 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index e12ce86..9202e73 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -21,7 +21,7 @@ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name); \
static struct percpu_rw_semaphore name = { \
.refcount = &__percpu_rwsem_refcount_##name, \
.state = 0, \
- .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
+ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC, 1), \
.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
}
diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
index 0135838..aaea86a 100644
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -1,7 +1,7 @@
#ifndef _LINUX_RCUSYNC_H_
#define _LINUX_RCUSYNC_H_

-#include <linux/wait.h>
+#include <linux/completion.h>
#include <linux/rcupdate.h>

enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
@@ -9,11 +9,12 @@ enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
struct rcu_sync_struct {
int gp_state;
int gp_count;
- wait_queue_head_t gp_wait;
+ struct completion gp_comp;

int cb_state;
struct rcu_head cb_head;

+ bool exclusive;
enum rcu_sync_type gp_type;
};

@@ -28,30 +29,32 @@ static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
#endif
}

-extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_init(struct rcu_sync_struct *,
+ enum rcu_sync_type, bool excl);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);
extern void rcu_sync_dtor(struct rcu_sync_struct *);

-#define __RCU_SYNC_INITIALIZER(name, type) { \
+#define __RCU_SYNC_INITIALIZER(name, type, excl) { \
.gp_state = 0, \
.gp_count = 0, \
- .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .gp_comp = COMPLETION_INITIALIZER(name.gp_comp), \
.cb_state = 0, \
+ .exclusive = excl, \
.gp_type = type, \
}

-#define __DEFINE_RCU_SYNC(name, type) \
- struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+#define __DEFINE_RCU_SYNC(name, type, excl) \
+ struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type, excl)

-#define DEFINE_RCU_SYNC(name) \
- __DEFINE_RCU_SYNC(name, RCU_SYNC)
+#define DEFINE_RCU_SYNC(name, excl) \
+ __DEFINE_RCU_SYNC(name, RCU_SYNC, excl)

-#define DEFINE_RCU_SCHED_SYNC(name) \
- __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+#define DEFINE_RCU_SCHED_SYNC(name, excl) \
+ __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC, excl)

-#define DEFINE_RCU_BH_SYNC(name) \
- __DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+#define DEFINE_RCU_BH_SYNC(name, excl) \
+ __DEFINE_RCU_SYNC(name, RCU_BH_SYNC, excl)

#endif /* _LINUX_RCUSYNC_H_ */

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 915646c..014d2f4 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
return -ENOMEM;

sem->state = readers_slow;
- rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC, true);
init_waitqueue_head(&sem->writer);
__init_rwsem(&sem->rw_sem, name, rwsem_key);

diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 8835ad1..03ddc61 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -38,7 +38,8 @@ static const struct {
enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };

-#define rss_lock gp_wait.lock
+#define rss_lock gp_comp.wait.lock
+#define gp_wait gp_comp.wait

#ifdef CONFIG_PROVE_RCU
bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
@@ -49,10 +50,12 @@ bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
#endif

-void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+void rcu_sync_init(struct rcu_sync_struct *rss,
+ enum rcu_sync_type type, bool excl)
{
memset(rss, 0, sizeof(*rss));
- init_waitqueue_head(&rss->gp_wait);
+ init_completion(&rss->gp_comp);
+ rss->exclusive = excl;
rss->gp_type = type;
}

@@ -72,9 +75,13 @@ void rcu_sync_enter(struct rcu_sync_struct *rss)
if (need_sync) {
gp_ops[rss->gp_type].sync();
rss->gp_state = GP_PASSED;
- wake_up_all(&rss->gp_wait);
+ if (!rss->exclusive)
+ wake_up_all(&rss->gp_wait);
} else if (need_wait) {
- wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ if (!rss->exclusive)
+ wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ else
+ wait_for_completion(&rss->gp_comp);
} else {
/*
* Possible when there's a pending CB from a rcu_sync_exit().
@@ -119,6 +126,12 @@ static void rcu_sync_func(struct rcu_head *rcu)
spin_unlock_irqrestore(&rss->rss_lock, flags);
}

+static inline void __complete_locked(struct completion *x)
+{
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+}
+
void rcu_sync_exit(struct rcu_sync_struct *rss)
{
spin_lock_irq(&rss->rss_lock);
@@ -129,6 +142,8 @@ void rcu_sync_exit(struct rcu_sync_struct *rss)
} else if (rss->cb_state == CB_PENDING) {
rss->cb_state = CB_REPLAY;
}
+ } else if (rss->exclusive) {
+ __complete_locked(&rss->gp_comp);
}
spin_unlock_irq(&rss->rss_lock);
}
--
1.5.5.1

Oleg Nesterov
2015-06-28 23:58:21 UTC
Permalink
Add a percpu_rw_semaphore->recursive boolean. If it is true, then
recursive percpu_down_read() is safe and percpu_down_write() doesn't
exclude the new readers, like cpu_hotplug_begin().
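
A usage sketch (the semaphore name is made up; the second argument of
DEFINE_STATIC_PERCPU_RWSEM() is the new recursive flag):

	DEFINE_STATIC_PERCPU_RWSEM(my_sem, true);

	void reader(void)
	{
		percpu_down_read(&my_sem);
		/* ... */
		percpu_down_read(&my_sem);	/* nested read: safe, the writer doesn't block new readers */
		/* ... */
		percpu_up_read(&my_sem);
		percpu_up_read(&my_sem);
	}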

Signed-off-by: Oleg Nesterov <***@redhat.com>
---
include/linux/percpu-rwsem.h | 15 ++++++++++-----
kernel/events/uprobes.c | 2 +-
kernel/locking/percpu-rwsem.c | 15 +++++++++++----
3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 9202e73..9441abd 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,16 +13,18 @@ struct percpu_rw_semaphore {
int state;
struct rcu_sync_struct rss;
wait_queue_head_t writer;
+ bool recursive;
struct rw_semaphore rw_sem;
};

-#define DEFINE_STATIC_PERCPU_RWSEM(name) \
+#define DEFINE_STATIC_PERCPU_RWSEM(name, rec) \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name); \
static struct percpu_rw_semaphore name = { \
.refcount = &__percpu_rwsem_refcount_##name, \
.state = 0, \
.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC, 1), \
.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
+ .recursive = rec, \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
}

@@ -37,7 +39,10 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
might_sleep();

- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ if (sem->recursive)
+ rwlock_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ else
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);

preempt_disable();
/*
@@ -97,14 +102,14 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

-extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
+extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, bool,
const char *, struct lock_class_key *);
extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

-#define percpu_init_rwsem(sem) \
+#define percpu_init_rwsem(sem, recursive) \
({ \
static struct lock_class_key rwsem_key; \
- __percpu_init_rwsem(sem, #sem, &rwsem_key); \
+ __percpu_init_rwsem(sem, recursive, #sem, &rwsem_key); \
})

#endif
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f2..a4813a1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1985,7 +1985,7 @@ static int __init init_uprobes(void)
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);

- if (percpu_init_rwsem(&dup_mmap_sem))
+ if (percpu_init_rwsem(&dup_mmap_sem, false))
return -ENOMEM;

return register_die_notifier(&uprobe_exception_nb);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 609c13b..3db7c45 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -10,7 +10,7 @@

enum { readers_slow, readers_block };

-int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, bool recursive,
const char *name, struct lock_class_key *rwsem_key)
{
sem->refcount = alloc_percpu(unsigned int);
@@ -20,6 +20,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
sem->state = readers_slow;
rcu_sync_init(&sem->rss, RCU_SCHED_SYNC, true);
init_waitqueue_head(&sem->writer);
+ sem->recursive = recursive;
__init_rwsem(&sem->rw_sem, name, rwsem_key);

return 0;
@@ -124,9 +125,15 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- if (per_cpu_sum(*sem->refcount) != 0)
+ if (sem->recursive && !down_write_trylock(&sem->rw_sem))
return false;

+ if (per_cpu_sum(*sem->refcount) != 0) {
+ if (sem->recursive)
+ up_write(&sem->rw_sem);
+ return false;
+ }
+
/*
* If we observed the decrement; ensure we see the entire critical
* section.
@@ -155,8 +162,8 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* then we are guaranteed to see their sem->refcount increment, and
* therefore will wait for them.
*/
-
- down_write(&sem->rw_sem);
+ if (!sem->recursive)
+ down_write(&sem->rw_sem);
/* Wait for all now active readers to complete. */
wait_event(sem->writer, readers_active_check(sem));
}
--
1.5.5.1

Oleg Nesterov
2015-06-28 23:58:41 UTC
Permalink
percpu_down_write() does down_write() to exclude both the readers and
other writers. We can rely on rcu_sync_enter() in exclusive mode and
take ->rw_sem right before wait_event().

Signed-off-by: Oleg Nesterov <***@redhat.com>
---
kernel/locking/percpu-rwsem.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 014d2f4..609c13b 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -139,8 +139,6 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)

void percpu_down_write(struct percpu_rw_semaphore *sem)
{
- down_write(&sem->rw_sem);
-
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);

@@ -158,6 +156,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* therefore will wait for them.
*/

+ down_write(&sem->rw_sem);
/* Wait for all now active readers to complete. */
wait_event(sem->writer, readers_active_check(sem));
}
--
1.5.5.1

Peter Zijlstra
2015-06-22 12:27:07 UTC
Permalink
Since there are no users left of this primitive, make it go away.

Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
Documentation/locking/lglock.txt | 166 ---------------------------------------
fs/file_table.c | 1
include/linux/lglock.h | 81 -------------------
kernel/locking/Makefile | 1
kernel/locking/lglock.c | 111 --------------------------
5 files changed, 360 deletions(-)

--- a/Documentation/locking/lglock.txt
+++ /dev/null
@@ -1,166 +0,0 @@
-lglock - local/global locks for mostly local access patterns
-------------------------------------------------------------
-
-Origin: Nick Piggin's VFS scalability series introduced during
- 2.6.35++ [1] [2]
-Location: kernel/locking/lglock.c
- include/linux/lglock.h
-Users: currently only the VFS and stop_machine related code
-
-Design Goal:
-------------
-
-Improve scalability of globally used large data sets that are
-distributed over all CPUs as per_cpu elements.
-
-To manage global data structures that are partitioned over all CPUs
-as per_cpu elements but can be mostly handled by CPU local actions
-lglock will be used where the majority of accesses are cpu local
-reading and occasional cpu local writing with very infrequent
-global write access.
-
-
-* deal with things locally whenever possible
- - very fast access to the local per_cpu data
- - reasonably fast access to specific per_cpu data on a different
- CPU
-* while making global action possible when needed
- - by expensive access to all CPUs locks - effectively
- resulting in a globally visible critical section.
-
-Design:
--------
-
-Basically it is an array of per_cpu spinlocks with the
-lg_local_lock/unlock accessing the local CPUs lock object and the
-lg_local_lock_cpu/unlock_cpu accessing a remote CPUs lock object
-the lg_local_lock has to disable preemption as migration protection so
-that the reference to the local CPUs lock does not go out of scope.
-Due to the lg_local_lock/unlock only touching cpu-local resources it
-is fast. Taking the local lock on a different CPU will be more
-expensive but still relatively cheap.
-
-One can relax the migration constraints by acquiring the current
-CPUs lock with lg_local_lock_cpu, remember the cpu, and release that
-lock at the end of the critical section even if migrated. This should
-give most of the performance benefits without inhibiting migration
-though needs careful considerations for nesting of lglocks and
-consideration of deadlocks with lg_global_lock.
-
-The lg_global_lock/unlock locks all underlying spinlocks of all
-possible CPUs (including those off-line). The preemption disable/enable
-are needed in the non-RT kernels to prevent deadlocks like:
-
- on cpu 1
-
- task A task B
- lg_global_lock
- got cpu 0 lock
- <<<< preempt <<<<
- lg_local_lock_cpu for cpu 0
- spin on cpu 0 lock
-
-On -RT this deadlock scenario is resolved by the arch_spin_locks in the
-lglocks being replaced by rt_mutexes which resolve the above deadlock
-by boosting the lock-holder.
-
-
-Implementation:
----------------
-
-The initial lglock implementation from Nick Piggin used some complex
-macros to generate the lglock/brlock in lglock.h - they were later
-turned into a set of functions by Andi Kleen [7]. The change to functions
-was motivated by the presence of multiple lock users and also by them
-being easier to maintain than the generating macros. This change to
-functions is also the basis to eliminated the restriction of not
-being initializeable in kernel modules (the remaining problem is that
-locks are not explicitly initialized - see lockdep-design.txt)
-
-Declaration and initialization:
--------------------------------
-
- #include <linux/lglock.h>
-
- DEFINE_LGLOCK(name)
- or:
- DEFINE_STATIC_LGLOCK(name);
-
- lg_lock_init(&name, "lockdep_name_string");
-
- on UP this is mapped to DEFINE_SPINLOCK(name) in both cases, note
- also that as of 3.18-rc6 all declaration in use are of the _STATIC_
- variant (and it seems that the non-static was never in use).
- lg_lock_init is initializing the lockdep map only.
-
-Usage:
-------
-
-From the locking semantics it is a spinlock. It could be called a
-locality aware spinlock. lg_local_* behaves like a per_cpu
-spinlock and lg_global_* like a global spinlock.
-No surprises in the API.
-
- lg_local_lock(*lglock);
- access to protected per_cpu object on this CPU
- lg_local_unlock(*lglock);
-
- lg_local_lock_cpu(*lglock, cpu);
- access to protected per_cpu object on other CPU cpu
- lg_local_unlock_cpu(*lglock, cpu);
-
- lg_global_lock(*lglock);
- access all protected per_cpu objects on all CPUs
- lg_global_unlock(*lglock);
-
- There are no _trylock variants of the lglocks.
-
-Note that the lg_global_lock/unlock has to iterate over all possible
-CPUs rather than the actually present CPUs or a CPU could go off-line
-with a held lock [4] and that makes it very expensive. A discussion on
-these issues can be found at [5]
-
-Constraints:
-------------
-
- * currently the declaration of lglocks in kernel modules is not
- possible, though this should be doable with little change.
- * lglocks are not recursive.
- * suitable for code that can do most operations on the CPU local
- data and will very rarely need the global lock
- * lg_global_lock/unlock is *very* expensive and does not scale
- * on UP systems all lg_* primitives are simply spinlocks
- * in PREEMPT_RT the spinlock becomes an rt-mutex and can sleep but
- does not change the tasks state while sleeping [6].
- * in PREEMPT_RT the preempt_disable/enable in lg_local_lock/unlock
- is downgraded to a migrate_disable/enable, the other
- preempt_disable/enable are downgraded to barriers [6].
- The deadlock noted for non-RT above is resolved due to rt_mutexes
- boosting the lock-holder in this case which arch_spin_locks do
- not do.
-
-lglocks were designed for very specific problems in the VFS and probably
-only are the right answer in these corner cases. Any new user that looks
-at lglocks probably wants to look at the seqlock and RCU alternatives as
-her first choice. There are also efforts to resolve the RCU issues that
-currently prevent using RCU in place of view remaining lglocks.
-
-Note on brlock history:
------------------------
-
-The 'Big Reader' read-write spinlocks were originally introduced by
-Ingo Molnar in 2000 (2.4/2.5 kernel series) and removed in 2003. They
-later were introduced by the VFS scalability patch set in 2.6 series
-again as the "big reader lock" brlock [2] variant of lglock which has
-been replaced by seqlock primitives or by RCU based primitives in the
-3.13 kernel series as was suggested in [3] in 2003. The brlock was
-entirely removed in the 3.13 kernel series.
-
-Link: 1 http://lkml.org/lkml/2010/8/2/81
-Link: 2 http://lwn.net/Articles/401738/
-Link: 3 http://lkml.org/lkml/2003/3/9/205
-Link: 4 https://lkml.org/lkml/2011/8/24/185
-Link: 5 http://lkml.org/lkml/2011/12/18/189
-Link: 6 https://www.kernel.org/pub/linux/kernel/projects/rt/
- patch series - lglocks-rt.patch.patch
-Link: 7 http://lkml.org/lkml/2012/3/5/26
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,6 @@
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
-#include <linux/lglock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
--- a/include/linux/lglock.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Specialised local-global spinlock. Can only be declared as global variables
- * to avoid overhead and keep things simple (and we don't want to start using
- * these inside dynamically allocated structures).
- *
- * "local/global locks" (lglocks) can be used to:
- *
- * - Provide fast exclusive access to per-CPU data, with exclusive access to
- * another CPU's data allowed but possibly subject to contention, and to
- * provide very slow exclusive access to all per-CPU data.
- * - Or to provide very fast and scalable read serialisation, and to provide
- * very slow exclusive serialisation of data (not necessarily per-CPU data).
- *
- * Brlocks are also implemented as a short-hand notation for the latter use
- * case.
- *
- * Copyright 2009, 2010, Nick Piggin, Novell Inc.
- */
-#ifndef __LINUX_LGLOCK_H
-#define __LINUX_LGLOCK_H
-
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
-#include <linux/percpu.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#define LOCKDEP_INIT_MAP lockdep_init_map
-#else
-#define LOCKDEP_INIT_MAP(a, b, c, d)
-#endif
-
-struct lglock {
- arch_spinlock_t __percpu *lock;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lock_class_key lock_key;
- struct lockdep_map lock_dep_map;
-#endif
-};
-
-#define DEFINE_LGLOCK(name) \
- static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
- = __ARCH_SPIN_LOCK_UNLOCKED; \
- struct lglock name = { .lock = &name ## _lock }
-
-#define DEFINE_STATIC_LGLOCK(name) \
- static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
- = __ARCH_SPIN_LOCK_UNLOCKED; \
- static struct lglock name = { .lock = &name ## _lock }
-
-void lg_lock_init(struct lglock *lg, char *name);
-
-void lg_local_lock(struct lglock *lg);
-void lg_local_unlock(struct lglock *lg);
-void lg_local_lock_cpu(struct lglock *lg, int cpu);
-void lg_local_unlock_cpu(struct lglock *lg, int cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
-
-void lg_global_lock(struct lglock *lg);
-void lg_global_unlock(struct lglock *lg);
-
-#else
-/* When !CONFIG_SMP, map lglock to spinlock */
-#define lglock spinlock
-#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name)
-#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name)
-#define lg_lock_init(lg, name) spin_lock_init(lg)
-#define lg_local_lock spin_lock
-#define lg_local_unlock spin_unlock
-#define lg_local_lock_cpu(lg, cpu) spin_lock(lg)
-#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg)
-#define lg_global_lock spin_lock
-#define lg_global_unlock spin_unlock
-#endif
-
-#endif
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
-obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/* See include/linux/lglock.h for description */
-#include <linux/module.h>
-#include <linux/lglock.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-
-/*
- * Note there is no uninit, so lglocks cannot be defined in
- * modules (but it's fine to use them from there)
- * Could be added though, just undo lg_lock_init
- */
-
-void lg_lock_init(struct lglock *lg, char *name)
-{
- LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
-}
-EXPORT_SYMBOL(lg_lock_init);
-
-void lg_local_lock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock);
-
-void lg_local_unlock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock);
-
-void lg_local_lock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock_cpu);
-
-void lg_local_unlock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock_cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
-{
- BUG_ON(cpu1 == cpu2);
-
- /* lock in cpu order, just like lg_global_lock */
- if (cpu2 < cpu1)
- swap(cpu1, cpu2);
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
-}
-
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
-{
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
- preempt_enable();
-}
-
-void lg_global_lock(struct lglock *lg)
-{
- int i;
-
- preempt_disable();
- lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_lock(lock);
- }
-}
-EXPORT_SYMBOL(lg_global_lock);
-
-void lg_global_unlock(struct lglock *lg)
-{
- int i;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_unlock(lock);
- }
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_global_unlock);


Peter Zijlstra
2015-06-22 12:27:12 UTC
Permalink
Add the new rcu_sync_ops->wait() method and the new helper,
rcu_sync_dtor().

It is needed if you are going to, say, kfree(rcu_sync_object).
It simply calls ops->wait() to "flush" the potentially pending
rcu callback.
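
A minimal sketch of the intended use (struct foo and foo_free() are
hypothetical):

	struct foo {
		struct rcu_sync_struct rss;
		/* ... */
	};

	void foo_free(struct foo *foo)
	{
		/* wait out any pending rcu_sync callback before freeing */
		rcu_sync_dtor(&foo->rss);
		kfree(foo);
	}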

Reviewed-by: Paul E. McKenney <***@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/rcusync.h | 1 +
kernel/rcu/sync.c | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -31,6 +31,7 @@ static inline bool rcu_sync_is_idle(stru
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);
+extern void rcu_sync_dtor(struct rcu_sync_struct *);

#define __RCU_SYNC_INITIALIZER(name, type) { \
.gp_state = 0, \
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,6 +10,7 @@
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ void (*wait)(void);
#ifdef CONFIG_PROVE_RCU
int (*held)(void);
#endif
@@ -17,16 +18,19 @@ static const struct {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
+ .wait = rcu_barrier,
__INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
+ .wait = rcu_barrier_sched,
__INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
+ .wait = rcu_barrier_bh,
__INIT_HELD(rcu_read_lock_bh_held)
},
};
@@ -128,3 +132,21 @@ void rcu_sync_exit(struct rcu_sync_struc
}
spin_unlock_irq(&rss->rss_lock);
}
+
+void rcu_sync_dtor(struct rcu_sync_struct *rss)
+{
+ int cb_state;
+
+ BUG_ON(rss->gp_count);
+
+ spin_lock_irq(&rss->rss_lock);
+ if (rss->cb_state == CB_REPLAY)
+ rss->cb_state = CB_PENDING;
+ cb_state = rss->cb_state;
+ spin_unlock_irq(&rss->rss_lock);
+
+ if (cb_state != CB_IDLE) {
+ gp_ops[rss->gp_type].wait();
+ BUG_ON(rss->cb_state != CB_IDLE);
+ }
+}


Peter Zijlstra
2015-06-22 12:27:20 UTC
Permalink
Replace the global part of the lglock with a percpu-rwsem.

Since flc_lock is a spinlock and itself nests under i_lock, which too
is a spinlock, we cannot acquire sleeping locks at
locks_{insert,remove}_global_locks().

We can however wrap all flc_lock acquisitions with percpu_down_read
such that all invocations of locks_{insert,remove}_global_locks() have
that read lock held.

This allows us to replace the lg_global part of the lglock with the
write side of the rwsem.

In the absence of writers, percpu_{down,up}_read() are free of atomic
instructions. This further avoids the very long preempt-disable
regions caused by lglock on larger machines.

Cc: Al Viro <***@ZenIV.linux.org.uk>
Cc: Oleg Nesterov <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
fs/locks.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

--- a/fs/locks.c
+++ b/fs/locks.c
@@ -165,6 +165,7 @@ int lease_break_time = 45;
*/
DEFINE_STATIC_LGLOCK(file_lock_lglock);
static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+static struct percpu_rw_semaphore file_rwsem;

/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -556,6 +557,8 @@ static int posix_same_owner(struct file_
/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
+ lockdep_assert_held_percpu_rwsem(&file_rwsem);
+
lg_local_lock(&file_lock_lglock);
fl->fl_link_cpu = smp_processor_id();
hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
@@ -565,6 +568,8 @@ static void locks_insert_global_locks(st
/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
+ lockdep_assert_held_percpu_rwsem(&file_rwsem);
+
/*
* Avoid taking lock if already unhashed. This is safe since this check
* is done while holding the flc_lock, and new insertions into the list
@@ -885,6 +890,7 @@ static int flock_lock_file(struct file *
return -ENOMEM;
}

+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -925,6 +931,7 @@ static int flock_lock_file(struct file *

out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -960,6 +967,7 @@ static int __posix_lock_file(struct inod
new_fl2 = locks_alloc_lock();
}

+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1131,6 +1139,7 @@ static int __posix_lock_file(struct inod
}
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1407,6 +1416,7 @@ int __break_lease(struct inode *inode, u
return error;
}

+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);

time_out_leases(inode, &dispose);
@@ -1477,6 +1487,7 @@ int __break_lease(struct inode *inode, u
}
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1630,6 +1641,7 @@ generic_add_lease(struct file *filp, lon
return -EINVAL;
}

+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1700,6 +1712,7 @@ generic_add_lease(struct file *filp, lon
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
mutex_unlock(&inode->i_mutex);
@@ -1722,6 +1735,7 @@ static int generic_delete_lease(struct f
return error;
}

+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1734,6 +1748,7 @@ static int generic_delete_lease(struct f
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -2634,6 +2649,7 @@ static void *locks_start(struct seq_file
struct locks_iterator *iter = f->private;

iter->li_pos = *pos + 1;
+ percpu_down_write(&file_rwsem);
lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
@@ -2652,6 +2668,7 @@ static void locks_stop(struct seq_file *
{
spin_unlock(&blocked_lock_lock);
lg_global_unlock(&file_lock_lglock);
+ percpu_up_write(&file_rwsem);
}

static const struct seq_operations locks_seq_operations = {
@@ -2693,6 +2710,7 @@ static int __init filelock_init(void)
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);

lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+ percpu_init_rwsem(&file_rwsem);

for_each_possible_cpu(i)
INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));


Peter Zijlstra
2015-06-22 12:27:28 UTC
Permalink
As Oleg suggested, replace file_lock_list with a structure containing
the hlist head and a spinlock.

This completely removes the lglock from fs/locks.

Cc: Al Viro <***@ZenIV.linux.org.uk>
Suggested-by: Oleg Nesterov <***@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
fs/Kconfig | 1 +
fs/locks.c | 47 +++++++++++++++++++++++++++++------------------
2 files changed, 30 insertions(+), 18 deletions(-)

--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -65,6 +65,7 @@ config EXPORTFS
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
+ select PERCPU_RWSEM
help
This option enables standard file locking support, required
for filesystems like NFS and for the flock() system
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -128,7 +128,6 @@
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
-#include <linux/lglock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>
@@ -159,12 +158,17 @@ int lease_break_time = 45;

/*
* The global file_lock_list is only used for displaying /proc/locks, so we
- * keep a list on each CPU, with each list protected by its own spinlock via
- * the file_lock_lglock. Note that alterations to the list also require that
- * the relevant flc_lock is held.
+ * keep a list on each CPU, with each list protected by its own spinlock.
+ * Global serialization is done using file_rwsem.
+ *
+ * Note that alterations to the list also require that the relevant flc_lock is
+ * held.
*/
-DEFINE_STATIC_LGLOCK(file_lock_lglock);
-static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+struct file_lock_list_struct {
+ spinlock_t lock;
+ struct hlist_head hlist;
+};
+static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
static struct percpu_rw_semaphore file_rwsem;

/*
@@ -557,17 +561,21 @@ static int posix_same_owner(struct file_
/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
+ struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
+
lockdep_assert_held_percpu_rwsem(&file_rwsem);

- lg_local_lock(&file_lock_lglock);
+ spin_lock(&fll->lock);
fl->fl_link_cpu = smp_processor_id();
- hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
- lg_local_unlock(&file_lock_lglock);
+ hlist_add_head(&fl->fl_link, &fll->hlist);
+ spin_unlock(&fll->lock);
}

/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
+ struct file_lock_list_struct *fll;
+
lockdep_assert_held_percpu_rwsem(&file_rwsem);

/*
@@ -577,9 +585,11 @@ static void locks_delete_global_locks(st
*/
if (hlist_unhashed(&fl->fl_link))
return;
- lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+
+ fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+ spin_lock(&fll->lock);
hlist_del_init(&fl->fl_link);
- lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+ spin_unlock(&fll->lock);
}

static unsigned long
@@ -2650,9 +2660,8 @@ static void *locks_start(struct seq_file

iter->li_pos = *pos + 1;
percpu_down_write(&file_rwsem);
- lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
- return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+ return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
}

static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
@@ -2660,14 +2669,13 @@ static void *locks_next(struct seq_file
struct locks_iterator *iter = f->private;

++iter->li_pos;
- return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
+ return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}

static void locks_stop(struct seq_file *f, void *v)
__releases(&blocked_lock_lock)
{
spin_unlock(&blocked_lock_lock);
- lg_global_unlock(&file_lock_lglock);
percpu_up_write(&file_rwsem);
}

@@ -2709,11 +2717,14 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);

- lg_lock_init(&file_lock_lglock, "file_lock_lglock");
percpu_init_rwsem(&file_rwsem);

- for_each_possible_cpu(i)
- INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+ for_each_possible_cpu(i) {
+ struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
+
+ spin_lock_init(&fll->lock);
+ INIT_HLIST_HEAD(&fll->hlist);
+ }

return 0;
}


Oleg Nesterov
2015-06-23 00:21:26 UTC
Permalink
Off-topic question,
Post by Peter Zijlstra
@@ -2650,9 +2660,8 @@ static void *locks_start(struct seq_file
iter->li_pos = *pos + 1;
percpu_down_write(&file_rwsem);
- lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
- return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+ return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
}
..
Post by Peter Zijlstra
static void locks_stop(struct seq_file *f, void *v)
__releases(&blocked_lock_lock)
{
spin_unlock(&blocked_lock_lock);
With or without this patch, why do locks_start/locks_stop need to
take/drop blocked_lock_lock?

Oleg.

Peter Zijlstra
2015-06-22 12:27:32 UTC
Permalink
Currently the percpu-rwsem has two issues:

- it switches to (global) atomic ops while a writer is waiting;
which could be quite a while and slows down releasing the readers.

- it employs synchronize_sched_expedited() _twice_ which is evil and
should die -- it shoots IPIs around the machine.

This patch cures the first problem by ordering the reader-state vs
reader-count (see the comments in __percpu_down_read() and
percpu_down_write()). This changes a global atomic op into a full
memory barrier, which doesn't have the global cacheline contention.

It cures the second problem by employing the rcu-sync primitives by
Oleg which reduces to no sync_sched() calls in the 'normal' case of
no write contention -- global locks had better be rare, and has a
maximum of one sync_sched() call in case of contention.

Signed-off-by: Peter Zijlstra (Intel) <***@infradead.org>
---
include/linux/percpu-rwsem.h | 62 +++++++++-
kernel/locking/percpu-rwsem.c | 243 ++++++++++++++++++++++--------------------
2 files changed, 182 insertions(+), 123 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,18 +5,64 @@
#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
+#include <linux/rcusync.h>
#include <linux/lockdep.h>

struct percpu_rw_semaphore {
- unsigned int __percpu *fast_read_ctr;
- atomic_t write_ctr;
+ unsigned int __percpu *refcount;
+ int state;
+ struct rcu_sync_struct rss;
+ wait_queue_head_t writer;
struct rw_semaphore rw_sem;
- atomic_t slow_read_ctr;
- wait_queue_head_t write_waitq;
};

-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ might_sleep();
+
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+ preempt_disable();
+ /*
+ * We are in an RCU-sched read-side critical section, so the writer
+ * cannot both change sem->state from readers_fast and start
+ * checking counters while we are here. So if we see !sem->state,
+ * we know that the writer won't be checking until we past the
+ * preempt_enable() and that once the synchronize_sched() is done, the
+ * writer will see anything we did within this RCU-sched read-side
+ * critical section.
+ */
+ __this_cpu_inc(*sem->refcount);
+ if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ __percpu_down_read(sem); /* Unconditional memory barrier. */
+ preempt_enable();
+ /*
+ * The barrier() from preempt_enable() prevents the compiler from
+ * bleeding the critical section out.
+ */
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ /*
+ * The barrier() in preempt_disable() prevents the compiler from
+ * bleeding the critical section out.
+ */
+ preempt_disable();
+ /*
+ * Same as in percpu_down_read().
+ */
+ if (likely(rcu_sync_is_idle(&sem->rss)))
+ __this_cpu_dec(*sem->refcount);
+ else
+ __percpu_up_read(sem); /* Unconditional memory barrier. */
+ preempt_enable();
+
+ rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}

extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);
@@ -25,10 +71,10 @@ extern int __percpu_init_rwsem(struct pe
const char *, struct lock_class_key *);
extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

-#define percpu_init_rwsem(brw) \
+#define percpu_init_rwsem(sem) \
({ \
static struct lock_class_key rwsem_key; \
- __percpu_init_rwsem(brw, #brw, &rwsem_key); \
+ __percpu_init_rwsem(sem, #sem, &rwsem_key); \
})

#endif
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,158 +8,171 @@
#include <linux/sched.h>
#include <linux/errno.h>

-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+enum { readers_slow, readers_block };
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->refcount = alloc_percpu(unsigned int);
+ if (unlikely(!sem->refcount))
return -ENOMEM;

- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- atomic_set(&brw->write_ctr, 0);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ sem->state = readers_slow;
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ init_waitqueue_head(&sem->writer);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+
return 0;
}

-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->refcount);
+ sem->refcount = NULL; /* catch use after free bugs */
}

-/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- * R_W: down_write() comes after up_read(), the writer should see all
- * changes done by the reader
- * or
- * W_R: down_read() comes after up_write(), the reader should see all
- * changes done by the writer
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+void __percpu_down_read(struct percpu_rw_semaphore *sem)
{
- bool success = false;
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * And yes, if the reader misses the writer's assignment of
+ * readers_block to sem->state, then the writer is
+ * guaranteed to see the reader's increment. Conversely, any
+ * readers that increment their sem->refcount after the
+ * writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->refcount, so that it doesn't matter
+ * that the writer missed them.
+ */
+
+ smp_mb(); /* A matches D */
+
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(smp_load_acquire(&sem->state) != readers_block))
+ return;
+
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
+
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
+
+ /*
+ * Avoid lockdep for the down/up_read(); we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ __this_cpu_inc(*sem->refcount);
+ __up_read(&sem->rw_sem);

preempt_disable();
- if (likely(!atomic_read(&brw->write_ctr))) {
- __this_cpu_add(*brw->fast_read_ctr, val);
- success = true;
- }
- preempt_enable();
+}
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ this_cpu_dec(*sem->refcount);

- return success;
+ /* Prod writer to recheck readers_active */
+ wake_up(&sem->writer);
}

+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
+ * Return true if the modular sum of the sem->refcount per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
*/
-void percpu_down_read(struct percpu_rw_semaphore *brw)
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- might_sleep();
- if (likely(update_fast_ctr(brw, +1))) {
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
- return;
- }
+ if (per_cpu_sum(*sem->refcount) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
+
+ smp_mb(); /* C matches B */

- down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- /* avoid up_read()->rwsem_release() */
- __up_read(&brw->rw_sem);
+ return true;
}

-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+ down_write(&sem->rw_sem);

- if (likely(update_fast_ctr(brw, -1)))
- return;
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);

- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
-}
+ /*
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
+ */
+ sem->state = readers_block;

-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
- unsigned int sum = 0;
- int cpu;
+ smp_mb(); /* D matches A */

- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
+ /*
+ * If they don't see our write of readers_block to sem->state,
+ * then we are guaranteed to see their sem->refcount increment, and
+ * therefore will wait for them.
+ */

- return sum;
+ /* Wait for all now active readers to complete. */
+ wait_event(sem->writer, readers_active_check(sem));
}

-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* tell update_fast_ctr() there is a pending writer */
- atomic_inc(&brw->write_ctr);
/*
- * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
- * so that update_fast_ctr() can't succeed.
+ * Signal the writer is done, no fast path yet.
*
- * 2. Ensures we see the result of every previous this_cpu_add() in
- * update_fast_ctr().
+ * One reason that we cannot just immediately flip to readers_fast is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
*
- * 3. Ensures that if any reader has exited its critical section via
- * fast-path, it executes a full memory barrier before we return.
- * See R_W case in the comment above update_fast_ctr().
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
*/
- synchronize_sched_expedited();
+ smp_store_release(&sem->state, readers_slow);

- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
-
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
-
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
-}
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);

-void percpu_up_write(struct percpu_rw_semaphore *brw)
-{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
/*
- * Insert the barrier before the next fast-path in down_read,
- * see W_R case in the comment above update_fast_ctr().
+ * Once this completes (at least one RCU grace period hence) the reader
+ * fast path will be available again. Safe to use outside the exclusive
+ * write lock because its counting.
*/
- synchronize_sched_expedited();
- /* the last writer unblocks update_fast_ctr() */
- atomic_dec(&brw->write_ctr);
+ rcu_sync_exit(&sem->rss);
}
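
For reference, a minimal usage sketch of the reworked API as exposed by the
hunks above; the semaphore name and the two critical sections are made up for
illustration, and percpu_init_rwsem() is assumed to have been called at init
time:

	static struct percpu_rw_semaphore foo_sem;	/* percpu_init_rwsem(&foo_sem) at init */

	static void foo_read_side(void)
	{
		percpu_down_read(&foo_sem);	/* fast path: per-cpu increment, no global atomics */
		/* ... read-side critical section ... */
		percpu_up_read(&foo_sem);
	}

	static void foo_write_side(void)
	{
		percpu_down_write(&foo_sem);	/* rcu_sync_enter() + wait for active readers */
		/* ... exclusive critical section ... */
		percpu_up_write(&foo_sem);
	}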


Oleg Nesterov
2015-06-22 23:03:45 UTC
Permalink
Post by Peter Zijlstra
+enum { readers_slow, readers_block };
I still think this enum doesn't make sense, and percpu_rw_semaphore->state
should be a boolean. But this is really minor and subjective.

Reviewed-by: Oleg Nesterov <***@redhat.com>
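
For illustration, a sketch of what that suggestion could look like (purely
hypothetical; the posted patch keeps the enum and the int state field):

	struct percpu_rw_semaphore {
		unsigned int __percpu	*refcount;
		bool			readers_block;	/* replaces 'int state' and the enum */
		struct rcu_sync_struct	rss;
		wait_queue_head_t	writer;
		struct rw_semaphore	rw_sem;
	};

	/* the reader slow-path check in __percpu_down_read() then reads: */
	if (likely(!smp_load_acquire(&sem->readers_block)))
		return;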

Nicholas Mc Guire
2015-06-23 07:28:24 UTC
Permalink
A bit off-topic probably
but maybe this should not be in kernel/locking/percpu-rwsem.c but in a
generic percpu location as this construct is present in the core a few times
at least in:
kernel/irq/irqdesc.c:kstat_irqs
kernel/fork.c:nr_processes
mm/memcontrol.c:mem_cgroup_read_events
mm/memcontrol.c:mem_cgroup_read_stat
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
so maybe put it into include/linux/percpu.h ?

thx!
hofrat
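
To make the suggestion concrete, here is a hypothetical sketch of one of the
sites named above, kernel/fork.c's nr_processes(), rewritten on top of the
helper; today it open-codes the same for_each_possible_cpu() summing loop, and
the unsigned long vs int mismatch Peter mentions below is ignored here:

	int nr_processes(void)
	{
		/* process_counts is a per-cpu counter; per_cpu_sum() replaces
		 * the open-coded for_each_possible_cpu() loop. */
		return per_cpu_sum(process_counts);
	}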
Peter Zijlstra
2015-06-25 19:08:23 UTC
Permalink
Post by Nicholas Mc Guire
A bit off-topic probably
but maybe this should not be in kernel/locking/percpu-rwsem.c but in a
generic percpu location as this construct is present in the core a few times
kernel/irq/irqdesc.c:kstat_irqs
kernel/fork.c:nr_processes
That has an odd unsigned long vs int fail, but yes.
Post by Nicholas Mc Guire
mm/memcontrol.c:mem_cgroup_read_events
mm/memcontrol.c:mem_cgroup_read_stat
Those seem to be hotplug challenged. I'm thinking dropping that
nocpu_base.count[] crap and just iterating all possible CPUs would've
been much easier.
Post by Nicholas Mc Guire
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
so maybe put it into include/linux/percpu.h ?
Yes I can do that.

We can try and use it more after that; there seem to be loads of places
that could use this: fs/namespace.c, fs/inode.c, etc.
Tejun Heo
2015-06-25 19:17:19 UTC
Permalink
Hello,
Post by Peter Zijlstra
Post by Nicholas Mc Guire
mm/memcontrol.c:mem_cgroup_read_events
mm/memcontrol.c:mem_cgroup_read_stat
Those seem to be hotplug challenged. I'm thinking dropping that
nocpu_base.count[] crap and just iterating all possible CPUs would've
been much easier.
A patch doing that is already queued for this merge window. IIRC,
it's included as part of cgroup writeback updates.
Post by Peter Zijlstra
Post by Nicholas Mc Guire
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
so maybe put it into include/linux/percpu.h ?
percpu-defs.h would be the better place for it.
Post by Peter Zijlstra
Yes I can do that.
We can try and use it more after that, there seems to be loads of places
that could use this fs/namespace.c fs/inode.c etc..
Hmmm... the only worry I have about this is people using it on u64 on
32bit machines. CPU local ops can do split updates on lower and upper
halves and the remotely-read value will be surprising. We have the
same issues w/ regular per_cpu accesses too, but the summing function /
macro is better at giving a false sense of security. Prolly
limiting it up to ulong size is a good idea?

Thanks.
--
tejun
Peter Zijlstra
2015-06-29 09:32:37 UTC
Permalink
Post by Tejun Heo
Hmmm... the only worry I have about this is people using it on u64 on
32bit machines. CPU local ops can do split updates on lower and upper
halves and the remotely-read value will be surprising. We have the
same issues w/ regular per_cpu accesses to but the summing function /
macro is better at giving the false sense of security. Prolly
limiting it upto ulong size is a good idea?
Agreed, luckily we already have the infrastructure for this, something
like so?

--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -287,6 +287,16 @@ do { \
preempt_enable(); \
} while (0)

+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
/*
* Branching function to split up a function into a set of functions that
* are called for different scalar sizes of the objects handled.
Tejun Heo
2015-06-29 15:12:35 UTC
Permalink
Hello, Peter.
Post by Peter Zijlstra
Agreed, luckily we already have the infrastructure for this, something
like so?
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -287,6 +287,16 @@ do { \
preempt_enable(); \
} while (0)
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
Why not __cpu?
Post by Peter Zijlstra
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
But other than that, looks good to me.

Thanks.
--
tejun
Peter Zijlstra
2015-06-29 15:14:31 UTC
Permalink
Post by Tejun Heo
Hello, Peter.
Post by Peter Zijlstra
Agreed, luckily we already have the infrastructure for this, something
like so?
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -287,6 +287,16 @@ do { \
preempt_enable(); \
} while (0)
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
Why not __cpu?
I've no idea, __cpu is indeed more consistent, consider it changed.
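
Putting the review comments together, the helper as agreed in this subthread
would presumably end up looking like this (a sketch; the final in-tree form
may differ):

	#define per_cpu_sum(var)						\
	({									\
		typeof(var) __sum = 0;						\
		int __cpu;							\
		compiletime_assert_atomic_type(__sum);				\
		for_each_possible_cpu(__cpu)					\
			__sum += per_cpu(var, __cpu);				\
		__sum;								\
	})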
Peter Zijlstra
2015-06-22 12:36:53 UTC
Permalink
I forgot to re-instate "From: Oleg Nesterov" on the first 4 patches.

Sorry about that. I'll take more care with the next posting.
Daniel Wagner
2015-06-22 18:11:38 UTC
Permalink
Also, since Linus thinks lglocks is a failed locking primitive (which I
wholeheartedly agree with, its preempt-disable latencies are an abomination), it
also converts the global part of fs/locks's usage of lglock over to a
percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
another (4th) percpu-rwsem user and removes an lglock user.
I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
These microbenches exercise the fs' locks a bit.

I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
kernel boots fine and doesn't explode... so far...

The results aren't looking too bad. Though building a kernel with 'make -j200'
was extreme slow. I'll look into it tomorrow.

https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary

flock01
mean variance sigma max min
4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026


flock02
mean variance sigma max min
4.1.0 7.0197 1.1812 1.0868 10.6188 5.1706
percpu-rwsem 9.3194 1.3443 1.1594 11.5902 6.6138


lease01
mean variance sigma max min
4.1.0 41.8361 23.8462 4.8833 51.3493 28.5859
percpu-rwsem 40.2738 20.8323 4.5642 49.6037 28.0704


lease02
mean variance sigma max min
4.1.0 71.2159 12.7763 3.5744 77.8432 58.0390
percpu-rwsem 71.4312 14.7688 3.8430 76.5036 57.8615


posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073


posix02
mean variance sigma max min
4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751


posix03
mean variance sigma max min
4.1.0 0.9121 0.0000 0.0000 0.9121 0.9121
percpu-rwsem 0.9379 0.0000 0.0000 0.9379 0.9379


posix04
mean variance sigma max min
4.1.0 0.0703 0.0044 0.0664 0.6764 0.0437
percpu-rwsem 0.0675 0.0007 0.0267 0.3236 0.0491


cheers,
daniel

Peter Zijlstra
2015-06-22 19:06:12 UTC
Permalink
Post by Daniel Wagner
Also, since Linus thinks lglocks is a failed locking primitive (which I
wholeheartedly agree with, its preempt-disable latencies are an abomination), it
also converts the global part of fs/locks's usage of lglock over to a
percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
another (4th) percpu-rwsem user and removes an lglock user.
I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
These microbenches exercise the fs' locks a bit.
I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
kernel boots fine and doesn't explode... so far...
It's against tip/master, although I expect the locking/core bits that
were sent to Linus earlier today to be the biggest missing piece.

All I really did was build a kernel with lockdep enabled and boot +
build a kernel to see it didn't go belly up.
Post by Daniel Wagner
The results aren't looking too bad. Though building a kernel with 'make -j200'
was extreme slow. I'll look into it tomorrow.
https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary
Sweet, I wasn't aware these existed. I'll go have a play.
Post by Daniel Wagner
posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
posix02
mean variance sigma max min
4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
These two seem to hurt, lemme go look at what they do.
Daniel Wagner
2015-06-23 09:35:47 UTC
Permalink
Post by Peter Zijlstra
Post by Daniel Wagner
Also, since Linus thinks lglocks is a failed locking primitive (which I
wholeheartedly agree with, its preempt-disable latencies are an abomination), it
also converts the global part of fs/locks's usage of lglock over to a
percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
another (4th) percpu-rwsem user and removes an lglock user.
I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
These microbenches exercise the fs' locks a bit.
I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
kernel boots fine and doesn't explode... so far...
It's against tip/master, although I expect the locking/core bits that
were sent to Linus earlier today to be the biggest missing piece.
All I really did was build a kernel with lockdep enabled and boot +
build a kernel to see it didn't go belly up.
Post by Daniel Wagner
The results aren't looking too bad. Though building a kernel with 'make -j200'
was extreme slow. I'll look into it tomorrow.
So this turns out to be a false alarm. I had icecream installed/activated
and that interfered with gcc. Stupid me.

The machine has 0.5TB memory and doesn't seem to be really concerned about
'make -j200'

make clean && time make -j200

mainline 4.1.0
2nd run
real 1m7.595s
user 28m43.125s
sys 3m48.189s


tip v4.1-2756-ge3d06bd
2nd run
real 1m6.871s
user 28m50.803s
sys 3m50.223s
3rd run
real 1m6.974s
user 28m52.093s
sys 3m50.259s


tip v4.1-2769-g6ce2591 (percpu-rwsem)
2nd run
real 1m7.847s
user 29m0.439s
sys 3m51.181s
3rd run
real 1m7.113s
user 29m3.127s
sys 3m51.516s



Compared to 'make -j64' on tip v4.1-2756-ge3d06bd
2nd run
real 1m7.605s
user 28m3.121s
sys 3m52.541s
Post by Peter Zijlstra
Post by Daniel Wagner
https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary
Sweet, I wasn't aware these existed. I'll go have a play.
Post by Daniel Wagner
posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
posix02
mean variance sigma max min
4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
These two seem to hurt, lemme go look at what they do.
Now here are the same tests with tip and tip+percpu-rwsem. The patches
applied cleanly :)

I put all the raw data here[1] in case someone is interested. Some of the
test runs behave a bit strangely, running extremely fast compared to the other
runs. That is probably the result of me trying to reduce the run time to the
minimum.


flock01
mean variance sigma max min
4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
4.1.0+percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
tip 6.8390 329.3037 18.1467 81.0373 0.0021
tip+percpu-rwsem 10.0870 546.7435 23.3825 106.2396 0.0026


flock02
mean variance sigma max min
4.1.0 7.0197 1.1812 1.0868 10.6188 5.1706
4.1.0+percpu-rwsem 9.3194 1.3443 1.1594 11.5902 6.6138
tip 7.1057 1.6719 1.2930 11.2362 5.1434
tip+percpu-rwsem 9.0357 1.9874 1.4097 14.0254 6.4380


lease01
mean variance sigma max min
4.1.0 41.8361 23.8462 4.8833 51.3493 28.5859
4.1.0+percpu-rwsem 40.2738 20.8323 4.5642 49.6037 28.0704
tip 30.2617 13.0900 3.6180 36.6398 20.2085
tip+percpu-rwsem 31.2730 17.9787 4.2401 37.8981 19.2944


lease02
mean variance sigma max min
4.1.0 71.2159 12.7763 3.5744 77.8432 58.0390
4.1.0+percpu-rwsem 71.4312 14.7688 3.8430 76.5036 57.8615
tip 20.2019 5.2042 2.2813 23.1071 13.4647
tip+percpu-rwsem 20.8305 6.6631 2.5813 23.8034 11.2815


posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
4.1.0+percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
tip 129.2736 23752.7122 154.1191 474.0604 0.0063
tip+percpu-rwsem 142.6474 24732.1571 157.2646 468.7478 0.0072


posix02
mean variance sigma max min
4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
4.1.0+percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
tip 13.2810 5.3958 2.3229 20.1243 8.9361
tip+percpu-rwsem 15.6802 4.7514 2.1798 21.5704 9.4074


posix03
mean variance sigma max min
4.1.0 0.9121 0.0000 0.0000 0.9121 0.9121
4.1.0+percpu-rwsem 0.9379 0.0000 0.0000 0.9379 0.9379
tip 0.8647 0.0009 0.0297 0.9274 0.7995
tip+percpu-rwsem 0.8147 0.0003 0.0161 0.8530 0.7824


posix04
mean variance sigma max min
4.1.0 0.0703 0.0044 0.0664 0.6764 0.0437
4.1.0+percpu-rwsem 0.0675 0.0007 0.0267 0.3236 0.0491
tip 0.0618 0.0027 0.0521 0.5642 0.0453
tip+percpu-rwsem 0.0658 0.0003 0.0175 0.1793 0.0493


cheers,
daniel

[1] http://monom.org/percpu-rwsem/
Ingo Molnar
2015-06-23 10:01:12 UTC
Permalink
Post by Daniel Wagner
The machine has 0.5TB memory and doesn't seem to be really concerned about
'make -j200'
make clean && time make -j200
mainline 4.1.0
2nd run
real 1m7.595s
user 28m43.125s
sys 3m48.189s
tip v4.1-2756-ge3d06bd
2nd run
real 1m6.871s
user 28m50.803s
sys 3m50.223s
3rd run
real 1m6.974s
user 28m52.093s
sys 3m50.259s
tip v4.1-2769-g6ce2591 (percpu-rwsem)
2nd run
real 1m7.847s
user 29m0.439s
sys 3m51.181s
3rd run
real 1m7.113s
user 29m3.127s
sys 3m51.516s
Compared to 'make -j64' on tip v4.1-2756-ge3d06bd
2nd run
real 1m7.605s
user 28m3.121s
sys 3m52.541s
Btw., instead of just listing the raw runs, you can get automatic average and
stddev numbers with this:

$ perf stat --null --repeat 5 --pre 'make clean' --post 'sync' make -j200

Performance counter stats for 'make -j200' (3 runs):

29.068162979 seconds time elapsed ( +- 0.27% )

Thanks,

Ingo
Peter Zijlstra
2015-06-23 14:34:30 UTC
Permalink
Post by Daniel Wagner
flock01
mean variance sigma max min
4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
4.1.0+percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
tip 6.8390 329.3037 18.1467 81.0373 0.0021
tip+percpu-rwsem 10.0870 546.7435 23.3825 106.2396 0.0026
posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
4.1.0+percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
tip 129.2736 23752.7122 154.1191 474.0604 0.0063
tip+percpu-rwsem 142.6474 24732.1571 157.2646 468.7478 0.0072
Both these tests are incredibly unstable for me (as well as for you it
appears). Variance is through the roof on them.

I get runtimes like:

***@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a
0.266157011
***@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a
139.303399960

That's not really inspiring. If I use bigger loop counts it more or less
settles, but then the EX is unusable because it ends up running 3000
seconds per test.

In any case, on a smaller box (ivb-ep) I got the below results:

posix01
mean variance sigma max min
data-4.1.0-02756-ge3d06bd 250.7032 40.4864 6.3629 263.7736 238.5192
data-4.1.0-02756-ge3d06bd-dirty 252.6847 35.8953 5.9913 270.1679 233.0215

Which looks better, but the difference is still well within the variance
and thus not significant.

Lemme continue playing with this for a bit more.
Daniel Wagner
2015-06-23 14:56:56 UTC
Permalink
Post by Peter Zijlstra
Post by Daniel Wagner
flock01
mean variance sigma max min
4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
4.1.0+percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
tip 6.8390 329.3037 18.1467 81.0373 0.0021
tip+percpu-rwsem 10.0870 546.7435 23.3825 106.2396 0.0026
posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
4.1.0+percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
tip 129.2736 23752.7122 154.1191 474.0604 0.0063
tip+percpu-rwsem 142.6474 24732.1571 157.2646 468.7478 0.0072
Both these tests are incredibly unstable for me (as well as for you it
appears). Variance is through the roof on them.
Since on my test machine not all 4 sockets have a direct interconnect, I pinned
the tests down to one socket to see if that reduces the variance.

Except for flock01 and posix01, the tests now show really low variances (3 runs):

[...]
flock02
mean variance sigma max min
tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992


lease01
mean variance sigma max min
tip-1 0.3424 0.0001 0.0110 0.3644 0.3088
tip-2 0.3627 0.0003 0.0185 0.4140 0.3312
tip-3 0.3446 0.0002 0.0125 0.3851 0.3155
tip+percpu-rswem-1 0.3464 0.0001 0.0116 0.3781 0.3113
tip+percpu-rswem-2 0.3597 0.0003 0.0162 0.3978 0.3250
tip+percpu-rswem-3 0.3513 0.0002 0.0151 0.3933 0.3122
[...]

So with this setup we can start to compare the numbers.
Post by Peter Zijlstra
0.266157011
139.303399960
Same here:

flock01
mean variance sigma max min
tip-1 242.6147 3632.6201 60.2712 313.3081 86.3743
tip-2 233.1934 3850.1995 62.0500 318.2716 101.2738
tip-3 223.0392 3944.5220 62.8054 318.1932 110.8155
tip+percpu-rswem-1 276.5913 2145.0510 46.3147 317.5385 156.1318
tip+percpu-rswem-2 270.7089 2735.7635 52.3045 318.9418 154.5902
tip+percpu-rswem-3 267.8207 3028.3557 55.0305 320.2987 150.9659

posix01
mean variance sigma max min
tip-1 18.8729 151.2810 12.2996 37.3563 0.0060
tip-2 17.6894 140.9982 11.8743 37.2080 0.0060
tip-3 18.7785 145.1217 12.0466 35.5001 0.0060
tip+percpu-rswem-1 18.9970 163.8856 12.8018 35.8795 0.0069
tip+percpu-rswem-2 18.9594 147.3197 12.1375 35.4404 0.0069
tip+percpu-rswem-3 18.8366 126.5831 11.2509 35.9014 0.0069


Peter Zijlstra
2015-06-23 17:50:32 UTC
Permalink
Post by Daniel Wagner
flock02
mean variance sigma max min
tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
I did indeed manage to get flock02 down to a usable level and found:

3.20 : ffffffff811ecbdf: incl %gs:0x7ee1de72(%rip) # aa58 <__preempt_count>
0.27 : ffffffff811ecbe6: mov 0xa98553(%rip),%rax # ffffffff81c85140 <file_rwsem>
10.78 : ffffffff811ecbed: incl %gs:(%rax)
0.19 : ffffffff811ecbf0: mov 0xa9855a(%rip),%edx # ffffffff81c85150 <file_rwsem+0x10>
0.00 : ffffffff811ecbf6: test %edx,%edx
0.00 : ffffffff811ecbf8: jne ffffffff811ecdd1 <flock_lock_file+0x261>
3.47 : ffffffff811ecbfe: decl %gs:0x7ee1de53(%rip) # aa58 <__preempt_count>
0.00 : ffffffff811ecc05: je ffffffff811eccec <flock_lock_file+0x17c>

Which is percpu_down_read(). Now aside from the fact that I run a
PREEMPT=y kernel, it looks like that sem->refcount increment stalls
because of the dependent load.

Manually hoisting the load very slightly improves things:

0.24 : ffffffff811ecbdf: mov 0xa9855a(%rip),%rax # ffffffff81c85140 <file_rwsem>
5.88 : ffffffff811ecbe6: incl %gs:0x7ee1de6b(%rip) # aa58 <__preempt_count>
7.94 : ffffffff811ecbed: incl %gs:(%rax)
0.30 : ffffffff811ecbf0: mov 0xa9855a(%rip),%edx # ffffffff81c85150 <file_rwsem+0x10>
0.00 : ffffffff811ecbf6: test %edx,%edx
0.00 : ffffffff811ecbf8: jne ffffffff811ecdd1 <flock_lock_file+0x261>
3.70 : ffffffff811ecbfe: decl %gs:0x7ee1de53(%rip) # aa58 <__preempt_count>
0.00 : ffffffff811ecc05: je ffffffff811eccec <flock_lock_file+0x17c>

But its not much :/

Using DEFINE_STATIC_PERCPU_RWSEM(file_rwsem) would allow GCC to omit the
sem->refcount load entirely, but it's not smart enough to see that it can
(tested 4.9 and 5.1).

---
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -35,6 +35,8 @@ extern void __percpu_up_read(struct perc

static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
{
+ unsigned int __percpu *refcount = sem->refcount;
+
might_sleep();

preempt_disable();
@@ -47,7 +49,7 @@ static inline void _percpu_down_read(str
* writer will see anything we did within this RCU-sched read-side
* critical section.
*/
- __this_cpu_inc(*sem->refcount);
+ __this_cpu_inc(*refcount);
if (unlikely(!rcu_sync_is_idle(&sem->rss)))
__percpu_down_read(sem); /* Unconditional memory barrier. */
preempt_enable();
@@ -81,6 +83,8 @@ static inline bool percpu_down_read_tryl

static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
+ unsigned int __percpu *refcount = sem->refcount;
+
/*
* The barrier() in preempt_disable() prevents the compiler from
* bleeding the critical section out.
@@ -90,7 +94,7 @@ static inline void percpu_up_read(struct
* Same as in percpu_down_read().
*/
if (likely(rcu_sync_is_idle(&sem->rss)))
- __this_cpu_dec(*sem->refcount);
+ __this_cpu_dec(*refcount);
else
__percpu_up_read(sem); /* Unconditional memory barrier. */
preempt_enable();
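
For illustration, a sketch of the static-definition alternative mentioned
above, assuming fs/locks.c used the DEFINE_STATIC_PERCPU_RWSEM() helper from
earlier in the series instead of a run-time percpu_init_rwsem(); as noted,
GCC 4.9/5.1 do not actually exploit the then-constant refcount address, so
this is only a potential win:

	/* hypothetical: define the semaphore statically */
	DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);

	/* the reader side (inlined into flock_lock_file() in the profile
	 * above) would stay unchanged: */
	percpu_down_read(&file_rwsem);
	/* ... global file_lock_list manipulation ... */
	percpu_up_read(&file_rwsem);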
Peter Zijlstra
2015-06-23 19:36:43 UTC
Permalink
Post by Daniel Wagner
flock02
mean variance sigma max min
tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
Aside from the flock_lock_file function moving up, we also get an
increase in _raw_spin_lock.

Before:

5.17% 5.17% flock02 [kernel.vmlinux] [k] _raw_spin_lock
|
---_raw_spin_lock
|
|--99.75%-- flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
--0.25%-- [...]


After:

7.20% 7.20% flock02 [kernel.vmlinux] [k] _raw_spin_lock
|
---_raw_spin_lock
|
|--52.23%-- flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
|
|--25.92%-- flock_lock_file
| flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
|
|--21.42%-- locks_delete_lock_ctx
| flock_lock_file
| flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
--0.43%-- [...]


And it's not at all clear to me why this would be. It looks like
FILE_LOCK_DEFERRED is happening, but I've not yet figured out why that
would be.
Ingo Molnar
2015-06-24 08:47:02 UTC
Permalink
Post by Peter Zijlstra
Post by Daniel Wagner
flock02
mean variance sigma max min
tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
Aside from the flock_lock_file function moving up, we also get an
increase in _raw_spin_lock.
5.17% 5.17% flock02 [kernel.vmlinux] [k] _raw_spin_lock
|
---_raw_spin_lock
|
|--99.75%-- flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
--0.25%-- [...]
7.20% 7.20% flock02 [kernel.vmlinux] [k] _raw_spin_lock
|
---_raw_spin_lock
|
|--52.23%-- flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
|
|--25.92%-- flock_lock_file
| flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
|
|--21.42%-- locks_delete_lock_ctx
| flock_lock_file
| flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
--0.43%-- [...]
And its not at all clear to me why this would be. It looks like
FILE_LOCK_DEFERRED is happening, but I've not yet figured out why that
would be.
So I'd suggest first comparing preemption behavior: does the workload
context-switch heavily, is the context-switching rate exactly the same, and
are the points of preemption the same as well between the two kernels?

[ Such high variance is often caused by (dynamically) unstable load balancing and
the workload never finding a good equilibrium. Any observable locking overhead
is usually just a second order concern or a symptom. Assuming the workload
context switches heavily. ]

Thanks,

Ingo
Peter Zijlstra
2015-06-24 09:02:56 UTC
Permalink
Post by Ingo Molnar
Post by Daniel Wagner
flock02
mean variance sigma max min
tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
[ Such high variance is often caused by (dynamically) unstable load balancing and
the workload never finding a good equilibrium. Any observable locking overhead
is usually just a second order concern or a symptom. Assuming the workload
context switches heavily. ]
flock02 is a relatively stable benchmark -- unlike some of the others
where the variance is orders of magnitude higher than the avg.

But yes, I'll go poke at it more. I just need to hunt down unrelated
fail before continuing with this.
Daniel Wagner
2015-06-24 09:18:53 UTC
Permalink
Post by Ingo Molnar
So I'd suggest to first compare preemption behavior: does the workload
context-switch heavily, and is it the exact same context switching rate and are
the points of preemption the same as well between the two kernels?
If I read this correctly, the answer is yes.

First the 'stable' flock02 test:

perf stat --repeat 5 --pre 'rm -rf /tmp/a' ~/src/lockperf/flock02 -n 128 -l 64 /tmp/a
0.008793148
0.008784990
0.008587804
0.008693641
0.008776946

Performance counter stats for '/home/wagi/src/lockperf/flock02 -n 128 -l 64 /tmp/a' (5 runs):

76.509634 task-clock (msec) # 3.312 CPUs utilized ( +- 0.67% )
2 context-switches # 0.029 K/sec ( +- 26.50% )
128 cpu-migrations # 0.002 M/sec ( +- 0.31% )
5,295 page-faults # 0.069 M/sec ( +- 0.49% )
89,944,154 cycles # 1.176 GHz ( +- 0.66% )
58,670,259 stalled-cycles-frontend # 65.23% frontend cycles idle ( +- 0.88% )
0 stalled-cycles-backend # 0.00% backend cycles idle
76,991,414 instructions # 0.86 insns per cycle
# 0.76 stalled cycles per insn ( +- 0.19% )
15,239,720 branches # 199.187 M/sec ( +- 0.20% )
103,418 branch-misses # 0.68% of all branches ( +- 6.68% )

0.023102895 seconds time elapsed ( +- 1.09% )


And here posix01 which shows high variance:

perf stat --repeat 5 --pre 'rm -rf /tmp/a' ~/src/lockperf/posix01 -n 128 -l 64 /tmp/a
0.006020402
32.510838421
55.516466069
46.794470223
5.097701438

Performance counter stats for '/home/wagi/src/lockperf/posix01 -n 128 -l 64 /tmp/a' (5 runs):

4177.932106 task-clock (msec) # 14.162 CPUs utilized ( +- 34.59% )
70,646 context-switches # 0.017 M/sec ( +- 31.56% )
28,009 cpu-migrations # 0.007 M/sec ( +- 33.55% )
4,834 page-faults # 0.001 M/sec ( +- 0.98% )
7,291,160,968 cycles # 1.745 GHz ( +- 32.17% )
5,216,204,262 stalled-cycles-frontend # 71.54% frontend cycles idle ( +- 32.13% )
0 stalled-cycles-backend # 0.00% backend cycles idle
1,901,289,780 instructions # 0.26 insns per cycle
# 2.74 stalled cycles per insn ( +- 30.80% )
440,415,914 branches # 105.415 M/sec ( +- 31.06% )
1,347,021 branch-misses # 0.31% of all branches ( +- 29.17% )

0.295016987 seconds time elapsed ( +- 32.01% )


BTW, thanks for the perf stat tip. Really handy!

cheers,
daniel
Daniel Wagner
2015-07-01 06:04:37 UTC
Permalink
Hi,

I did a sweep over the parameters for posix01. The parameters are the number
of processes and the number of locks taken per process. In contrast to the
other tests, it looks like there is no parameter set which yields a nice stable
result (read: low variance). I have tried several things, including
pinning all processes to CPUs to avoid migration. The results
improved slightly but there was still a high variance.

Anyway, I have collected some data and would like to share it. Maybe it is
still useful. All numbers here are without the above-mentioned pinning.
There are some runs missing (I don't know the reason yet) and I didn't let
it run till the end. So take these numbers with a grain of salt.

The test script and raw data can be found here:

http://monom.org/posix01/

The tables read as follows:
nproc: number of processes started
columns: number of locks taken per process

Hardware:
4x E5-4610; for this test all processes are scheduled on one socket

First the numbers for tip 4.1.0-02756-ge3d06bd.

nproc 8
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.075449 0.210547 0.340658 0.464083 0.590400
std 0.015550 0.024989 0.032080 0.043803 0.055003
min 0.021643 0.067456 0.211779 0.279643 0.327628
25% 0.065337 0.195664 0.318114 0.430040 0.546488
50% 0.075345 0.209411 0.338512 0.461397 0.591433
75% 0.084725 0.226517 0.364190 0.494638 0.626532
max 0.127050 0.281836 0.454558 0.607559 0.762149


nproc 16
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 1.023660 2.463384 3.891954 5.312716 6.752857
std 0.105065 0.124916 0.136476 0.172906 0.207449
min 0.351199 1.527379 3.106403 4.157478 5.519601
25% 0.961098 2.397597 3.807098 5.201875 6.633034
50% 1.031460 2.467317 3.895824 5.321227 6.757502
75% 1.093412 2.539284 3.985122 5.432336 6.889859
max 1.278603 2.785901 4.369434 5.798982 7.324263


nproc 24
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 3.460166 7.942193 11.898540 11.150066 11.060036
std 0.191564 0.232989 0.612868 0.680323 0.465967
min 2.748545 6.575510 9.977165 9.209685 8.937682
25% 3.325521 7.806847 11.440580 10.774070 10.912302
50% 3.493138 7.951859 11.852556 11.163595 11.074910
75% 3.596927 8.088036 12.443429 11.365197 11.243125
max 3.974884 8.589840 13.079780 16.341043 14.244954


nproc 32
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 6.797286 13.943421 14.373278 15.857103 20.047039
std 0.366013 0.417859 0.625967 0.377463 0.302939
min 3.323312 12.266006 12.492706 14.451931 17.496059
25% 6.649401 13.719397 14.186790 15.738348 19.958001
50% 6.868362 13.862458 14.312992 15.870438 20.083564
75% 6.995801 14.027167 14.429383 15.984881 20.215722
max 7.369007 15.631300 21.587450 19.364991 20.755793


nproc 40
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 11.156514 16.936808 18.930412 25.605206 32.334239
std 0.613158 0.614545 0.485336 0.344226 0.398747
min 5.609261 13.147398 16.930261 23.448985 28.992899
25% 10.999876 16.740775 18.788180 25.481274 32.188020
50% 11.251502 16.883100 18.946506 25.648879 32.369347
75% 11.439205 17.032133 19.105678 25.806715 32.565019
max 12.155905 24.116348 26.152117 26.502637 33.263763


nproc 48
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 16.523705 18.214558 27.877811 37.703763 47.655792
std 0.974732 1.118383 0.357481 0.435081 0.472945
min 7.909358 16.279568 25.989797 35.308061 45.279940
25% 16.385582 17.960832 27.729399 37.555420 47.458123
50% 16.692900 18.137635 27.920459 37.767064 47.679325
75% 16.927355 18.311502 28.092018 37.950782 47.926311
max 17.720374 35.810409 28.721941 38.746273 49.333097


nproc 56
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 11.567668 25.100333 38.603884 52.135564 65.716669
std 0.320771 0.369833 0.554834 0.534120 0.612844
min 10.123811 22.598875 35.668780 49.182148 62.504962
25% 11.394438 24.925338 38.389200 51.885988 65.441492
50% 11.593920 25.135043 38.641839 52.206010 65.771692
75% 11.789101 25.328558 38.895343 52.451819 66.068270
max 12.319346 25.948404 46.458428 53.605888 67.270679


nproc 64
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 15.421295 33.254418 51.073912 68.936111 86.919074
std 0.398493 0.411222 0.551629 0.690891 0.694183
min 13.269859 30.900978 48.174802 65.549282 83.099271
25% 15.203732 33.037478 50.821702 68.619365 86.579749
50% 15.467885 33.279869 51.130972 69.001664 86.953804
75% 15.694466 33.514712 51.380860 69.361632 87.341084
max 16.347321 34.475095 52.507292 70.884752 88.807083


nproc 72
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 19.762286 42.488827 65.167763 87.903430 110.666679
std 0.483660 0.480269 0.689872 0.828354 0.892759
min 15.506067 39.937453 61.196633 84.227403 107.014850
25% 19.519194 42.261548 64.834133 87.515837 110.225142
50% 19.809986 42.541263 65.265768 87.974049 110.747980
75% 20.083315 42.792858 65.603762 88.392599 111.223192
max 20.913434 43.830009 66.791452 90.184550 113.062344


nproc 80
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 24.782285 52.853068 80.902314 109.112294 137.441640
std 0.523731 0.639160 0.799033 0.952619 1.091478
min 20.126615 47.813274 77.357915 104.033857 131.978443
25% 24.498501 52.547855 80.509926 108.606293 136.877050
50% 24.835766 52.918841 80.950773 109.197236 137.498470
75% 25.137887 53.244013 81.376380 109.723791 138.101133
max 26.161997 54.372957 83.266046 111.709888 140.419400


nproc 88
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 30.196867 64.467080 98.710365 133.024282 167.330900
std 0.749476 0.691460 0.863908 1.033780 1.240237
min 16.647491 60.034797 94.053510 128.281171 161.778166
25% 29.896764 64.121607 98.290368 132.484092 166.711172
50% 30.271808 64.514222 98.742714 133.089852 167.429483
75% 30.627200 64.903154 99.262584 133.706735 168.086624
max 31.806051 66.343856 101.077264 136.143873 170.449596


nproc 96
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 36.304100 77.194851 117.958001 158.820159 199.868940
std 0.712442 0.718565 1.009163 1.220813 1.462219
min 31.128111 73.850226 112.075970 152.910227 192.977453
25% 35.928427 76.811233 117.466922 158.151278 199.058411
50% 36.378220 77.209148 117.998878 158.879704 199.861157
75% 36.761744 77.636286 118.615380 159.583272 200.701769
max 38.069263 79.445286 120.878239 162.826438 206.826424


nproc 104
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 42.731401 90.887253 138.815476 186.824953 235.055458
std 1.045572 0.742232 0.999065 1.298818 1.554890
min 23.734733 87.384048 133.462821 180.971966 227.475939
25% 42.353032 90.441055 138.213962 186.109237 234.169575
50% 42.861112 90.900274 138.836083 186.835884 235.084204
75% 43.236527 91.382487 139.460129 187.694247 236.011148
max 44.600281 93.394394 141.959512 190.171221 239.491909


nproc 112
100 200 300 400
count 460.000000 460.000000 460.000000 460.000000
mean 49.782729 105.468739 161.416099 217.385757
std 0.904312 1.011980 1.222772 1.475225
min 45.334285 100.711113 156.087707 210.639527
25% 49.394518 104.971028 160.743875 216.590612
50% 49.906665 105.604756 161.528712 217.437408
75% 50.363428 106.088852 162.187166 218.286111
max 51.800116 108.372299 164.614385 221.788613


And now the same tests for tip+percpu_rwsem:

nproc 8
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.285784 0.639623 0.935062 1.165287 1.457565
std 0.040458 0.089317 0.112704 0.094596 0.110337
min 0.118961 0.253775 0.351943 0.869095 1.026194
25% 0.263250 0.600806 0.858630 1.100281 1.376566
50% 0.287019 0.649395 0.930437 1.167166 1.461235
75% 0.312601 0.692013 1.013786 1.228887 1.533511
max 0.407264 0.860837 1.298671 1.460842 1.927867


nproc 16
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 2.338683 5.219408 8.117279 11.050641 14.035433
std 0.146102 0.270400 0.392875 0.510692 0.576044
min 1.836110 4.179970 6.491748 8.998336 11.442838
25% 2.239374 5.042915 7.860587 10.728740 13.667630
50% 2.335801 5.217732 8.125243 11.052183 14.010561
75% 2.443152 5.404223 8.396037 11.404375 14.417740
max 2.798029 5.927344 9.172875 12.203548 15.444552


nproc 24
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 6.399927 13.673487 20.729554 27.316864 34.125202
std 0.558388 1.157996 1.647191 2.066864 2.487975
min 4.961608 10.767524 17.145018 22.441426 28.566438
25% 5.987118 12.849801 19.555979 25.943463 32.399122
50% 6.388215 13.583983 20.533054 27.122120 33.959403
75% 6.915310 14.786835 22.252796 29.187176 36.308254
max 7.405319 15.823960 23.858206 31.754922 38.997955


nproc 32
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 11.973832 24.885823 36.705614 48.036525 57.418669
std 1.270516 2.604583 3.963139 5.283237 6.441122
min 9.395066 19.958662 27.768684 38.247046 46.265231
25% 10.955417 22.708953 33.510437 43.613011 51.901209
50% 11.801515 24.556642 35.805816 47.315635 55.933447
75% 13.294692 27.520679 40.689642 53.139912 63.860584
max 14.217272 29.968337 44.409489 58.246754 71.045867


nproc 40
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 19.307414 39.204462 55.768040 70.808627 83.830246
std 2.189803 3.982241 5.467692 6.737372 8.124025
min 14.450258 30.606836 44.342114 55.520218 64.704178
25% 17.418113 35.968251 51.341042 65.352697 77.744806
50% 19.067713 39.023460 55.548934 70.282785 83.374667
75% 21.479466 42.666118 60.379906 76.604241 91.158904
max 23.687483 47.019928 67.143361 85.084045 100.957011


nproc 48
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 28.386773 55.462523 77.886706 92.579064 104.319703
std 3.231688 6.142373 8.633285 10.950222 12.510504
min 21.703659 42.486864 56.904221 66.605689 76.529646
25% 25.635256 50.575642 71.306694 82.931995 94.222776
50% 28.136694 55.235674 77.298409 91.993559 104.909015
75% 31.484979 60.645302 85.693462 102.195018 114.141212
max 35.713537 68.342796 96.065304 115.926497 130.916876


nproc 56
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 39.037206 74.470404 97.900979 111.320283 135.943281
std 4.594741 8.940246 11.715321 13.823450 16.032080
min 29.532559 55.193557 65.590273 79.580482 98.565733
25% 35.212004 66.990273 88.066459 100.643871 122.864654
50% 38.796902 73.928176 96.771490 110.669216 136.199617
75% 43.154846 82.041731 108.937264 120.727216 147.769269
max 49.215714 92.181542 125.188702 141.113117 170.961264


nproc 64
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 51.099012 93.028015 114.649700 145.944300 178.043572
std 6.310777 12.719401 14.675830 18.019135 21.084448
min 36.770938 54.620852 80.837116 98.765936 126.207980
25% 45.955694 84.078285 103.452854 132.127548 160.746493
50% 50.275929 93.031565 114.333533 144.951788 177.105994
75% 56.955477 104.656181 128.418118 163.865640 197.275452
max 63.369715 120.360706 146.542148 182.482159 218.814651


nproc 72
100 200 300 400 500
count 506.000000 506.000000 506.000000 506.000000 506.000000
mean 64.905270 108.760098 138.811285 179.277895 222.584001
std 8.784532 16.293281 18.160401 21.203767 25.904456
min 43.035451 64.762288 96.401934 127.995159 162.341026
25% 58.658290 98.438247 126.035692 162.944645 202.228444
50% 64.756854 109.608197 139.190635 181.413255 223.359111
75% 72.488483 123.608470 152.745541 195.549278 245.454358
max 83.424516 139.214509 172.538610 218.677815 270.799895


nproc 80
100 200 300 400 500
count 61.000000 61.000000 61.000000 61.000000 61.000000
mean 76.727789 124.438489 174.095378 225.855798 272.416390
std 9.757928 18.034325 20.216132 24.868596 29.384832
min 55.988043 83.842137 130.842940 173.596051 208.508169
25% 69.218268 116.679810 162.149179 207.015727 252.194955
50% 75.392969 125.378519 173.117425 225.071270 276.188038
75% 83.748328 136.689138 192.392097 245.019530 296.407232
max 97.004966 165.172805 206.391629 266.751069 318.089290


nproc 88
100
count 157.000000
mean 90.337638
std 15.239911
min 53.393662
25% 79.648088
50% 91.075065
75% 103.530939
max 120.680507


And an attempt at visualization:



Let me know if these numbers help or not. I am starting to get better at
running those tests, though they take quite some time to finish. So if
they are useless I'll sleep well without doing this :)

cheers,
daniel
Linus Torvalds
2015-06-22 20:07:01 UTC
Permalink
It further removes the stop_machine lglock usage, and with it kills lglocks.
Ok. With all the conversions, and removal of lglock, my dislike of
this goes away.

I'm somewhat worried about Daniel's report about "building a kernel
with 'make -j200' was extreme slow", but that may be due to something
else (does the machine have enough memory for "make -j200"? The kernel
compile parallelizes so well, and gcc uses so much memory, that you
need a *lot* of memory to use things like "-j200").

But assuming that gets sorted out, and somebody looks at the few file
locking performance issues, I have no objections to this series any
more.

Linus
Davidlohr Bueso
2015-06-23 16:10:24 UTC
Permalink
This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
user.
Curious, why not also mem hotplug? It seems to use the exact same
locking mayhem as cpu.

Thanks,
Davidlohr

Peter Zijlstra
2015-06-23 16:22:12 UTC
Permalink
Post by Davidlohr Bueso
This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
user.
Curious, why not also mem hotplug? It seems to use the exact same
locking mayhem than cpu.
Because it looks like they 'forgot' to copy the notifiers and therefore
I suspect we could simplify things. We might not need the recursive
nonsense.

But I've not yet actually looked at it much.

I was indeed greatly saddened that these people copied cpu hotplug;
clearly they had not gotten the memo that cpu hotplug is a trainwreck.