Discussion:
[PATCH] oom killer (Core)
t***@linutronix.de
2004-12-01 09:49:03 UTC
Permalink
The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.

The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill

Signed-off-by: Thomas Gleixner <***@linutronix.de>

---
oom_kill.c | 113 ++++++++++++++++++++++++++++++++++++++++++-----------------
page_alloc.c | 5 ++
2 files changed, 86 insertions(+), 32 deletions(-)
---
diff -urN 2.6.10-rc2-mm4.orig/mm/oom_kill.c 2.6.10-rc2-mm4/mm/oom_kill.c
--- 2.6.10-rc2-mm4.orig/mm/oom_kill.c 2004-12-01 09:33:44.000000000 +0100
+++ 2.6.10-rc2-mm4/mm/oom_kill.c 2004-12-01 09:37:41.628396951 +0100
@@ -45,8 +45,10 @@
static unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
+ struct list_head *tsk;

- if (!p->mm)
+ /* Ignore mm-less tasks and init */
+ if (!p->mm || p->pid == 1)
return 0;

if (p->flags & PF_MEMDIE)
@@ -57,6 +59,19 @@
points = p->mm->total_vm;

/*
+ * Processes which fork a lot of child processes are likely
+ * a good choice. We add the vmsize of the childs if they
+ * have an own mm. This prevents forking servers to flood the
+ * machine with an endless amount of childs
+ */
+ list_for_each(tsk, &p->children) {
+ struct task_struct *chld;
+ chld = list_entry(tsk, struct task_struct, sibling);
+ if (chld->mm != p->mm && chld->mm)
+ points += chld->mm->total_vm;
+ }
+
+ /*
* CPU time is in tens of seconds and run time is in thousands
* of seconds. There is no particular reason for this other than
* that it turned out to work very well in practice.
@@ -176,6 +191,27 @@
return mm;
}

+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ struct mm_struct *mm;
+ struct task_struct *g, *q;
+
+ mm = oom_kill_task(p);
+ if (!mm)
+ return NULL;
+ /*
+ * kill all processes that share the ->mm (i.e. all threads),
+ * but are in a different thread group
+ */
+ do_each_thread(g, q)
+ if (q->mm == mm && q->tgid != p->tgid)
+ __oom_kill_task(q);
+
+ while_each_thread(g, q);
+ if (!p->mm)
+ printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+ return mm;
+}

/**
* oom_kill - kill the "best" process when we run out of memory
@@ -188,7 +224,9 @@
void oom_kill(void)
{
struct mm_struct *mm;
- struct task_struct *g, *p, *q;
+ struct task_struct *c, *p;
+ struct list_head *tsk;
+ int mmcnt = 0;

read_lock(&tasklist_lock);
retry:
@@ -200,21 +238,32 @@
panic("Out of memory and no killable processes...\n");
}

- mm = oom_kill_task(p);
- if (!mm)
- goto retry;
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * Kill the forked child processes first
*/
- do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
- while_each_thread(g, q);
- if (!p->mm)
- printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+ list_for_each(tsk, &p->children) {
+ c = list_entry(tsk, struct task_struct, sibling);
+ /* Do not touch threads, as they get killed later */
+ if (c->mm == p->mm)
+ continue;
+ mm = oom_kill_process(c);
+ if (mm) {
+ mmcnt ++;
+ mmput(mm);
+ }
+ }
+
+ /*
+ * If we managed to kill one or more child processes
+ * then let the parent live for now
+ */
+ if (!mmcnt) {
+ mm = oom_kill_process(p);
+ if (!mm)
+ goto retry;
+ mmput(mm);
+ }
read_unlock(&tasklist_lock);
- mmput(mm);
return;
}

@@ -224,14 +273,21 @@
void out_of_memory(int gfp_mask)
{
/*
- * oom_lock protects out_of_memory()'s static variables.
- * It's a global lock; this is not performance-critical.
- */
- static DEFINE_SPINLOCK(oom_lock);
+ * inprogress protects out_of_memory()'s static variables
+ * and prevents reentrancy.
+ */
+ static unsigned long inprogress;
+ static unsigned int freepages = 1000000;
static unsigned long first, last, count, lastkill;
unsigned long now, since;

- spin_lock(&oom_lock);
+ if (test_and_set_bit(0, &inprogress))
+ return;
+
+ /* Check, if memory was freed since the last oom kill */
+ if (freepages < nr_free_pages())
+ goto out_unlock;
+
now = jiffies;
since = now - last;
last = now;
@@ -271,12 +327,11 @@
* Ok, really out of memory. Kill something.
*/
lastkill = now;
-
- printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+ printk(KERN_ERR "oom-killer: gfp_mask=0x%x\n", gfp_mask);
show_free_areas();
-
- /* oom_kill() sleeps */
- spin_unlock(&oom_lock);
+ dump_stack();
+ /* Store free pages * 2 for the check above */
+ freepages = (nr_free_pages() << 1);
oom_kill();
/*
* Make kswapd go out of the way, so "p" has a good chance of
@@ -284,17 +339,11 @@
* for more memory.
*/
yield();
- spin_lock(&oom_lock);

reset:
- /*
- * We dropped the lock above, so check to be sure the variable
- * first only ever increases to prevent false OOM's.
- */
- if (time_after(now, first))
- first = now;
+ first = jiffies;
count = 0;

out_unlock:
- spin_unlock(&oom_lock);
+ clear_bit(0, &inprogress);
}
diff -urN 2.6.10-rc2-mm4.orig/mm/page_alloc.c 2.6.10-rc2-mm4/mm/page_alloc.c
--- 2.6.10-rc2-mm4.orig/mm/page_alloc.c 2004-12-01 09:33:44.000000000 +0100
+++ 2.6.10-rc2-mm4/mm/page_alloc.c 2004-12-01 09:36:55.010034604 +0100
@@ -872,6 +872,11 @@
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

+ /* Check if try_to_free_pages called out_of_memory and
+ * if the current task is the sacrifice */
+ if ((p->flags & PF_MEMDIE) && !in_interrupt())
+ goto nopage;
+
/* go through the zonelist yet one more time */
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
t***@sophos.com
2004-12-01 10:21:17 UTC
Permalink
Post by t***@linutronix.de
The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.
The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill
Ah, again. :) Rusty Lynch and I tried something similar at least twice but
with no avail.

We had a modular OOM killers infrastructure with two new killers. One
would just panic on OOM situation, and other did account for parents which
repeatedly spawned 'bad' children. Some people called it 'a bit right
winged'. :)

Take a look at http://linux.ursulin.net if you are interested. I did not
sync it with the latest kernel since nobody was interested. It worked for
me.
Andrea Arcangeli
2004-12-01 21:16:38 UTC
Permalink
It gets invoked multiple times [..]
You didn't move the invocation in page_alloc.c which is the major bug I
can see (besides the other hacks in oom_kill.c). I'd try fixing the
major bug first.
Thomas Gleixner
2004-12-01 22:06:07 UTC
Permalink
Post by Andrea Arcangeli
It gets invoked multiple times [..]
You didn't move the invocation in page_alloc.c which is the major bug I
can see (besides the other hacks in oom_kill.c). I'd try fixing the
major bug first.
Where do you want to move it ?

I don't buy that moving the invocation to any place will solve the
problem of multiple invocations.

The multiple invocations happen with SMP and UP + PREEMPT, when the lock
is dropped in out_of_memory()

spin_unlock(&oom_lock);
oom_kill();
spin_lock(&oom_lock);

How does moving the invocation change this ?

tglx
Andrea Arcangeli
2004-12-01 22:33:24 UTC
Permalink
Post by Thomas Gleixner
Post by Andrea Arcangeli
It gets invoked multiple times [..]
You didn't move the invocation in page_alloc.c which is the major bug I
can see (besides the other hacks in oom_kill.c). I'd try fixing the
major bug first.
Where do you want to move it ?
I don't buy that moving the invocation to any place will solve the
problem of multiple invocations.
It has to check the levels of free memory before calling oom_kill.c and
that's the usual check that alloc_pages does.
Andrea Arcangeli
2004-12-02 03:36:19 UTC
Permalink
Could you please test this? Perhaps it won't be as effective as the
hacks I nuked, but it fixed basic things for me in a quick test so I
like it more. It's important to use pages_high to avoid livelocks.

Thanks.

Signed-off-by: Andrea Arcangeli <***@suse.de>

Index: x/mm/oom_kill.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/oom_kill.c,v
retrieving revision 1.31
diff -u -p -r1.31 oom_kill.c
--- x/mm/oom_kill.c 14 Oct 2004 04:27:49 -0000 1.31
+++ x/mm/oom_kill.c 2 Dec 2004 03:34:05 -0000
@@ -229,72 +229,8 @@ retry:
*/
void out_of_memory(int gfp_mask)
{
- /*
- * oom_lock protects out_of_memory()'s static variables.
- * It's a global lock; this is not performance-critical.
- */
- static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
- static unsigned long first, last, count, lastkill;
- unsigned long now, since;
-
- spin_lock(&oom_lock);
- now = jiffies;
- since = now - last;
- last = now;
-
- /*
- * If it's been a long time since last failure,
- * we're not oom.
- */
- if (since > 5*HZ)
- goto reset;
-
- /*
- * If we haven't tried for at least one second,
- * we're not really oom.
- */
- since = now - first;
- if (since < HZ)
- goto out_unlock;
-
- /*
- * If we have gotten only a few failures,
- * we're not really oom.
- */
- if (++count < 10)
- goto out_unlock;
-
- /*
- * If we just killed a process, wait a while
- * to give that task a chance to exit. This
- * avoids killing multiple processes needlessly.
- */
- since = now - lastkill;
- if (since < HZ*5)
- goto out_unlock;
-
- /*
- * Ok, really out of memory. Kill something.
- */
- lastkill = now;
-
printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
show_free_areas();
-
- /* oom_kill() sleeps */
- spin_unlock(&oom_lock);
+ dump_stack();
oom_kill();
- spin_lock(&oom_lock);
-
-reset:
- /*
- * We dropped the lock above, so check to be sure the variable
- * first only ever increases to prevent false OOM's.
- */
- if (time_after(now, first))
- first = now;
- count = 0;
-
-out_unlock:
- spin_unlock(&oom_lock);
}
Index: x/mm/page_alloc.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/page_alloc.c,v
retrieving revision 1.236
diff -u -p -r1.236 page_alloc.c
--- x/mm/page_alloc.c 16 Nov 2004 03:53:53 -0000 1.236
+++ x/mm/page_alloc.c 2 Dec 2004 02:21:18 -0000
@@ -611,6 +611,7 @@ __alloc_pages(unsigned int gfp_mask, uns
int alloc_type;
int do_retry;
int can_try_harder;
+ int did_some_progress;

might_sleep_if(wait);

@@ -686,18 +687,19 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- try_to_free_pages(zones, gfp_mask, order);
+ did_some_progress = try_to_free_pages(zones, gfp_mask, order);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

- /* go through the zonelist yet one more time */
+ /*
+ * Go through the zonelist yet one more time, keep
+ * very high watermark here, this is only to catch
+ * a parallel oom killing, we must fail if we're still
+ * under heavy pressure.
+ */
for (i = 0; (z = zones[i]) != NULL; i++) {
- min = z->pages_min;
- if (gfp_mask & __GFP_HIGH)
- min /= 2;
- if (can_try_harder)
- min -= min / 4;
+ min = z->pages_high;
min += (1<<order) + z->protection[alloc_type];

if (z->free_pages < min)
@@ -708,6 +710,9 @@ rebalance:
goto got_pg;
}

+ if (unlikely(!did_some_progress) && (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
+ out_of_memory(gfp_mask);
+
/*
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
Index: x/mm/vmscan.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/vmscan.c,v
retrieving revision 1.225
diff -u -p -r1.225 vmscan.c
--- x/mm/vmscan.c 19 Nov 2004 22:54:22 -0000 1.225
+++ x/mm/vmscan.c 2 Dec 2004 01:56:50 -0000
@@ -935,8 +935,6 @@ int try_to_free_pages(struct zone **zone
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
out:
for (i = 0; zones[i] != 0; i++)
zones[i]->prev_priority = zones[i]->temp_priority;
Thomas Gleixner
2004-12-02 11:09:19 UTC
Permalink
Post by Andrea Arcangeli
Could you please test this? Perhaps it won't be as effective as the
hacks I nuked, but it fixed basic things for me in a quick test so I
like it more. It's important to use pages_high to avoid livelocks.
Sorry man. The behaviour is worse than before, but I expected that :)

It kills sshd, the application, the controlling terminal and some other
innocent processes like hotplug.

Take a decent UP machine enable PREEMPT and start # hackbench NN , where
NN is a number of processes, which exhausts the memory of the machine
and see yourself.

If there is the point where memory is exhausted you have to kill a
process _and_ wait until the memory is freed before even thinking about
another invocation of oom_kill().

There is usually more than one process on a machine which requests
memory. So as long as the first kill did not proceed to the point where
the memory is actually freed, you will call out_of_memory() from
different processes or kernel functions and make the machine less usable
than neccecary.

The oom_kill is invoked about 100 times. It ends up killing hotplug when
already 112MB memory are available. See below.

I agree, that the timed hack stuff in out_of_memory is ugly, but we need
some mechanism to avoid

1. reentrancy into oom_kill
2. invocation of oom_kill until the already killed process
has finally released memory.

Further we need a better selection of whom to kill.

tglx

<SNIP>
Free pages: 112360kB (0kB HighMem)
<SNIP>
[<c0132151>] out_of_memory+0x21/0x30
[<c0132f96>] __alloc_pages+0x2f6/0x380
[<c013303f>] __get_free_pages+0x1f/0x40
[<c013665f>] kmem_getpages+0x1f/0xd0
[<c0137339>] cache_grow+0xb9/0x180
[<c0137573>] cache_alloc_refill+0x173/0x220
[<c01378d4>] __kmalloc+0x74/0x80
[<c02534d7>] alloc_skb+0x47/0xf0
[<c0252717>] sock_alloc_send_pskb+0xc7/0x1d0
[<c025284d>] sock_alloc_send_skb+0x2d/0x40
[<c02ad466>] unix_stream_sendmsg+0x196/0x400
[<c012f987>] filemap_nopage+0x207/0x3a0
[<c024fa66>] sock_aio_write+0xf6/0x120
[<c014c967>] do_sync_write+0xb7/0xf0
[<c016048b>] do_pollfd+0x5b/0xa0
[<c0126f90>] autoremove_wake_function+0x0/0x60
[<c016053a>] do_poll+0x6a/0xd0
[<c014ca9f>] vfs_write+0xff/0x130
[<c014cba1>] sys_write+0x51/0x80
[<c01027cb>] syscall_call+0x7/0xb
Out of Memory: Killed process 3037 (hotplug).
Thomas Gleixner
2004-12-02 13:48:00 UTC
Permalink
Post by Thomas Gleixner
I agree, that the timed hack stuff in out_of_memory is ugly, but we need
some mechanism to avoid
1. reentrancy into oom_kill
2. invocation of oom_kill until the already killed process
has finally released memory.
Further we need a better selection of whom to kill.
The attached patch solves these problems more elegantly than the previous
attempts and removes the ugly time, count, threshold stuff.

Reentrancy and follow up calls of oom_kill() are blocked until the task
which was killed by the first oom_kill() has actually released the
resources. I added a callback which is called from do_exit() when the
PF_MEMDIE flag is set. The reentrancy blocking is released inside the
callback.

The whom-to-select modification is still necessary to prevent
innocent processes from being killed.

It now kills exactly one process and comes back to the terminal without
further damages. Also other scenarios which I used to test the oom-
killer are not showing any strange side effects.
Post by Thomas Gleixner
From the first call to out_of_memory, which activates the reentrancy
blocking until the blocking is released in the callback, out_of_memory
is called more than 300 times.

Interesting is that the patch applied to the current code in 2.6.10-rc2-
mm4, where the call to out_of_memory is still in vmscan.c, has the same
effect.

So it's up to you VM guys to fight out from which place you want call
out_of_memory(). I don't care as both places have exactly the same bad
side effects.

My concern is to make it reliable when it is finally invoked.

tglx
Andrew Morton
2004-12-02 16:55:18 UTC
Permalink
I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.
Well no ;)

Thomas, could you please put together a description of how to reproduce
this behaviour?
Andrew Morton
2004-12-02 17:27:44 UTC
Permalink
Post by Andrew Morton
Thomas, could you please put together a description of how to reproduce
this behaviour?
As I mentioned before. I'm using a PIII,500Mhz,128MB Machine. Kernel
compiled with PREEMPT=y.
I boot into runlevel 3 and start
# hackbench 40
from the shell.
Just adjust the number so hackbench eats up all the memory.
How much swap space is online?
Thomas Gleixner
2004-12-02 17:17:31 UTC
Permalink
Post by Andrew Morton
I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.
Well no ;)
:)
Post by Andrew Morton
Thomas, could you please put together a description of how to reproduce
this behaviour?
As I mentioned before. I'm using a PIII,500Mhz,128MB Machine. Kernel
compiled with PREEMPT=y.
I boot into runlevel 3 and start
# hackbench 40
from the shell.

Just adjust the number so hackbench eats up all the memory.

I have some more test cases with real applications, but they are not so
easy to reproduce. Hackbench resembles the forking server quite well.

tglx
Marcelo Tosatti
2004-12-02 11:18:29 UTC
Permalink
Post by Andrew Morton
I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.
Well no ;)
I bet zone->all_unreclaimable is one of the main issues here as Andrea notes.
Post by Andrew Morton
Thomas, could you please put together a description of how to reproduce
this behaviour?
A simple "fillmem" works for me - the OOM killer kills the hog and the bash
which its being ran from. Have you tried that?

If you fire up a few fillmem's at the same time I bet you'll see the problem in a
greater degree.
Andrew Morton
2004-12-02 18:29:35 UTC
Permalink
Post by Andrew Morton
I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.
Well no ;)
Could you explain why do we need all_unreclaimable? What is the point
of all_unreclaimable if we bypass it at priority = 0? Just to loop a
few times (between DEF_PRIORITY and 1) at 100% cpu load?
It addresses the situation where all of a zone's pages are pinned down for
some reason (say, they're all mlocked, or we're out of swapspace). There's
no point in chewing tons of CPU scanning this zone on every reclaim attempt,
so the VM marks the zone as being full of unreclaimable pages.

When the zone is marked all_unreclaimable, we *logically* don't scan it at
all - just skip it. But in practice we do need to detect the situation
where some of the zone's pages have become reclaimable again. That could
happen because more swapspace has become available, or an app ran munlock()
or whatever. So to perform this detection we perform tiny scans to just
poll the zone. If that tiny scan results in a page getting reclaimed then
we clear all_reclaimable and the zone returns to normal state.

As an alternative to the tiny-scan-polling we could clear all_unreclaimable
in all zones when releasing swapspace, when running munlock(), etc.

Probably free_pages_bulk() should only be clearing all_unreclaimable if
current->reclaim_state != NULL, because random page freeings in the zone
don't necessarily mean that any pages have become reclaimable. Or clear
all_unreclaimable in shrink_list() rather than free_pages_bulk().
OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
never sends signals. That might be why 2.4 doesn't kill more than one
task by mistake, even without a callback-wakeup. So if we keep sending
signals I certainly agree with Thomas that using a callback to stop the
VM until the task is rescheduled is a good idea, and potentially it may
be even the only possible fix when the oom killer is enabled like in 2.6
(though the 300 kills in between SIGKILL and the reschedule sounds like
the VM isn't even trying anymore). Otherwise perhaps his workload is
spawning enough tasks that it takes a huge time for the reschedule
(that would explain it too).
Could be, I dunno. I've never done any work on the oom-killer and I tend
to regard it as a stupid thing which is only there so you can get back into
the machine to shut down and restart everything.

I mean, if you ran the machine out of memory then it is underprovisioned
and it *will* become unreliable whatever we do. The answer is to Not Do
That. As long as the oom-killer allows you to get in and admin the machine
later on then that's good enough as far as I'm concerned. Others probably
disagree ;)
- if (p->flags & PF_MEMDIE)
- return 0;
Thomas can you try the above?
I'd rather fix this by removing buggy code, than by adding additional
logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
race and can corrupt other bitflags),
Yes, that needs a separate task_struct field.
but at least the
oom-wakeup-callback from do_exit still makes a lot of sense (even if
PF_MEMDIE must be dropped since it's buggy, and something else should be
used instead).
Probably.
does the right watermark checks before killing the next task
yes, we should do that.
BTW, checking for pid == 1 like in Thomas's patch is a must, I advocated
it for years but nobody listened yet, hope Thomas will be better at
convincing the VM mainline folks than me.
hm. I thought we were already doing that, but it seems not.
Thomas Gleixner
2004-12-02 19:01:58 UTC
Permalink
Post by Andrew Morton
Could be, I dunno. I've never done any work on the oom-killer and I tend
to regard it as a stupid thing which is only there so you can get back into
the machine to shut down and restart everything.
I mean, if you ran the machine out of memory then it is underprovisioned
and it *will* become unreliable whatever we do. The answer is to Not Do
That. As long as the oom-killer allows you to get in and admin the machine
later on then that's good enough as far as I'm concerned. Others probably
disagree ;)
I agree, but the current situation made me drive 150km, because sshd or
even init was killed.

I hit this problem, when a forking server application got out of control
because there were suddenly connecting hundreds of clients due to a
different problem.

As long as I can log into the machine and fix the crap it's ok. There's
no need for anything else than accessibility and a halfway
deterministic behaviour.

tglx
Andrea Arcangeli
2004-12-02 18:08:23 UTC
Permalink
Post by Andrew Morton
I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.
Well no ;)
Could you explain why do we need all_unreclaimable? What is the point
of all_unreclaimable if we bypass it at priority = 0? Just to loop a
few times (between DEF_PRIORITY and 1) at 100% cpu load?

OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
never sends signals. That might be why 2.4 doesn't kill more than one
task by mistake, even without a callback-wakeup. So if we keep sending
signals I certainly agree with Thomas that using a callback to stop the
VM until the task is rescheduled is a good idea, and potentially it may
be even the only possible fix when the oom killer is enabled like in 2.6
(though the 300 kills in between SIGKILL and the reschedule sounds like
the VM isn't even trying anymore). Otherwise perhaps his workload is
spawning enough tasks that it takes a huge time for the reschedule
(that would explain it too).

Actually this should fix it too btw:

- if (p->flags & PF_MEMDIE)
- return 0;

Thomas can you try the above?

I'd rather fix this by removing buggy code, than by adding additional
logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
race and can corrupt other bitflags), but at least the
oom-wakeup-callback from do_exit still makes a lot of sense (even if
PF_MEMDIE must be dropped since it's buggy, and something else should be
used instead).

Whatever we change I'd like to change it on top of my last patch that
already removes the 5 seconds fixed waits, and does the right watermark
checks before killing the next task (Thomas already attempted that with
a not accurate nr_free_pages check, so he at least agrees about the need
of checking watermarks before firing up the oom killer).

BTW, checking for pid == 1 like in Thomas's patch is a must, I advocated
it for years but nobody listened yet, hope Thomas will be better at
convincing the VM mainline folks than me.
Thomas Gleixner
2004-12-02 18:55:16 UTC
Permalink
OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
never sends signals. That might be why 2.4 doesn't kill more than one
task by mistake, even without a callback-wakeup.
I just run the same test on 2.4.27 and the behaviour is completely
different.

The box seems to be stuck in a swap in/out loop for quite a long time.
During this time the box is not responsive at all. It finally stops the
forking after quite a long time of swapping with
fork() (error: resource temporarily not available).

There is no output in dmesg, but I'm not able to remove the remaining
hackbench processes as even a kill -SIGKILL returns with
fork() (error: resource temporarily not available)

I'm not sure, which of the two scenarios I like better :)

FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.

With the callback the machine did not come back after 20 Minutes.

Without the callback the machine dies after about 10 Minutes and killing
all available processes except init with:
Kernel panic - not syncing: Out of memory and no killable processes...
So if we keep sending
signals I certainly agree with Thomas that using a callback to stop the
VM until the task is rescheduled is a good idea, and potentially it may
be even the only possible fix when the oom killer is enabled like in 2.6
(though the 300 kills in between SIGKILL and the reschedule sounds like
the VM isn't even trying anymore). Otherwise perhaps his workload is
spawning enough tasks, that it takes an huge time for the rechedule
(that would explain it too).
Yeah, there are enough tasks and with preempt enabled more than one
tasks requests memory. That explains the repetitive calls to
out_of_memory(). This only happens on UP + PREEMPT=y and SMP. See above.
- if (p->flags & PF_MEMDIE)
- return 0;
Thomas can you try the above?
You meant the one in badness() right ?
Well it makes it better, but I was able to have a second invocation
before the first killed tasks exited. That's simple to explain. The task
is on the way out and releases resources, so the VM size is reduced and
the killer picks another process.
I'd rather fix this by removing buggy code, than by adding additional
logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
race and can corrupt other bitflags), but at least the
oom-wakeup-callback from do_exit still makes a lot of sense (even if
PF_MEMDIE must be dropped since it's buggy, and something else should be
used instead).
I think the callback is the only safe way to fix that. If PF_MEMDIE is
racy then I'm sure we will find a different indicator for that.
Whatever we change I'd like to change it on top of my last patch that
already removes the 5 seconds fixed waits, and does the right watermark
checks before killing the next task (Thomas already attempted that with
a not accurate nr_free_pages check, so he at least agrees about the need
of checking watermarks before firing up the oom killer).
Yep, but the reentrancy blocking with the callback makes the time, count
crap and the watermark check go away, as it is safe to reenable the
killer at this point because we definitely freed memory resources. So
the watermark comes for free.
BTW, checking for pid == 1 like in Thomas's patch is a must, I advocated
it for years but nobody listened yet, hope Thomas will be better at
convincing the VM mainline folks than me.
:)

tglx
Andrew Morton
2004-12-02 19:07:29 UTC
Permalink
Post by Thomas Gleixner
FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.
There's a patch in -mm which causes the oom-killer to be invoked each time
you hit sysrq-F, which sounds like a fine idea to me.

Unfortunately it calls the oom-killer from interrupt context, and I need to
fix that up before the patch goes any further.
Thomas Gleixner
2004-12-02 19:08:13 UTC
Permalink
Post by Andrew Morton
Post by Thomas Gleixner
FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.
There's a patch in -mm which causes the oom-killer to be invoked each time
you hit sysrq-F, which sounds like a fine idea to me.
Can you please explain, how I can hit sysrq-F when I can't log into the
remote machine ?

tglx
Andrew Morton
2004-12-02 19:22:08 UTC
Permalink
Post by Thomas Gleixner
Post by Andrew Morton
Post by Thomas Gleixner
FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.
There's a patch in -mm which causes the oom-killer to be invoked each time
you hit sysrq-F, which sounds like a fine idea to me.
Can you please explain, how I can hit sysrq-F when I can't log into the
remote machine ?
umm, in the same way you're using "SysRq-T and the reset button"?

You can issue sysrq commands over serial consoles too.
Thomas Gleixner
2004-12-02 19:24:10 UTC
Permalink
Post by Andrew Morton
Post by Thomas Gleixner
Post by Andrew Morton
Post by Thomas Gleixner
FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.
There's a patch in -mm which causes the oom-killer to be invoked each time
you hit sysrq-F, which sounds like a fine idea to me.
Can you please explain, how I can hit sysrq-F when I can't log into the
remote machine ?
umm, in the same way you're using "SysRq-T and the reset button"?
You can issue sysrq commands over serial consoles too.
I know, but the console and the reset button are 150km away. When I dial
into the machine or try to connect via the network, I cannot connect
with the current kernels. Neither 2.4, because the fork fails, nor 2.6
because oom killed sshd. So I cannot send anything except a service man,
who drives 150km to hit sysrq-F or the reset button.

If this happens the kernel just should make sure to make the machine
accessible and let me repair the problem. Nothing else.

tglx
Andre Tomt
2004-12-02 20:11:12 UTC
Permalink
Post by Thomas Gleixner
Post by Andrew Morton
You can issue sysrq commands over serial consoles too.
I know, but the console and the reset button are 150km away. When I dial
into the machine or try to connect via the network, I cannot connect
with the current kernels. Neither 2.4, because the fork fails, nor 2.6
because oom killed sshd. So I cannot send anything except a service man,
who drives 150km to hit sysrq-F or the reset button.
Get one of those terminal server/concentrators that export the serial
consoles over IP. Or one of those KVM-over-IP extenders. Worth every penny.

[sorry thomas, forgot reply all first time around, so you'll get a dupe.]
Thomas Gleixner
2004-12-03 22:45:40 UTC
Permalink
Post by Andre Tomt
Post by Thomas Gleixner
Post by Andrew Morton
You can issue sysrq commands over serial consoles too.
I know, but the console and the reset button are 150km away. When I dial
into the machine or try to connect via the network, I cannot connect
with the current kernels. Neither 2.4, because the fork fails, nor 2.6
because oom killed sshd. So I cannot send anything except a service man,
who drives 150km to hit sysrq-F or the reset button.
Get one of those terminal server/concentrators that export the serial
consoles over IP. Or one of those KVM-over-IP extenders. Worth every penny.
Makes total sense.

I add 1000$ equipment to a 300$ embedded box because there is a software
bug.

Is it possible that we are living in a different universe ?

tglx
Andrea Arcangeli
2004-12-02 23:47:23 UTC
Permalink
Post by Thomas Gleixner
because oom killed sshd. So I cannot send anything except a service man,
who drives 150km to hit sysrq-F or the reset button.
You'd need a little old computer next to your server with serial console
attached to it for it to be effective. But it sounds more like a last
resort and it makes economical sense only if you're not hosting your
server and you've your own server room. Normally one doesn't need to
drive 150km just because you can call somebody at the phone to click
reboot (worse than SYSRQ+F [unless it was the critical app itself
hitting a memleak], but much cheaper than hosting a serial console
server too).

The SYSRQ+F sounds more useful on a desktop usage, if you've tons and
tons of swap. Don't forget that with an huge amount of swap, you're
telling the kernel "please make my machine extremely slow if all apps
uses the ram at the same time". In many situations it may be more
efficient for you to kill one of these apps, wait for the other to finish,
and restart the killed app from scratch (to avoid thrashing, and the
kernel _can't_ avoid thrashing or it wouldn't be fair anymore). So if you
do a mistake and you want to recover responsiveness in less than a
second, the SYSRQ+F trick should make it.
Helge Hafting
2004-12-03 14:41:34 UTC
Permalink
Post by Thomas Gleixner
I know, but the console and the reset button are 150km away. When I dial
into the machine or try to connect via the network, I cannot connect
with the current kernels. Neither 2.4, because the fork fails, nor 2.6
because oom killed sshd. So I cannot send anything except a service man,
who drives 150km to hit sysrq-F or the reset button.
The case of OOM killed sshd is fixable without touching the kernel:
Make sure sshd is started from init, init will then restart sshd whenever
it quits for some reason. This will get you your essential sshd back
assuming the machine is still running and the OOM killer managed
to free up some memory by killing some other processes.

One might still wish for better OOM behaviour, but it is a case
where something has to give.

Helge Hafting
Thomas Gleixner
2004-12-03 21:20:22 UTC
Permalink
Post by Helge Hafting
Make sure sshd is started from init, init will then restart sshd whenever
it quits for some reason. This will get you your essential sshd back
assuming the machine is still running and the OOM killer managed
to free up some memory by killing some other processes.
One might still wish for better OOM behaviour, but it is a case
where something has to give.
Hey, are you kidding ?

2.4 does not let me in, because the fork of sshd fails. How do you fix this
with changing the userspace ?

2.6 oom is plain buggy

I have no problem to help myself, but I want to get this fixed in a
reliable way which meets the comment in oom_kill.c: "least surprise"

tglx
Helge Hafting
2004-12-05 21:14:43 UTC
Permalink
Post by Thomas Gleixner
Post by Helge Hafting
Make sure sshd is started from init, init will then restart sshd whenever
it quits for some reason. This will get you your essential sshd back
assuming the machine is still running and the OOM killer managed
to free up some memory by killing some other processes.
One might still wish for better OOM behaviour, but it is a case
where something has to give.
Hey, are you kidding ?
Please don't misunderstand. I'm not saying that 2.6 is fine,
only that there is a way to automatically restart a sshd accidentally
killed by the OOM killer. This could probably save you some of those
trips, if you keep experimenting with 2.6.
Post by Thomas Gleixner
2.4 lets me not in, because the fork of sshd fails. How do you fix this
with changing the userspace ?
Fork failing is another issue than a killed sshd. It is usually fixed
by using ulimit so a buggy process simply can't fork off way too
many children. (Or use up too much memory.)
Post by Thomas Gleixner
2.6 oom is plain buggy
Quite possible, but be aware that these things can happen with 2.4 too.
It is possible to get 2.4 into a shortage where fork fails,
you should use ulimit anyway to avoid that. Also, having
a sshd that is restarted by "init" is a good idea anyway
on such a remote machine. 2.4 might not kill sshd by kernel bug, but
who knows what some future exploit or future bug could do.
Post by Thomas Gleixner
I have no problem to help myself, but I want to get this fixed in a
reliable way which meets the comment in oom_kill.c: "least surprise"
I too want a well-behaved OOM killer. Workarounds are available though,
until this happens.

Helge Hafting
Andrea Arcangeli
2004-12-02 23:35:00 UTC
Permalink
Post by Thomas Gleixner
OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
never sends signals. That might be why 2.4 doesn't kill more than one
task by mistake, even without a callback-wakeup.
I just run the same test on 2.4.27 and the behaviour is completely
different.
The box seems to be stuck in a swap in/out loop for quite a long time.
During this time the box is not responsive at all. It finally stops the
forking after quite a long time of swapping with
fork() (error: resource temporarily not available).
Fork eventually failing is very reasonable if you're executing a fork
loop.
Post by Thomas Gleixner
There is no output in dmesg, but I'm not able to remove the remaining
hackbench processes as even a kill -SIGKILL returns with
fork() (error: resource temporarily not available)
I'm not sure, which of the two scenarios I like better :)
Please try with 2.4.23aa3, I think there was some oom killer change
after I had no resources to track 2.4 anymore. I'm not saying 2.4.23aa3
will work better though, but I would like to know if there's some corner
case still open in 2.4-aa. Careful, 2.4.23aa3 has security bugs (only
local security of course, i.e. normally not a big issue, sure good
enough for a quick test).

I doubt your testcase simulates anything remotely realistic but
anyway it's still informative.

What I'm simulating here is very real life scenario with a couple of
apps allocating more memory than ram.
Post by Thomas Gleixner
FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.
With the callback the machine did not come back after 20 Minutes.
Was the oom killer invoked at all? If yes, and it works with preempt,
that could mean a cond_resched is simply missing...
Post by Thomas Gleixner
You meant the one in badness() right ?
yes.
Post by Thomas Gleixner
Well it makes it better, but I was able to have a second invocation
before the first killed tasks exited. That's simple to explain. The task
is on the way out and releases resources, so the VM size is reduced and
the killer picks another process.
That's trivial to fix checking for PF_DEAD/PF_EXITING.
Post by Thomas Gleixner
I'd rather fix this by removing buggy code, than by adding additional
logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
race and can corrupt other bitflags), but at least the
oom-wakeup-callback from do_exit still makes a lot of sense (even if
PF_MEMDIE must be dropped since it's buggy, and something else should be
used instead).
I think the callback is the only safe way to fix that. If PF_MEMDIE is
racy then I'm sure we will find a different indicator for that.
The callback adds overhead to the exit path. Plus strictly speaking it's
not actually a callback, you're just "polling" for the bitflag :)
Post by Thomas Gleixner
Yep, but the reentrancy blocking with the callback makes the time, count
crap and the watermark check go away, as it is safe to reenable the
killer at this point because we definitely freed memory resources. So
the watermark comes for free.
You can get an I/O race where your program is about to finish a failing
try_to_free_pages pass (note that a task exiting won't make
try_to_free_pages work any easier, try_to_free_pages has to free
allocated memory, it doesn't care if there's 1M or 100M of free memory).
If you don't check the watermarks after waiting for I/O, you're going to
generate a spurious oom-killing. Your changes can't help.

Note that even the watermark checks leaves a race window open, but at
least it's not an I/O window. While try_to_free_pages can wait for I/O
and then fail.

I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
cond_resched. Then you can try again with my simple way (w/ and w/o
PREEMPT ;).

Thanks for the great feedback.
Andrea Arcangeli
2004-12-03 02:28:54 UTC
Permalink
Post by Andrea Arcangeli
I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
cond_resched. Then you can try again with my simple way (w/ and w/o
PREEMPT ;).
Ok, I expect this patch to fix the problem completely. The biggest
difference is that it doesn't affect the exit fast path and it doesn't need
notification. It's not even slower than your patch since you were
really polling too. This schedules properly; if you get any PREEMPT=n
problem I'd really like to know. This is on top of my previous patch so
it does the checks for the watermarks correctly (those are not obsoleted
by whatever thing we do in out_of_memory()). This still makes use of the
PF_MEMDIE info but not in kernel/, only in mm/oom_kill.c where it is born
and dies there, so the race is somewhat hidden in there. As you said
converting PF_MEMDIE to a set_tsk_thread_flag or some other non-racy
thing is a due change but I'm not doing it in the below patch.

With this thing, I doubt any wrong task will ever be killed again and
you won't risk driving 150km again (or at least not for this problem ;).
Please test and apply to mainline if you agree.

oom_kill.c | 158 ++++++++++++++++++++---------------------------------------
page_alloc.c | 60 ++++++++++++++++------
swap_state.c | 2
vmscan.c | 2
4 files changed, 103 insertions(+), 119 deletions(-)

Signed-off-by: Andrea Arcangeli <***@suse.de>

Index: x/mm/oom_kill.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/oom_kill.c,v
retrieving revision 1.31
diff -u -p -r1.31 oom_kill.c
--- x/mm/oom_kill.c 14 Oct 2004 04:27:49 -0000 1.31
+++ x/mm/oom_kill.c 3 Dec 2004 02:12:29 -0000
@@ -49,8 +49,6 @@ static unsigned long badness(struct task
if (!p->mm)
return 0;

- if (p->flags & PF_MEMDIE)
- return 0;
/*
* The memory size of the process is the basis for the badness.
*/
@@ -120,14 +118,25 @@ static struct task_struct * select_bad_p

do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p)
- if (p->pid) {
- unsigned long points = badness(p, uptime.tv_sec);
+ /* skip the init task with pid == 1 */
+ if (p->pid > 1) {
+ unsigned long points;
+
+ /*
+ * This is in the process of releasing memory so wait it
+ * to finish before killing some other task by mistake.
+ */
+ if ((p->flags & PF_MEMDIE) ||
+ ((p->flags & PF_EXITING) && !(p->flags & PF_DEAD)))
+ return ERR_PTR(-1UL);
+ if (p->flags & PF_SWAPOFF)
+ return p;
+
+ points = badness(p, uptime.tv_sec);
if (points > maxpoints) {
chosen = p;
maxpoints = points;
}
- if (p->flags & PF_SWAPOFF)
- return p;
}
while_each_thread(g, p);
return chosen;
@@ -140,6 +149,12 @@ static struct task_struct * select_bad_p
*/
static void __oom_kill_task(task_t *p)
{
+ if (p->pid == 1) {
+ WARN_ON(1);
+ printk(KERN_WARNING "tried to kill init!\n");
+ return;
+ }
+
task_lock(p);
if (!p->mm || p->mm == &init_mm) {
WARN_ON(1);
@@ -169,9 +184,25 @@ static void __oom_kill_task(task_t *p)
static struct mm_struct *oom_kill_task(task_t *p)
{
struct mm_struct *mm = get_task_mm(p);
- if (!mm || mm == &init_mm)
+ task_t * g, * q;
+
+ if (!mm)
return NULL;
+ if (mm == &init_mm) {
+ mmput(mm);
+ return NULL;
+ }
+
__oom_kill_task(p);
+ /*
+ * kill all processes that share the ->mm (i.e. all threads),
+ * but are in a different thread group
+ */
+ do_each_thread(g, q)
+ if (q->mm == mm && q->tgid != p->tgid)
+ __oom_kill_task(q);
+ while_each_thread(g, q);
+
return mm;
}

@@ -184,117 +215,40 @@ static struct mm_struct *oom_kill_task(t
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-static void oom_kill(void)
+void out_of_memory(int gfp_mask)
{
- struct mm_struct *mm;
- struct task_struct *g, *p, *q;
-
+ struct mm_struct *mm = NULL;
+ task_t * p;
+
read_lock(&tasklist_lock);
retry:
p = select_bad_process();

+ if (PTR_ERR(p) == -1UL)
+ goto out;
+
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
+ read_unlock(&tasklist_lock);
show_free_areas();
panic("Out of memory and no killable processes...\n");
}

+ printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+ show_free_areas();
mm = oom_kill_task(p);
if (!mm)
goto retry;
- /*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
- */
- do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
- while_each_thread(g, q);
- if (!p->mm)
- printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
- read_unlock(&tasklist_lock);
- mmput(mm);

- /*
- * Make kswapd go out of the way, so "p" has a good chance of
- * killing itself before someone else gets the chance to ask
- * for more memory.
- */
- yield();
- return;
-}
-
-/**
- * out_of_memory - is the system out of memory?
- */
-void out_of_memory(int gfp_mask)
-{
- /*
- * oom_lock protects out_of_memory()'s static variables.
- * It's a global lock; this is not performance-critical.
- */
- static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
- static unsigned long first, last, count, lastkill;
- unsigned long now, since;
-
- spin_lock(&oom_lock);
- now = jiffies;
- since = now - last;
- last = now;
-
- /*
- * If it's been a long time since last failure,
- * we're not oom.
- */
- if (since > 5*HZ)
- goto reset;
-
- /*
- * If we haven't tried for at least one second,
- * we're not really oom.
- */
- since = now - first;
- if (since < HZ)
- goto out_unlock;
-
- /*
- * If we have gotten only a few failures,
- * we're not really oom.
- */
- if (++count < 10)
- goto out_unlock;
-
- /*
- * If we just killed a process, wait a while
- * to give that task a chance to exit. This
- * avoids killing multiple processes needlessly.
- */
- since = now - lastkill;
- if (since < HZ*5)
- goto out_unlock;
+ out:
+ read_unlock(&tasklist_lock);
+ if (mm)
+ mmput(mm);

/*
- * Ok, really out of memory. Kill something.
+ * Give "p" a good chance of killing itself before we
+ * retry to allocate memory.
*/
- lastkill = now;
-
- printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
- show_free_areas();
-
- /* oom_kill() sleeps */
- spin_unlock(&oom_lock);
- oom_kill();
- spin_lock(&oom_lock);
-
-reset:
- /*
- * We dropped the lock above, so check to be sure the variable
- * first only ever increases to prevent false OOM's.
- */
- if (time_after(now, first))
- first = now;
- count = 0;
-
-out_unlock:
- spin_unlock(&oom_lock);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
}
Index: x/mm/page_alloc.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/page_alloc.c,v
retrieving revision 1.236
diff -u -p -r1.236 page_alloc.c
--- x/mm/page_alloc.c 16 Nov 2004 03:53:53 -0000 1.236
+++ x/mm/page_alloc.c 3 Dec 2004 01:58:19 -0000
@@ -611,8 +611,10 @@ __alloc_pages(unsigned int gfp_mask, uns
int alloc_type;
int do_retry;
int can_try_harder;
+ int did_some_progress;

- might_sleep_if(wait);
+ if (wait)
+ cond_resched();

/*
* The caller may dip into page reserves a bit more if the caller
@@ -645,6 +647,7 @@ __alloc_pages(unsigned int gfp_mask, uns
for (i = 0; (z = zones[i]) != NULL; i++)
wakeup_kswapd(z);

+ restart:
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks to go deeper into reserves
@@ -681,31 +684,58 @@ __alloc_pages(unsigned int gfp_mask, uns
goto nopage;

rebalance:
+ cond_resched();
+
/* We now go into synchronous reclaim */
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- try_to_free_pages(zones, gfp_mask, order);
+ did_some_progress = try_to_free_pages(zones, gfp_mask, order);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

- /* go through the zonelist yet one more time */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- min = z->pages_min;
- if (gfp_mask & __GFP_HIGH)
- min /= 2;
- if (can_try_harder)
- min -= min / 4;
- min += (1<<order) + z->protection[alloc_type];
+ cond_resched();

- if (z->free_pages < min)
- continue;
+ if (likely(did_some_progress)) {
+ /* go through the zonelist yet one more time */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_min;
+ if (gfp_mask & __GFP_HIGH)
+ min /= 2;
+ if (can_try_harder)
+ min -= min / 4;
+ min += (1<<order) + z->protection[alloc_type];

- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
+ if (z->free_pages < min)
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+ } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ /*
+ * Go through the zonelist yet one more time, keep
+ * very high watermark here, this is only to catch
+ * a parallel oom killing, we must fail if we're still
+ * under heavy pressure.
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_high;
+ min += (1<<order) + z->protection[alloc_type];
+
+ if (z->free_pages < min)
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+
+ out_of_memory(gfp_mask);
+ goto restart;
}

/*
Index: x/mm/swap_state.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/swap_state.c,v
retrieving revision 1.79
diff -u -p -r1.79 swap_state.c
--- x/mm/swap_state.c 29 Oct 2004 20:21:14 -0000 1.79
+++ x/mm/swap_state.c 3 Dec 2004 01:48:22 -0000
@@ -59,6 +59,8 @@ void show_swap_cache_info(void)
swap_cache_info.add_total, swap_cache_info.del_total,
swap_cache_info.find_success, swap_cache_info.find_total,
swap_cache_info.noent_race, swap_cache_info.exist_race);
+ printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+ printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
Index: x/mm/vmscan.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/vmscan.c,v
retrieving revision 1.225
diff -u -p -r1.225 vmscan.c
--- x/mm/vmscan.c 19 Nov 2004 22:54:22 -0000 1.225
+++ x/mm/vmscan.c 2 Dec 2004 01:56:50 -0000
@@ -935,8 +935,6 @@ int try_to_free_pages(struct zone **zone
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
out:
for (i = 0; zones[i] != 0; i++)
zones[i]->prev_priority = zones[i]->temp_priority;
Thomas Gleixner
2004-12-03 22:37:17 UTC
Permalink
Post by Andrea Arcangeli
Post by Andrea Arcangeli
I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
cond_resched. Then you can try again with my simple way (w/ and w/o
PREEMPT ;).
Ok, I expect this patch to fix the problem completely.
<SNIP>
With this thing, I doubt any wrong task will ever be killed again...
You're right. oom-kill() did not do anything wrong. See log below

This is w/o PREEMPT. Is it necessary to verify w/ PREEMPT too ?

If it would have booted it still would have killed sshd instead of the
application which was forking a lot of children.

tglx

Dentry cache hash table entries: 32768 (order: 5, 131072 bytes)
Inode-cache hash table entries: 16384 (order: 4, 65536 bytes)
Memory: 126476k/131060k available (1690k kernel code, 4044k reserved,
732k data)Checking if this processor honours the WP bit even in
supervisor mode... Ok.
Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
CPU: L1 I cache: 16K, L1 D cache: 16K
CPU: L2 cache: 128K
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#0.
CPU: Intel Celeron (Mendocino) stepping 00
Enabling fast FPU save and restore... done.
Checking 'hlt' instruction... OK.

END OF LOG
Thomas Gleixner
2004-12-03 22:51:50 UTC
Permalink
Ooops, sorry it did add something to the Log after 10 minutes
Post by Thomas Gleixner
You're right. oom-kill() did not do anything wrong. See log below
This is w/o PREEMPT. Is it neccecary to verify w/ PREEMPT too ?
If it would have booted it still would have killed sshd instead of the
application which was forking a lot of childs.
Enabling fast FPU save and restore... done.
Checking 'hlt' instruction... OK.
END OF LOG
Unable to handle kernel NULL pointer dereference at virtual address
0000000c
printing eip:
c011fe30
*pde = 00000000
Oops: 0000 [#1]
Modules linked in:
CPU: 0
EIP: 0060:[<c011fe30>] Not tainted VLI
EFLAGS: 00010082 (2.6.10-rc2)
EIP is at __queue_work+0x20/0x60
eax: 00000000 ebx: c032cc80 ecx: 00000000 edx: 00000008
esi: c03acd14 edi: 00000282 ebp: c035ff64 esp: c035ff34
ds: 007b es: 007b ss: 0068
Process swapper (pid: 0, threadinfo=c035e000 task=c02ebb00)
Stack: c035ffa4 00000000 c03acd14 c035ff64 c011fe99 00000000 c032cc80
c01dd8b0
c0119c20 00000000 00000000 c035ffa4 c035ff64 c035ff64 00000000
00000001
c03a5868 0000000a 003d9007 c0115f3b c03a5868 00000046 00099100
c039e120
Call Trace:
[<c011fe99>] queue_work+0x29/0x50
[<c01dd8b0>] blank_screen_t+0x0/0x20
[<c0119c20>] run_timer_softirq+0xb0/0x170
[<c0115f3b>] __do_softirq+0x7b/0x90
[<c0115f77>] do_softirq+0x27/0x30
[<c010411e>] do_IRQ+0x1e/0x30
[<c010271e>] common_interrupt+0x1a/0x20
[<c0100623>] default_idle+0x23/0x40
[<c01006b4>] cpu_idle+0x34/0x40
[<c03607d8>] start_kernel+0x168/0x1b0
[<c0360370>] unknown_bootoption+0x0/0x1e0
Code: eb e5 90 90 90 90 90 90 90 90 90 83 ec 10 8b 44 24 14 89 5c 24 04
8b 5c 2
<0>Kernel panic - not syncing: Fatal exception in interrupt
Andrea Arcangeli
2004-12-03 23:08:55 UTC
Permalink
Post by Thomas Gleixner
Ooops, sorry it did add something to the Log after 10 minutes
You mean my patch is preventing your machine to boot? Then you're doing
something else wrong because it's impossible my patch is preventing your
machine to boot. I also tested it before posting. You must have changed
something more than the PREEMPT bit after applying my patch.
William Lee Irwin III
2004-12-10 16:36:14 UTC
Permalink
Post by Andrea Arcangeli
+ if (mm == &init_mm) {
+ mmput(mm);
+ return NULL;
+ }
+ if (PTR_ERR(p) == -1UL)
+ goto out;
+
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
+ read_unlock(&tasklist_lock);
show_free_areas();
panic("Out of memory and no killable processes...\n");
}
Maybe the mm == &init_mm case should return an ERR_PTR also, as that is
a sign of a transient error, not cause for a hard panic.


-- wli
Andrea Arcangeli
2004-12-10 17:35:54 UTC
Permalink
Post by William Lee Irwin III
Post by Andrea Arcangeli
+ if (mm == &init_mm) {
+ mmput(mm);
+ return NULL;
+ }
+ if (PTR_ERR(p) == -1UL)
+ goto out;
+
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
+ read_unlock(&tasklist_lock);
show_free_areas();
panic("Out of memory and no killable processes...\n");
}
Maybe the mm == &init_mm case should return an ERR_PTR also, as that is
a sign of a transient error, not cause for a hard panic.
It can't be a transient error as far as I can tell, it's just like the
issue of alloc_pages returning NULL (and potentially scheduling first)
before mounting the root fs.
William Lee Irwin III
2004-12-10 17:43:36 UTC
Permalink
Post by Andrea Arcangeli
Post by William Lee Irwin III
Maybe the mm == &init_mm case should return an ERR_PTR also, as that is
a sign of a transient error, not cause for a hard panic.
It can't be a transient error as far as I can tell, it's just like the
issue of alloc_pages returning NULL (and potentially scheduling first)
before mounting the root fs.
Well, the only way I see this happening is the process exiting followed
by use_mm() on init_mm for unobvious reasons (perhaps reasons not in
the tree).


-- wli
Andrea Arcangeli
2004-12-10 17:55:04 UTC
Permalink
Post by William Lee Irwin III
Well, the only way I see this happening is the process exiting followed
by use_mm() on init_mm for unobvious reasons (perhaps reasons not in
the tree).
I don't see the problem with use_mm. use_mm has either the mm set to
ctx->mm or to NULL, and ctx->mm is set to the mm of the process calling
io_setup.

The only thing using init_mm is the idle task/swapper as far as I can
tell, kernel threads and exiting tasks have a NULL mm.
William Lee Irwin III
2004-12-10 18:00:31 UTC
Permalink
Post by Andrea Arcangeli
Post by William Lee Irwin III
Well, the only way I see this happening is the process exiting followed
by use_mm() on init_mm for unobvious reasons (perhaps reasons not in
the tree).
I don't see the problem with use_mm. use_mm has either the mm set to
ctx->mm or to NULL, and ctx->mm is set to the mm of the process calling
io_setup.
The only thing using init_mm is the idle task/swapper as far as I can
tell, kernel threads and exiting tasks have a NULL mm.
Yet those don't appear in the tasklist, so some task in the tasklist
has to get ->mm set to &init_mm. The notion above was that the init_mm
check was to handle some out-of-tree attempt to do aio from kernel threads.


-- wli
Andrea Arcangeli
2004-12-10 18:15:29 UTC
Permalink
Post by William Lee Irwin III
Yet those don't appear in the tasklist, so some task in the tasklist
has to get ->mm set to &init_mm. The notion above was that the init_mm
check was to handle some out-of-tree attempt to do aio from kernel threads.
Not sure to understand correctly but, aio has always been done through
kernel threads, and that's the whole thing about aio. Not sure what
you're doing out-of-tree, but you don't need to use init_mm to deal with
kernel threads, in fact kernel threads can only have ->mm = NULL. When
switching mm with use_mm the aio thread is only going to use a real mm
with mappings in userspace, so even in that case you don't need init_mm.
I didn't see the out of tree code though.
William Lee Irwin III
2004-12-10 18:19:54 UTC
Permalink
Post by Andrea Arcangeli
Post by William Lee Irwin III
Yet those don't appear in the tasklist, so some task in the tasklist
has to get ->mm set to &init_mm. The notion above was that the init_mm
check was to handle some out-of-tree attempt to do aio from kernel threads.
Not sure to understand correctly but, aio has always been done through
kernel threads, and that's the whole thing about aio. Not sure what
you're doing out-of-tree, but you don't need to use init_mm to deal with
kernel threads, infact kernel threads can only have ->mm = NULL. When
switching mm with use_mm the aio thread is only going to use a real mm
with mappings in userspace, so even in that case you don't need init_mm.
I didn't see the out of tree code though.
Maybe the whole init_mm check should die since nothing in-tree could
cause it?


-- wli
Andrea Arcangeli
2004-12-10 19:05:48 UTC
Permalink
Post by William Lee Irwin III
Maybe the whole init_mm check should die since nothing in-tree could
cause it?
Well it's a bugcheck after all so it certainly can be removed. I
wouldn't mind to remove it completely, but this is just to be sure
nothing is going wrong, though I agree it isn't going to help very much ;).
William Lee Irwin III
2004-12-10 16:51:50 UTC
Permalink
Post by Andrea Arcangeli
Ok, I expect this patch to fix the problem completely. The biggest
difference is that it doesn't affect the exit fast path and it doesn't need
notification. It's not even slower than your patch since you were
really polling too. This schedule properly, if you get any PREEMPT=n
problem I'd really like to know. This is on top of my previous patch so
it does the checks for the watermarks correctly (those are not obsoleted
by whatever thing we do in out_of_memory()). This still make use of the
PF_MEMDIE info but not in kernel/, only in mm/oom_kill.c where it born
and dies there, so the race is somewhat hidden in there. As you said
converting PF_MEMDIE to a set_tsk_thread_flag or some other non-racy
thing is a due change but I'm not doing it in the below patch.
I see some real bugfixes in here. I'll start up a fresh thread with some
of those broken out and cc: the relevant people shortly. It's not really
any "new material", just reorganizing things into smaller patches, in
particular the ones I see needing merging ASAP that aren't controversial.


-- wli
Thomas Gleixner
2004-12-03 21:10:06 UTC
Permalink
Post by Andrea Arcangeli
Fork eventually failing is very reasonable if you're executing a fork
loop.
Yes, it's reasonable, but the effect that any subsequent command is
aborted then is not so reasonable.
Post by Andrea Arcangeli
Post by Thomas Gleixner
There is no output in dmesg, but I'm not able to remove the remaining
hackbench processes as even a kill -SIGKILL returns with
fork() (error: resource temporarily not available)
I'm not sure, which of the two scenarios I like better :)
Please try with 2.4.23aa3, I think there was some oom killer change
after I had no resources to track 2.4 anymore. I'm not saying 2.4.23aa3
will work better though, but I would like to know if there's some corner
case still open in 2.4-aa. Careful, 2.4.23aa3 has security bugs (only
local security of course, i.e. normally not a big issue, sure good
enough for a quick test).
I try, if I find some time, but I'm not too interested in 2.4 :)
Post by Andrea Arcangeli
I doubt your testcase simulates anything remotely realistic but
anyway it's still informative.
The hackbench testcase resembles the real life problem I encountered
quite realistic. Of course we have fixed the application since then, but
I bet that I'm not the only one who uses forking servers.
Post by Andrea Arcangeli
What I'm simulating here is very real life scenario with a couple of
apps allocating more memory than ram.
Use a forking server, connect a lot of clients and it is real life. :)
Post by Andrea Arcangeli
Post by Thomas Gleixner
FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.
With the callback the machine did not come back after 20 Minutes.
Was the oom killer invoked at all? If yes, and it works with preempt,
that could mean a cond_resched is simply missing...
Yes, it was invoked
Post by Andrea Arcangeli
Post by Thomas Gleixner
I think the callback is the only safe way to fix that. If PF_MEMDIE is
racy then I'm sure we will find a different indicator for that.
The callback adds overhead to the exit path. Plus strictly speaking it's
not actually a callback, you're just "polling" for the bitflag :)
I know :)
Post by Andrea Arcangeli
Post by Thomas Gleixner
Yep, but the reentrancy blocking with the callback makes the time, count
crap and the watermark check go away, as it is safe to reenable the
killer at this point because we definitely freed memory resources. So
the watermark comes for free.
You can get an I/O race where your program is about to finish a failing
try_to_free_pages pass (note that a task exiting won't make
try_to_free_pages work any easier, try_to_free_pages has to free
allocated memory, it doesn't care if there's 1M or 100M of free memory).
If you don't check the watermarks after waiting for I/O, you're going to
generate a spurious oom-killing. Your changes can't help.
True, I did not take the I/O into account.
Post by Andrea Arcangeli
Note that even the watermark checks leaves a race window open, but at
least it's not an I/O window. While try_to_free_pages can wait for I/O
and then fail.
I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
cond_resched. Then you can try again with my simple way (w/ and w/o
PREEMPT ;).
Thanks for the great feedback.
Andrea Arcangeli
2004-12-03 22:21:45 UTC
Permalink
Post by Thomas Gleixner
Post by Andrea Arcangeli
Fork eventually failing is very reasonable if you're executing a fork
loop.
Yes, it's reasonable, but the effect that any consequent command is
aborted then is not so reasonable.
Did you use the 4k stacks on 2.6 btw?
Post by Thomas Gleixner
Use a forking server, connect a lot of clients and it is real life. :)
;)
Post by Thomas Gleixner
Yes, it was invoked
Ok good.
Andrea Arcangeli
2004-12-02 16:47:25 UTC
Permalink
Post by Thomas Gleixner
Reentrancy and follow up calls of oom_kill() are blocked until the task
which was killed by the first oom_kill() has actually released the
resources. I added a callback which is called from do_exit() when the
PF_MEMDIE flag is set. The reentrancy blocking is released inside the
callback.
This logic will certainly fix it, and I'm not against this logic to fix
the problem in a definitive way. It's only unfortunate PF_MEMDIE is smp
racy (plus having to check for PF_MEMDIE in the fast path).
Post by Thomas Gleixner
Post by Thomas Gleixner
From the first call to out_of_memory, which activates the reentrancy
blocking until the blocking is released in the callback, out_of_memory
is called more than 300 times.
I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.

zone->all_unreclaimable and zone->pages_scanned _must_ be set to 0 by
the oom_killer invocation, did you try to fix that?
Post by Thomas Gleixner
So it's up to you VM guys to fight out from which place you want call
out_of_memory(). I don't care as both places have exactly the same bad
side effects.
the reason for oom in page_alloc.c, is that you must check the free
pages levels correctly, your previous nr_free_pages() attempt was the
unreliable way to do that (it couldn't take into account all the
lowmem_reserve levels etc..).
Post by Thomas Gleixner
My concern is to make it reliable when it is finally invoked.
This approach of using a callback will sure work fine for that but the
300 times invocation of oom kill shows something is wrong in the VM, and
I believe the screwup is zone->all_unreclaimable and zone->pages_scanned
not being reset by the oom killer invocation. I suspect if you fix
that single bit, things will start working even without the callback.

Note that in 2.4 only one task gets killed, and there's no need of any
callback in any fast path to make it work. I'm not conceptually against
the callback to "guarantee" oom-killing racing avoidance, but right now
it sounds like it's hiding some more fundamental problem in 2.6.

Thanks.
Voluspa
2004-12-04 07:00:04 UTC
Permalink
Post by Andrea Arcangeli
You mean my patch is preventing your machine to boot? Then you're doing
something else wrong because it's impossible my patch is preventing
your machine to boot.
Same experience as Thomas here. Full stop like his first log (no errors)
. PIII (Celeron) ***@1 gig, 256 meg mem, 1 gig swap, preempt enabled.

Tried your patch since the oom killer slaughtered a very important app
here when another one ran amok. Not fork spawnings, just ram-eating. Was
blender (3d renderer) in "Sequence Editor" mode when i hit alt-a (for
animate) on a pretty large set of stills. Eventually blender got killed
also, twice...

Kernel 2.6.9 with nick p-s? patch for the buggy kswapd (100 percent cpu,
without using any swap).

Mvh
Mats Johannesson
Andrea Arcangeli
2004-12-04 08:08:40 UTC
Permalink
Post by Voluspa
Post by Andrea Arcangeli
You mean my patch is preventing your machine to boot? Then you're doing
something else wrong because it's impossible my patch is preventing
your machine to boot.
Same experience as Thomas here. Full stop like his first log (no errors)
The places I modified are never invoked during boot (except the below
one). It works like a charm here. I tried again building on top of
2.6.10-rc3 and it works again just fine here (previously I was on top of
the kernel CVS out of sync currently). No idea what's preventing you to
boot, but it's very hard to believe that my patch is to blame for it.

The only modified piece of code that may run during boot is this:

- might_sleep_if(wait);
+ if (wait)
+ cond_resched();


You can try to put back a might_sleep_if(wait), but if it deadlocks with
that change sure it's not a bug in my patch, it's instead a bug
somewhere else that calls alloc_pages w/o GFP_ATOMIC. Ingo's
lowlatency patch would expose the same bug too since they're aliasing
the might_sleep to cond_resched.

But I can't reproduce anything wrong here.

Enabling unmasked SIMD FPU exception support... done.
Checking 'hlt' instruction... OK.
tbxface-0118 [02] acpi_load_tables : ACPI Tables successfully
acquired
Parsing all Control
Methods:...........................................................................................................................................
Table [DSDT](id F004) - 502 Objects with 45 Devices 139 Methods 29
Regions
ACPI Namespace successfully loaded at root c0503d40
evxfevnt-0094 [03] acpi_enable : Transition to ACPI mode
successful
CPU0: Intel(R) XEON(TM) CPU 2.40GHz stepping 04
per-CPU timeslice cutoff: 1462.70 usecs.
[..]

machine boots and run just fine and the patch is definitely applied.

PREEMPT=n but it can't be a bug in my patch even if PREEMPT=y breaks.
Voluspa
2004-12-04 12:42:54 UTC
Permalink
Post by Andrea Arcangeli
You can try to put back a might_sleep_if(wait), but if it deadlocks
with
Post by Andrea Arcangeli
that change sure it's not a bug in my patch, it's instead a bug
somewhere else that calls alloc_pages w/o GFP_ATOMIC. Ingo's
lowlatency patch would expose the same bug too since they're aliasing
the might_sleep to cond_resched.
Putting it back doesn't alter the outcome - hanging. And the original
patch, (hope it was the right one) from:

http://marc.theaimsgroup.com/?l=linux-kernel&m=110204117506557&w=2

root:loke:/usr/src/linux-2.6.9-oomkill# patch -Np1 -i ../oomkill.patch
patching file mm/oom_kill.c
patching file mm/page_alloc.c
Hunk #1 succeeded at 608 (offset -3 lines).
Hunk #3 succeeded at 681 (offset -3 lines).
patching file mm/swap_state.c
patching file mm/vmscan.c

has been tried with the following variations. With and without
optimizing for size, with and without preempt, with and without kernel
boot params (cfq, lapic), cold and hot starts, and then I threw in a smp
compile for measure. All have the same behaviour:

[...]
Checking 'hlt' instruction... OK.

[10 minutes wait. Then a long callback trace
scrolls off the screen ending like Thomas']

<0>Kernel panic - not syncing: Fatal exception in interrupt

My toolchain (well, the whole software system) is quite contemporary
within the stable branches. Built from scratch with gcc-3.4.3, glibc-
20041011 (nptl) and binutils-2.15.92.0.2

No energy control, acpi-pm or whatever it's called, is used here. The
machine is extremely stable. Running with 100 percent utilization 24/7.

Don't shoot the messenger ;)
Mats Johannesson
Andrea Arcangeli
2004-12-04 16:43:53 UTC
Permalink
Post by Voluspa
Post by Andrea Arcangeli
You can try to put back a might_sleep_if(wait), but if it deadlocks
with
Post by Andrea Arcangeli
that change sure it's not a bug in my patch, it's instead a bug
somewhere else that calls alloc_pages w/o GFP_ATOMIC. Ingo's
lowlatency patch would expose the same bug too since they're aliasing
the might_sleep to cond_resched.
Putting it back doesn't alter the outcome - hanging. And the original
http://marc.theaimsgroup.com/?l=linux-kernel&m=110204117506557&w=2
yes it's the right one ;)
Post by Voluspa
root:loke:/usr/src/linux-2.6.9-oomkill# patch -Np1 -i ../oomkill.patch
patching file mm/oom_kill.c
patching file mm/page_alloc.c
Hunk #1 succeeded at 608 (offset -3 lines).
Hunk #3 succeeded at 681 (offset -3 lines).
patching file mm/swap_state.c
patching file mm/vmscan.c
has been tried with the following variations. With and without
optimizing for size, with and without preempt, with and without kernel
boot params (cfq, lapic), cold and hot starts, and then I threw in a smp
[...]
Checking 'hlt' instruction... OK.
[10 minutes wait. Then a long callback trace
scrolls off the screen ending like Thomas']
<0>Kernel panic - not syncing: Fatal exception in interrupt
My toolchain (well, the whole software system) is quite contemporary
within the stable branches. Built from scratch with gcc-3.4.3, glibc-
20041011 (nptl) and binutils-2.15.92.0.2
No energy control, acpi-pm or whatever it's called, is used here. The
machine is extremely stable. Running with 100 percent utilization 24/7.
Don't shoot the messenger ;)
I trust you of course but I've absolutely no idea how can my patch ever
change any code that runs at that point during boot. mm/oom_kill.c can
be obviously ruled out. The changes in mm/swap_state.c (two printk in
show_swap_cache_info) as well can be obviously ruled out. The change in
mm/vmscan.c as well only makes a difference during an oom condition.

This mean it has to be the change in mm/page_alloc.c that broke
something. But even that should never run during boot (except for the
cond_resched instead of might_sleep_if that you already tried to backout
separately from the rest). There's simply not enough memory pressure at
boot in order to recall try_to_free_pages and run the modified code.

If try_to_free_pages is being recalled during boot then we've a problem
somewhere else, it should never happen!

Plus it works like a charm here.

Can you send me your .config so that I will try to send you privately a
kernel image built on my machine? (and before sending I'll try to boot
it locally ;) My .config sure is happily running.

Many thanks.
Thomas Gleixner
2004-12-04 18:33:24 UTC
Permalink
Post by Andrea Arcangeli
If try_to_free_pages is being recalled during boot then we've a problem
somewhere else, it should never happen!
Plus it works like a charm here.
Can you send me your .config so that I will try to send you privately a
kernel image built on my machine? (and before sending I'll try to boot
it locally ;) My .config sure is happily running.
You want my .config too ? :)

I tried again from scratch and the kernel is booting without your patch.
Adding your patch with the same config still does not boot. It does not
depend on PREEMPT=y/n.

I added some debug output and it calls __alloc_pages a couple of times.
All those calls get out from the first goto got_pg as expected.

I will try to add some more debug later

tglx
Thomas Gleixner
2004-12-04 21:02:03 UTC
Permalink
Post by Thomas Gleixner
I added some debug output and it calls __alloc_pages a couple of times.
All those calls get out from the first goto got_pg as expected.
I will try to add some more debug later
Your assumption that reverting the

- might_sleep_if(wait);
+ if (wait)
+ cond_resched();

change does solve the problem is correct. Looking at the diffs its the
only change which can have any influence at this point.

Mats. I don't understand why this did not work for you. The change has
to be reverted to the original line "might_sleep_if(wait)" !

Scheduling in this init stage causes the breakage.
might_sleep_if() is a nop and only does a state check when compiled with
CONFIG_DEBUG_SPINLOCK_SLEEP=y, but it does neither sleep nor schedule.

It then works so far except that it kills the wrong process (sshd), but
I did expect that from the previous experience.

There is no multi kill or other strange things happening. I tested it
with hackbench and the real application _after_ adding my "whom to kill
patch" on top.

Looks much better now.

Can you agree to add the selection patch, which takes the multi child
forking process into account ? I don't explain again why it makes
sense :)

tglx
Andrea Arcangeli
2004-12-05 00:27:36 UTC
Permalink
Post by Thomas Gleixner
Post by Thomas Gleixner
I added some debug output and it calls __alloc_pages a couple of times.
All those calls get out from the first goto got_pg as expected.
I will try to add some more debug later
Your assumption that reverting the
- might_sleep_if(wait);
+ if (wait)
+ cond_resched();
change does solve the problem is correct. Looking at the diffs its the
only change which can have any influence at this point.
Mats. I don't understand why this did not work for you. The change has
to be reverted to the original line "might_sleep_if(wait)" !
Ok, so some piece of code is buggy: somebody is using GFP_KERNEL instead
of GFP_ATOMIC. Reverting my change will only hide the real bug so I
wouldn't recommend it (except for testing purposes).

Would be very nice to find the real bug.
Post by Thomas Gleixner
It then works so far except that it kills the wrong process (sshd), but
I did expect that from the previous experience.
There is no multi kill or other strange things happening. I tested it
with hackbench and the real application _after_ adding my "whom to kill
patch" on top.
Looks much better now.
So you mean there's a separate issue with the task selection right? I
didn't touch the task selection at all.
Post by Thomas Gleixner
Can you agree to add the selection patch, which takes the multi child
forking process into account ? I don't explain again why it makes
sense :)
I didn't recall that part of your patch, but it seems very orthogonal. I
didn't want to change the process selection at the same time. If I will
touch the task selection I'll probably rewrite it from scratch to choose
tasks only in function of the allocation rate, sure not with anything
similar to the current algorithm which is close to a DoS with big
database servers if some other smaller app hits a memleak and allocates
in a loop.
Thomas Gleixner
2004-12-05 14:55:07 UTC
Permalink
Post by Andrea Arcangeli
Ok, so some piece of code is buggy: somebody is using GFP_KERNEL instead
of GFP_ATOMIC. Reverting my change will only hide the real bug so I
wouldn't recommend it (except for testing purposes).
Would be very nice to find the real bug.
Hmm, most of the allocations in early init have the GFP_WAIT bit set.
When I block the call to cond_resched() up to cpu_idle() in rest_init()
everything works. Enabling the call to cond_resched() at any place
before brings the problem back.
Post by Andrea Arcangeli
So you mean there's a separate issue with the task selection right? I
didn't touch the task selection at all.
I know.
Post by Andrea Arcangeli
Post by Thomas Gleixner
Can you agree to add the selection patch, which takes the multi child
forking process into account ? I don't explain again why it makes
sense :)
I didn't recall that part of your patch, but it seems very orthogonal. I
Yes, the modification are not interfering with your patch. They just add
the accounting of child processes to the selection.
Post by Andrea Arcangeli
didn't want to change the process selection at the same time. If I will
touch the task selection I'll probably rewrite it from scratch to choose
tasks only in function of the allocation rate,
That makes sense, but it does not catch processes forking a lot of
childs, because the allocation rate is not accounted to the parent.
Post by Andrea Arcangeli
sure not with anything
similar to the current algorithm which is close to a DoS with big
database servers if some other smaller app hits a memleak and allocates
in a loop.
The comment above badness() is the best part of it, especially the
"least surprise part" :)

* 5) we try to kill the process the user expects us to kill, this
* algorithm has been meticulously tuned to meet the principle
* of least surprise ... (be careful when you change it)

tglx
Andrea Arcangeli
2004-12-05 15:34:04 UTC
Permalink
Post by Thomas Gleixner
Yes, the modification are not interfering with your patch. They just add
the accounting of child processes to the selection.
Could you post your modifications on top of my patch so we can combine
them easily?
Post by Thomas Gleixner
That makes sense, but it does not catch processes forking a lot of
childs, because the allocation rate is not accounted to the parent.
Not sure, the child could easily inherit the allocation rate of the parent.
So if the fork bomb spreads the last leaf in the process tree would
easily get accounted for every kernel stack/fds/etc.. and userspace
allocation done from its previous parents too.
Thomas Gleixner
2004-12-05 16:29:31 UTC
Permalink
Post by Andrea Arcangeli
Post by Thomas Gleixner
Yes, the modification are not interfering with your patch. They just add
the accounting of child processes to the selection.
Could you post your modifications on top of my patch so we can combine
them easily?
The patch is against 2.6.10-rc2-your-latest-patch

It adds the VM size of the child processes and tries to kill a child of
the selected process first. It could be discussed, if we just kill the
parent, but it turned out that the kill child first approach has less
unexpected results.
Post by Andrea Arcangeli
Post by Thomas Gleixner
That makes sense, but it does not catch processes forking a lot of
childs, because the allocation rate is not accounted to the parent.
Not sure, the child could easily inherit the allocation rate of the parent.
So if the fork bomb spreads the last leaf in the process tree would
easily get accounted for every kernel stack/fds/etc.. and userspace
allocation done from its previous parents too.
I think the current solution is just fine. I tested it thoroughly with
different scenarios and was not able to break it. The ugly time, count
hacks are gone and I have the impression that it tries harder to swap
out stuff now. I also tested with swap off and it works without hassle.
There may be some remaining "surprise" hidden in the selection, but I'm
not sure if it is worth the time to optimize it further or even rewrite
it with some other builtin surprises.

tglx
Voluspa
2004-12-05 02:22:21 UTC
Permalink
Post by Thomas Gleixner
Mats. I don't understand why this did not work for you. The change has
to be reverted to the original line "might_sleep_if(wait)" !
Well, I'm no coder and therefore follow instructions to the letter,
almost (; <-- ehum), which meant that "put back" became lines 613-615 in
mm/page_alloc.c

might_sleep_if(wait);
if (wait)
cond_resched();

Now interpreting it as the lines should be

might_sleep_if(wait);

/*

the kernel boots without problems. Have yet to test the oom-killer with
my rogue app. Won't return unless there's problems.

Mvh
Mats Johannesson
William Lee Irwin III
2004-12-05 02:52:36 UTC
Permalink
Post by t***@linutronix.de
The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.
The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill
Hmm, this thread seems to be serious. I'll audit the policy adjustments
for issues with the mechanisms (e.g. killing kernel threads, races with
timeouts).


-- wli
Thomas Gleixner
2004-12-05 13:38:22 UTC
Permalink
Post by William Lee Irwin III
Post by t***@linutronix.de
The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.
The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill
Hmm, this thread seems to be serious.
:)
Post by William Lee Irwin III
I'll audit the policy adjustments
for issues with the mechanisms (e.g. killing kernel threads, races with
timeouts).
Andrea provided a quite good fix for the invocation problem. I'm just
tracking down a different problem which was revealed by his changes.

The selection mechanism is currently taking a couple of things into
account:

- VM size
- nice value
- CPU time
- owner == root ?
- CAP_SYS_RAWIO

I found out that it is necessary to take the child processes into
account to detect processes which fork a lot of childs.

The current mechanism rather kills sshd or portmap as they happen to
have a bigger VM size than the process which forked a lot of childs.

So I added a check for the child processes with an own VM. This solved
the problem quite well. Of course it does not count the childs of PID <
2.

tglx
Andrea Arcangeli
2004-12-05 15:22:02 UTC
Permalink
Post by Thomas Gleixner
Andrea provided a quite good fix for the invocation problem. I'm just
tracking down a different problem which was revealed by his changes.
The selection mechanism is currently taking a couple of things into
- VM size
- nice value
- CPU time
- owner == root ?
- CAP_SYS_RAWIO
I found out that it is necessary to take the child processes into
account to detect processes which fork a lot of childs.
The current mechanism rather kills sshd or portmap as they happen to
have a bigger VM size than the process which forked a lot of childs.
Taking childs into account looks fine to me.
Voluspa
2004-12-05 08:32:09 UTC
Permalink
I know... Said that I wouldn't come back unless there was a problem. But
with these results! The patch is fantastic.

My problem app, blender, allocated all remaining physical memory and 360
megs (of the 1 gig) swap but remained running. No oom-kill involved at
all. This is a first with that app (in such a mode, with such a large
working set of pictures). 500 1.2 meg uncompressed targa pictures
animated in the graphical window of "Sequence Editor" on my system (256
meg real mem). Wow.

The oom-kill/swap/memory handling/whatever must have been really sick.

Thank You!
Mats Johannesson (time to pay back by trying 2.6.10-rc3)
William Lee Irwin III
2004-12-10 16:32:47 UTC
Permalink
Post by t***@linutronix.de
The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.
The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill
It appears the net functional change here is the reentrancy prevention;
the choice of tasks is policy. The functional change may be accomplished
with the following:


Index: linux-2.6.9/mm/oom_kill.c
===================================================================
--- linux-2.6.9.orig/mm/oom_kill.c 2004-10-18 14:54:30.000000000 -0700
+++ linux-2.6.9/mm/oom_kill.c 2004-12-10 08:21:31.000000000 -0800
@@ -237,7 +237,8 @@
static unsigned long first, last, count, lastkill;
unsigned long now, since;

- spin_lock(&oom_lock);
+ if (!spin_trylock(&oom_lock))
+ return;
now = jiffies;
since = now - last;
last = now;
@@ -282,9 +283,7 @@
show_free_areas();

/* oom_kill() sleeps */
- spin_unlock(&oom_lock);
oom_kill();
- spin_lock(&oom_lock);

reset:
/*
Thomas Gleixner
2004-12-10 16:52:33 UTC
Permalink
Post by William Lee Irwin III
Post by t***@linutronix.de
The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.
The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill
It appears the net functional change here is the reentrancy prevention;
the choice of tasks is policy. The functional change may be accomplished
Your patch would call yield() with the lock held. On an UP machine you
end up in the same code, as spin_locks are NOPs except for the preempt
part.

It's now obsolete by the fixes which were done by Andrea.

I'm wondering why he did not post the final version. Andrea ???

Attached is the latest working and tested patch. It contains Andrea's
fixes to the oom invocation and my modifications to the selection whom
to kill.

This should really go into mainline.

tglx
William Lee Irwin III
2004-12-10 17:43:42 UTC
Permalink
Post by Thomas Gleixner
Post by William Lee Irwin III
It appears the net functional change here is the reentrancy prevention;
the choice of tasks is policy. The functional change may be accomplished
Your patch would call yield() with the lock held. On an UP machine you
end up in the same code, as spin_locks are NOPs except for the preempt
part.
It's now obsolete by the fixes which were done by Andrea.
I'm wondering why he did not post the final version. Andrea ???
Attached is the latest working and tested patch. It contains Andrea's
fixes to the oom invocation and my modifications to the selection whom
to kill.
This should really go into mainline.
ARGH, preempt.


-- wli
William Lee Irwin III
2004-12-10 17:47:30 UTC
Permalink
Post by Thomas Gleixner
Post by William Lee Irwin III
It appears the net functional change here is the reentrancy prevention;
the choice of tasks is policy. The functional change may be accomplished
Your patch would call yield() with the lock held. On an UP machine you
end up in the same code, as spin_locks are NOPs except for the preempt
part.
It's now obsolete by the fixes which were done by Andrea.
I'm wondering why he did not post the final version. Andrea ???
Attached is the latest working and tested patch. It contains Andrea's
fixes to the oom invocation and my modifications to the selection whom
to kill.
This should really go into mainline.
This should all be split up; it's doing a dozen things at a time.


-- wli
Andrea Arcangeli
2004-12-10 17:49:38 UTC
Permalink
Post by Thomas Gleixner
I'm wondering why he did not post the final version. Andrea ???
I already posted the final version since it had no bugs and I asked to
get it merged twice. The only bugs are obviously in the drivers (or the
callers) and they needs urgent fixing and additionally the
might_sleep_if must stop checking if the system is running so these bugs
can see the light of the day. Not being allowed to schedule in
alloc_pages with __GFP_WAIT set is a mistake.

Your patch was orthogonal to mine, so I didn't merge it. Go figure that
every time I post something it gets splitted into trivial pieces, so
it's a waste of time to try to merge any additional patch and post a
final one since it'll never be final anyway.

I am about to merge the things together for some other tree (not
mainline), that is a worthwhile effort but with the split behaviour of
mainline, for mainline it'd be a waste of time.

One last thing worth discussing on my side is if we should worry about
the tiny race between the watermark checks and the entering of the oom
killing. In theory we could wrap the thing around a semaphore and close
the race completely, though current code is simpler and as you find
it works fine in practice.
William Lee Irwin III
2004-12-10 17:57:06 UTC
Permalink
Post by Andrea Arcangeli
Your patch was orthogonal to mine, so I didn't merge it. Go figure that
every time I post something it gets splitted into trivial pieces, so
it's a waste of time to try to merge any additional patch and post a
final one since it'll never be final anyway.
I am about to merge the things together for some other tree (not
mainline), that is a worthwhile effort but with the split behaviour of
mainline, for mainline it'd be a waste of time.
One last thing worth discussing on my side is if we should worry about
the tiny race between the watermark checks and the entering of the oom
killing. In theory we could wrap the thing around a semaphore and close
the race completely, though current code is simpler and as you find
it works fine in practice.
The easy way to fix that issue is to take the whole diff and break off
pieces, with the remainder always as the last patch. That way the whole
set of changes stays pending and appears intact at the end of the series.

I will personally be held responsible for identifying the causes of
behavioral changes in the OOM killer, and am having to investigate
several instances of bad OOM killer behavior already, so I have to do
this anyway, and so it might as well be done for mainline.


-- wli
William Lee Irwin III
2004-12-12 00:12:41 UTC
Permalink
Post by William Lee Irwin III
The easy way to fix that issue is to take the whole diff and break off
pieces, with the remainder always as the last patch. That way the whole
set of changes stays pending and appears intact at the end of the series.
I will personally be held responsible for identifying the causes of
behavioral changes in the OOM killer, and am having to investigate
several instances of bad OOM killer behavior already, so I have to do
this anyway, and so it might as well be done for mainline.
Actually, I'm dropping my whole public effort and am going to stick to
using what I do in this respect for internal testing etc.


-- wli
Andrea Arcangeli
2004-12-24 01:18:33 UTC
Permalink
Post by Andrea Arcangeli
+ if ((p->flags & PF_MEMDIE) ||
+ ((p->flags & PF_EXITING) && !(p->flags & PF_DEAD)))
this had to be:

if (((p->flags & PF_MEMDIE) || (p->flags & PF_EXITING)) &&
!(p->flags & PF_DEAD))

I didn't take zombies into account. A task may be in memdie state and zombie at
the same time. We must not wait for PF_MEMDIE to go away completely, we must wait
only until PF_DEAD is not set. After PF_DEAD is set we know all ram we
were waiting for is already in the free list.

I noticed some deadlocks during an oom-torture-test before applying the stuff
into the suse kernel. The above change fixed all my deadlocks. Everything else
was working fine already.

Actually before finding the above bug I fixed PF_MEMDIE too and I converted it
to p->memdie, an unsigned char. All archs should support 1 byte granularity
for per-process atomic instructions, it's near used_math that I also converted
to a char to signal it cannot be a bitflag sharing the same char with globals.

Struct layout looks like this.

/*
* All archs should support atomic ops with
* 1 byte granularity.
*/
unsigned char memdie;
/*
* Must be changed atomically so it shouldn't be
* be a shareable bitflag.
*/
unsigned char used_math;
/*
* OOM kill score adjustment (bit shift).
* Cannot live together with used_math since
* used_math and oomkilladj can be changed at the
* same time, so they would race if they're in the
* same atomic block.
*/
short oomkilladj;

As for the |= PF_MEMALLOC in oom_kill_task that was a gratuitous smp breakage,
I didn't need to do anything in PF_MEMALLOC since alloc_pages checks _both_
PF_MEMALLOC and p->memdie already.

I also added a !chosen in the below code to make that logic more self
contained and less dependent on the badness implementation (should never
be necessary though):

if (points > maxpoints || !chosen) {
chosen = p;
maxpoints = points;
}

I can port the final patch (including fixage of PF_MEMDIE races) to 2.6.10-rc
after I finished.

Loading...