Discussion:
[libav-devel] [PATCH 1/8] h264: fix overreads in cabac reader.
Ronald S. Bultje
2012-03-17 16:34:51 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-***@libav.org
---
libavcodec/cabac_functions.h | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index b150aab..4c74cf7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -47,7 +47,8 @@ static void refill(CABACContext *c){
         c->low+= c->bytestream[0]<<1;
 #endif
     c->low -= CABAC_MASK;
-    c->bytestream+= CABAC_BITS/8;
+    if (c->bytestream < c->bytestream_end)
+        c->bytestream += CABAC_BITS / 8;
 }
 
 static inline void renorm_cabac_decoder_once(CABACContext *c){
@@ -74,7 +75,8 @@ static void refill2(CABACContext *c){
 #endif
 
     c->low += x<<i;
-    c->bytestream+= CABAC_BITS/8;
+    if (c->bytestream < c->bytestream_end)
+        c->bytestream += CABAC_BITS/8;
 }
 
 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
--
1.7.2.1
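For readers skimming the patch, a minimal, self-contained sketch of the pattern these hunks introduce (simplified struct and constants, not the actual libavcodec code): the bytes are still read on every refill, but the pointer only advances while it is below bytestream_end, so a decoder that keeps asking for bits after the input is exhausted stays pinned near the end of the buffer instead of walking arbitrarily far past it.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for CABACContext: only the fields the guard touches. */
typedef struct {
    const uint8_t *bytestream;
    const uint8_t *bytestream_end;
    unsigned       low;
} MiniCabac;

/* Guarded refill in the spirit of the patch above. */
static void refill_guarded(MiniCabac *c)
{
    c->low += (c->bytestream[0] << 9) + (c->bytestream[1] << 1);
    if (c->bytestream < c->bytestream_end)
        c->bytestream += 2;        /* CABAC_BITS / 8 with CABAC_BITS == 16 */
}

int main(void)
{
    /* bytestream_end is set two bytes before the physical end, playing the
     * role of input padding, so the clamped reads stay inside the array. */
    static const uint8_t buf[4] = { 0x12, 0x34, 0x56, 0x78 };
    MiniCabac c = { buf, buf + sizeof(buf) - 2, 0 };

    for (int i = 0; i < 8; i++)    /* many more refills than the data supports */
        refill_guarded(&c);
    printf("pointer offset after 8 refills: %td\n", c.bytestream - buf);
    return 0;
}

In the real decoder the input buffer is expected to carry FF_INPUT_BUFFER_PADDING_SIZE bytes of padding after its end, which plays the role of the two spare bytes in this toy example.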
Ronald S. Bultje
2012-03-17 16:34:52 UTC
Permalink
---
libavcodec/x86/cabac.h | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..c4832c3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
     x86_reg tmp;
     __asm__ volatile(
-        "movl %4, %k1                           \n\t"
-        "movl %2, %%eax                         \n\t"
+        "movl %c5(%2), %k1                      \n\t"
+        "movl %c3(%2), %%eax                    \n\t"
         "shl $17, %k1                           \n\t"
         "add %%eax, %%eax                       \n\t"
         "sub %k1, %%eax                         \n\t"
@@ -117,7 +117,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "sub %%edx, %%ecx                       \n\t"
         "test %%ax, %%ax                        \n\t"
         " jnz 1f                                \n\t"
-        "mov  %3, %1                            \n\t"
+        "mov  %c4(%2), %1                       \n\t"
         "subl $0xFFFF, %%eax                    \n\t"
         "movzwl (%1), %%edx                     \n\t"
         "bswap %%edx                            \n\t"
@@ -126,11 +126,14 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "addl %%edx, %%eax                      \n\t"
         "mov  %1, %3                            \n\t"
         "1:                                     \n\t"
-        "movl %%eax, %2                         \n\t"
+        "movl %%eax, %c4(%2)                    \n\t"
 
-        :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
-        :"m"(c->range)
-        : "%eax", "%edx"
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
     );
     return val;
 }
--
1.7.2.1
Måns Rullgård
2012-03-17 16:54:50 UTC
Permalink
Post by Ronald S. Bultje
---
libavcodec/x86/cabac.h | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..c4832c3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %4, %k1 \n\t"
- "movl %2, %%eax \n\t"
+ "movl %c5(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
%c5? Last I checked, the code to get a plain number was 'a'.

[...]
Post by Ronald S. Bultje
- :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
- :"m"(c->range)
- : "%eax", "%edx"
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
We changed this to use "m" operands to avoid the memory clobber. I know
why you're doing this, but I think it's the wrong approach.
--
Måns Rullgård
***@mansr.com
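For readers less familiar with gcc constraint syntax, here is a minimal side-by-side of the two styles the review is contrasting, written against a made-up two-field struct rather than the real CABACContext: each touched field as an explicit "+m" operand, versus a base register plus "i"(offsetof(...)) displacements and a blanket "memory" clobber, which is what the patch switches to.

#include <stddef.h>

typedef struct { int low; int range; } Ctx;   /* made-up mini context */

/* Style 1: the touched field is an explicit "+m" operand.  The compiler
 * knows exactly which memory the asm reads and writes, and may keep any
 * other value cached in a register across the statement. */
static inline void inc_low_m(Ctx *c)
{
    __asm__ ("addl $1, %0" : "+m"(c->low));
}

/* Style 2: only the base pointer is passed in; the field is reached through
 * an immediate offsetof() displacement printed with the %c modifier, and a
 * "memory" clobber tells the compiler that any memory may have changed. */
static inline void inc_low_offset(Ctx *c)
{
    __asm__ volatile ("addl $1, %c1(%0)"
                      :
                      : "r"(c), "i"(offsetof(Ctx, low))
                      : "memory");
}

The trade-off being argued: the first style gives the optimizer precise information but needs one operand per touched field, which is where the register-allocation trouble discussed later in the thread comes from; the second needs only a single register for the whole struct, but the "memory" clobber forces the compiler to assume everything in memory is dirty after the asm.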
Ronald S. Bultje
2012-03-17 18:57:34 UTC
Permalink
Hi,
[...]
+        "movl %c5(%2), %k1                      \n\t"
+        "movl %c3(%2), %%eax                    \n\t"
%c5?  Last I checked, the code to get a plain number was 'a'.
Fixed.
-        :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
-        :"m"(c->range)
-        : "%eax", "%edx"
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
We changed this to use "m" operands to avoid the memory clobber.  I know
why you're doing this, but I think it's the wrong approach.
It generates better code (fewer instructions for e.g.
decode_cabac_mb_mvd()) with gcc-4.2.1 (which is shipped with Xcode).
Does it generate worse code anywhere? (It's true that later on it adds
instructions for the overread protection again, but this commit in
isolation makes things better, not worse.)

Ronald
Ronald S. Bultje
2012-03-18 14:55:22 UTC
Permalink
Hi,
Post by Ronald S. Bultje
[...]
Ping, are there any practical concerns left? I want this patchset in,
it's our last big class of overread bugs (in addition to all decoders
that we haven't converted from bytestream to bytestream2 yet).

Ronald
Ronald S. Bultje
2012-03-19 13:40:46 UTC
Permalink
Hi,
Post by Ronald S. Bultje
[...]
Ping2.

Ronald
Måns Rullgård
2012-03-19 13:42:49 UTC
Permalink
Post by Ronald S. Bultje
[...]
Ping, are there any practical concerns left?
We're still no closer to understanding what really is going on here.
Post by Ronald S. Bultje
I want this patchset in,
That's not a valid argument.
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-19 13:51:04 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
Ping, are there any practical concerns left?
We're still no closer to understanding what really is going on here.
You'll have to be more practical than "I don't get it, so let's do
nothing". Do something to understand it. This patchset improves things
on my end (better code, compiler doesn't bomb out on adding an extra
argument such as bytestream_end), which is more than sufficient.

Ronald
Måns Rullgård
2012-03-19 13:54:21 UTC
Permalink
Post by Ronald S. Bultje
[...]
You'll have to be more practical than "I don't get it, so let's do
nothing". Do something to understand it. This patchset improves things
on my end (better code, compiler doesn't bomb out on adding an extra
argument such as bytestream_end), which is more than sufficient.
Your compiler seems to be the only one where it gives better code.
There is no guarantee that your compiler will keep doing this next time
you upgrade it. Since I can't reproduce the problem, I'm not in a very
good position to figure out why it happens. You can, and you're the one
pushing for these patches, so the work falls to you. Tough luck.
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-19 14:16:05 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
Your compiler seems to be the only one where it gives better code.
There is no guarantee that your compiler will keep doing this next time
you upgrade it.  Since I can't reproduce the problem, I'm not in a very
good position to figure out why it happens.
My compiler has been like that for years.
Post by Måns Rullgård
 You can, and you're the one
pushing for these patches, so the work falls to you.  Tough luck.
You're not very clear on what you want. You want the holy grail? You
want a time machine? You want a better pension? What falls on me? I've
written code that is (if I understand you correctly) the same for you,
and better for me. That's fantastic! So does that mean we agree I can
commit it? If not, what exactly is your problem with this code?

Ronald
Måns Rullgård
2012-03-19 15:10:05 UTC
Permalink
Post by Ronald S. Bultje
[...]
You're not very clear on what you want. You want the holy grail? You
want a time machine? You want a better pension? What falls on me? I've
written code that is (if I understand you correctly) the same for you,
and better for me. That's fantastic! So does that mean we agree I can
commit it? If not, what exactly is your problem with this code?
You've made changes that have very unexpected results. This is never a
good thing unless the reasons are understood.
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-19 15:34:32 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
You've made changes that have very unexpected results.  This is never a
good thing unless the reasons are understood.
Yes: the compiler screwed up, and I fixed it.

Now, this isn't going anywhere. What are you looking for? I need a
concrete thing that you intend me to do, else I'll simply have to
commit as-is.

Ronald
Måns Rullgård
2012-03-19 15:42:27 UTC
Permalink
Post by Ronald S. Bultje
[...]
Yes: the compiler screwed up, and I fixed it.
No, you did not fix it. You randomly hacked around until it by chance
did what you wanted.
Post by Ronald S. Bultje
Now, this isn't going anywhere. What are you looking for? I need a
concrete thing that you intend me to do,
I want to understand what is causing the compiler to screw up in the
first place. If we figure that out, we might find a clean solution.
Usually the first step is to reduce the problem to a smaller test case.
The function where this is happening isn't very large, so this should be
fairly easy.
Post by Ronald S. Bultje
else I'll simply have to commit as-is.
I don't like such threats.
--
Måns Rullgård
***@mansr.com
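As a sketch of the reduction being asked for here (file name, struct layout and the driver below are hypothetical, chosen only so the file compiles on its own): pull one inline-asm helper into a tiny translation unit together with a small caller, build it once per constraint style with -S, and diff the assembly generated for the caller.

/* cabac_reduce.c -- hypothetical reduced test case, not from the thread.
 * Example build for the 32-bit case the thread is about:
 *   gcc -m32 -O2 -S cabac_reduce.c -o offsetof_variant.s
 * then swap in a "+m"-operand version of the helper and diff the .s files. */
#include <stddef.h>
#include <stdint.h>

typedef struct {
    int low;
    int range;
    const uint8_t *bytestream;
} MiniCABACContext;                 /* stand-in for the real CABACContext */

/* offsetof/"memory"-clobber style helper, trimmed down to one load. */
static inline int load_low(MiniCABACContext *c)
{
    int v;
    __asm__ volatile ("movl %c2(%1), %0"
                      : "=r"(v)
                      : "r"(c), "i"(offsetof(MiniCABACContext, low))
                      : "memory");
    return v;
}

/* Small caller standing in for decode_cabac_mb_mvd(): just enough
 * surrounding code that the register allocator has work to do. */
int drive(MiniCABACContext *c, int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
        sum += load_low(c) + i;
    return sum;
}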
Ronald S. Bultje
2012-03-19 15:46:54 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
I want to understand what is causing the compiler to screw up in the
first place.  If we figure that out, we might find a clean solution.
Usually the first step is to reduce the problem to a smaller test case.
The function where this is happening isn't very large, so this should be
fairly easy.
I'm waiting. This is not concrete. What is your plan? You're currently
holding this up. I need an ETA.

Ronald
Jason Garrett-Glaser
2012-03-19 16:48:52 UTC
Permalink
[...]
-        :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
-        :"m"(c->range)
-        : "%eax", "%edx"
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
IMO clobbering memory looks very very hacky, and I don't like it. If
you need to clobber something, it'd be much better if we could clobber
exactly what needs clobbering, and nothing more.

Jason
Ronald S. Bultje
2012-03-19 17:14:31 UTC
Permalink
Hi,
[...]
IMO clobbering memory looks very very hacky, and I don't like it.  If
you need to clobber something, it'd be much better if we could clobber
exactly what needs clobbering, and nothing more.
Well, I don't think inline assembly supports explicitly clobbering
variables without marking them as "+m" or "+r", which messes up the
register allocator for at least gcc-4.2.1 (it uses a different
register for each "m"(c->...), thus running out of registers; yes,
there are many things wrong there).

Ronald
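To make the failure mode described above concrete, the constraint style in question looks roughly like this (hypothetical trimmed-down struct; the asm body is left empty because only the operand list matters). The report is that gcc 4.2.1 and clang 2.9 materialize a separate address register for every "+m"(c->field) operand instead of reusing one base register, so on 32-bit x86 a handful of such operands plus the asm's real register operands and clobbers exhausts the eight GPRs.

#include <stdint.h>

typedef struct {                        /* hypothetical mini CABACContext */
    int low, range, outstanding;
    const uint8_t *bytestream, *bytestream_end;
} MiniCtx;

/* One "+m" operand per touched field: precise, but per the report above the
 * older compilers burn an address register for each operand. */
static inline void touch_fields(MiniCtx *c)
{
    __asm__ volatile ("" /* body omitted; the constraints are the point */
                      : "+m"(c->low), "+m"(c->range), "+m"(c->outstanding),
                        "+m"(c->bytestream), "+m"(c->bytestream_end)
                      :
                      : "%eax", "%edx", "%ecx");
}

Whether that behaviour is simply a compiler bug to be reported, or something the libavcodec code should work around, is what the rest of the thread argues about.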
Jason Garrett-Glaser
2012-03-19 19:03:54 UTC
Permalink
Post by Ronald S. Bultje
[...]
You can clobber a memory location without referencing it in the asm,
and thus without allocating a register for it.

Jason
Ronald S. Bultje
2012-03-19 19:25:11 UTC
Permalink
Hi,
Post by Jason Garrett-Glaser
[...]
You can clobber a memory location without referencing it in the asm,
and thus without allocating a register for it.
That sounds useful, how do I do that?

Ronald
Jason Garrett-Glaser
2012-03-19 20:12:09 UTC
Permalink
Post by Ronald S. Bultje
[...]
That sounds useful, how do I do that?
Just add +m arguments and don't use them, that's all.

Here's an example from an unfinished patch of mine:

+static ALWAYS_INLINE void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
+{
+    asm(
+        "call %P8\n"
+        :"+S"(i_ctx),"+d"(b), "+D"(cb->i_range), "+m"(cb->i_low), "+m"(cb->i_queue), "+m"(cb->i_bytes_outstanding), "+m"(cb->p)
+        :"a"(cb),"X"(x264_cabac_encode_decision_asm)
+        :"%ecx"
+    );
+}

Jason
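A stand-alone restatement of the trick Jason is pointing at, separate from the x264 snippet above (struct and field names here are hypothetical): the extra "+m" operands are never mentioned in the template, they exist only so the compiler knows those specific fields may change, which is the narrower alternative to a blanket "memory" clobber.

typedef struct { int low; int bytes_outstanding; } Cb;   /* hypothetical */

static inline void clear_low(Cb *cb)
{
    /* The template only uses %2 (the pointer register); operands %0 and %1
     * are unused "+m" outputs that merely declare which fields may have
     * been modified by the asm. */
    __asm__ volatile ("movl $0, (%2)"
                      : "+m"(cb->low), "+m"(cb->bytes_outstanding)
                      : "r"(cb));
}

Ronald's follow-up below explains why this is exactly the style that was tripping up gcc 4.2.1's register allocator in the CABAC code.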
Ronald S. Bultje
2012-03-19 20:43:21 UTC
Permalink
Hi,
Post by Jason Garrett-Glaser
[...]
Just add +m arguments and don't use them, that's all.
[...]
That's how we got here in the first place. gcc-4.2.1 and clang-2.9
allocate a register for "+m"(struct->val) pairs, causing the compiler
to run out of registers. It simply won't compile, as silly as that
sounds.

Ronald
Jason Garrett-Glaser
2012-03-19 20:51:25 UTC
Permalink
Post by Ronald S. Bultje
[...]
That's how we got here in the first place. gcc-4.2.1 and clang-2.9
allocate a register for "+m"(struct->val) pairs, causing the compiler
to run out of registers. It simply won't compile, as silly as that
sounds.
That's a compiler bug, make them fix it.

Jason
Alex Converse
2012-03-19 23:08:54 UTC
Permalink
Post by Jason Garrett-Glaser
[...]
That's a compiler bug, make them fix it.
Måns Rullgård
2012-03-19 23:43:25 UTC
Permalink
Post by Ronald S. Bultje
Post by Ronald S. Bultje
Hi,
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
On Mon, Mar 19, 2012 at 10:14 AM, Ronald S. Bultje <
On Mon, Mar 19, 2012 at 9:48 AM, Jason Garrett-Glaser <
On Sat, Mar 17, 2012 at 9:34 AM, Ronald S. Bultje <
Post by Ronald S. Bultje
---
libavcodec/x86/cabac.h | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..c4832c3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int
get_cabac_bypass_sign_x86(CABACContext *c, int val)
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
Post by Ronald S. Bultje
{
x86_reg tmp;
__asm__ volatile(
- "movl %4, %k1 \n\t"
- "movl %2, %%eax \n\t"
+ "movl %c5(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
@@ -117,7 +117,7 @@ static av_always_inline int
get_cabac_bypass_sign_x86(CABACContext *c, int val)
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
Post by Ronald S. Bultje
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
" jnz 1f \n\t"
- "mov %3, %1 \n\t"
+ "mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
@@ -126,11 +126,14 @@ static av_always_inline int
get_cabac_bypass_sign_x86(CABACContext *c, int val)
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
Post by Ronald S. Bultje
"addl %%edx, %%eax \n\t"
"mov %1, %3 \n\t"
"1: \n\t"
- "movl %%eax, %2 \n\t"
+ "movl %%eax, %c4(%2) \n\t"
- :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
- :"m"(c->range)
- : "%eax", "%edx"
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
IMO clobbering memory looks very very hacky, and I don't like it.
If
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
you need to clobber something, it'd be much better if we could
clobber
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
exactly what needs clobbering, and nothing more.
Well, I don't think inline assembly supports explicitly clobbering
variables without marking them as "+m" or "+r", which messes up the
register allocator for at least gcc-4.2.1 (it uses a different
register for each "m"(c->...), thus running out of registers; yes,
there are many things wrong there).
You can clobber a memory location without referencing it in the asm,
and thus without allocating a register for it.
That sounds useful, how do I do that?
Just add +m arguments and don't use them, that's all.
+static ALWAYS_INLINE void x264_cabac_encode_decision( x264_cabac_t
*cb, int i_ctx, int b )
+{
+ asm(
+ "call %P8\n"
+ :"+S"(i_ctx),"+d"(b), "+D"(cb->i_range), "+m"(cb->i_low),
"+m"(cb->i_queue), "+m"(cb->i_bytes_outstanding), "+m"(cb->p)
+ :"a"(cb),"X"(x264_cabac_encode_decision_asm)
+ :"%ecx"
+ );
+}
That's how we got here in the first place. gcc-4.2.1 and clang-2.9
allocate a register for "+m"(struct->val) pairs, causing the compiler
to run out of registers. It simply won't compile, as silly as that
sounds.
That's a compiler bug, make them fix it.
Does it do that even if the operand is never used explicitly?
Luca Barbato
2012-03-20 00:37:27 UTC
Permalink
On 19/03/12 16:43, Måns Rullgård wrote:
Gil Pedersen
2012-03-20 08:33:21 UTC
Permalink
Martin Storsjö
2012-03-20 09:42:05 UTC
Permalink
Ronald S. Bultje
2012-03-22 03:25:06 UTC
Permalink
Hi,
Måns Rullgård
2012-03-22 11:17:11 UTC
Permalink
Hi,
Diego Biurrun
2012-03-22 11:37:30 UTC
Permalink
[...]
Cut it out already, you two!

Ronald: Don't respond.

Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.

Diego
Måns Rullgård
2012-03-22 11:39:42 UTC
Permalink
Post by Diego Biurrun
[...]
Cut it out already, you two!
Ronald: Don't respond.
Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.
That's not how it works.
--
Måns Rullgård
***@mansr.com
Diego Biurrun
2012-03-22 11:44:12 UTC
Permalink
Post by Måns Rullgård
Post by Diego Biurrun
[...]
Cut it out already, you two!
Ronald: Don't respond.
Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.
That's not how it works.
Blanket statements will not help us get anywhere at this point.

You told me you would debug it yourself if only you could reproduce the
problem. Would you or would you not? If yes, who can give you a shell,
if no, what specific steps do you suggest instead?

Diego
Måns Rullgård
2012-03-22 11:55:55 UTC
Permalink
Post by Diego Biurrun
Post by Måns Rullgård
Post by Diego Biurrun
Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.
That's not how it works.
Blanket statements will not help us get anywhere at this point.
Ronald wants the patch applied. Thus it is his responsibility to prove
it correct. That's how it works.
Post by Diego Biurrun
You told me you would debug it yourself if only you could reproduce the
problem. Would you or would you not? If yes, who can give you a shell,
if no, what specific steps do you suggest instead?
The problem is the function decode_cabac_mb_mvd() from h264_cabac.c.
The first thing to do is to isolate just this function and see if the
problem remains. If it does, trim the function down to the bare minimum
that still exhibits the problem. It's about 30 lines to begin with, so
it really isn't that much work. I already tried to explain this once,
only to be told to "fuck off."
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-17 16:34:53 UTC
Permalink
---
libavcodec/x86/cabac.h | 44 ++++++++++++++++++++++----------------------
1 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index c4832c3..6809309 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,28 +105,28 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %c5(%2), %k1 \n\t"
- "movl %c3(%2), %%eax \n\t"
- "shl $17, %k1 \n\t"
- "add %%eax, %%eax \n\t"
- "sub %k1, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %k1 \n\t"
- "add %k1, %%eax \n\t"
- "xor %%edx, %%ecx \n\t"
- "sub %%edx, %%ecx \n\t"
- "test %%ax, %%ax \n\t"
- " jnz 1f \n\t"
- "mov %c4(%2), %1 \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%1), %%edx \n\t"
- "bswap %%edx \n\t"
- "shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
- "addl %%edx, %%eax \n\t"
- "mov %1, %3 \n\t"
- "1: \n\t"
- "movl %%eax, %c4(%2) \n\t"
+ "movl %c5(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
+ "shl $17, %k1 \n\t"
+ "add %%eax, %%eax \n\t"
+ "sub %k1, %%eax \n\t"
+ "cltd \n\t"
+ "and %%edx, %k1 \n\t"
+ "add %k1, %%eax \n\t"
+ "xor %%edx, %%ecx \n\t"
+ "sub %%edx, %%ecx \n\t"
+ "test %%ax, %%ax \n\t"
+ "jnz 1f \n\t"
+ "mov %c4(%2), %1 \n\t"
+ "subl $0xFFFF, %%eax \n\t"
+ "movzwl (%1), %%edx \n\t"
+ "bswap %%edx \n\t"
+ "shrl $15, %%edx \n\t"
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %c4(%2) \n\t"
+ "1: \n\t"
+ "movl %%eax, %c3(%2) \n\t"

: "+c"(val), "=&r"(tmp)
: "r"(c),
--
1.7.2.1
Ronald S. Bultje
2012-03-17 16:34:54 UTC
Permalink
---
libavcodec/x86/cabac.h | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 6809309..acf1c46 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,7 +105,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %c5(%2), %k1 \n\t"
+ "movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
@@ -122,9 +122,10 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
- "mov %1, %c4(%2) \n\t"
+ "cmp %c5(%2), %1 \n\t"
+ "jge 1f \n\t"
+ "add"OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"

@@ -132,6 +133,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
--
1.7.2.1
Ronald S. Bultje
2012-03-17 16:34:55 UTC
Permalink
---
libavcodec/x86/cabac.h | 10 +++++-----
libavcodec/x86/h264_i386.h | 32 +++++++++++++++++---------------
2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index acf1c46..525ace6 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -89,11 +89,11 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
int bit, tmp;

__asm__ volatile(
- BRANCHLESS_GET_CABAC("%0", "(%5)", "%1", "%w1", "%2",
- "%3", "%b3", "%4")
- :"=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp),
- "+m"(c->bytestream)
- :"r"(state)
+ BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
+ "%2", "%3", "%b3", "%c6(%5)")
+ : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "r"(state), "r"(c),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index e195e04..86066db 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -48,15 +48,15 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
__asm__ volatile(
"2: \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c11(%6)")

"test $1, %4 \n\t"
" jz 3f \n\t"
"add %10, %1 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c11(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -80,10 +80,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"4: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
- "+&r"(c->low), "=&r"(bit), "+&r"(c->range),
- "+m"(c->bytestream)
- :"m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
+ : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
+ "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
+ : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -105,8 +105,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c12(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -115,8 +115,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c12(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -137,9 +137,11 @@ static int decode_significance_8x8_x86(CABACContext *c,
"4: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count),"+m"(last), "+m"(index), "+&r"(c->low), "=&r"(bit),
- "+&r"(c->range), "=&r"(state), "+m"(c->bytestream)
- :"m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_coeff_ctx_base)
+ : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ "=&r"(bit), "+&r"(c->range), "=&r"(state)
+ : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
+ "m"(sig_off), "m"(last_coeff_ctx_base),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
Ronald S. Bultje
2012-03-19 14:18:56 UTC
Permalink
Hi,
---
 libavcodec/x86/cabac.h     |   10 +++++-----
 libavcodec/x86/h264_i386.h |   32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 20 deletions(-)
Ping for these 4 patches also (5/8-8/8), they touch a different part
of the code and can thus be done separately.

Ronald
Alexander Strange
2012-03-25 06:17:19 UTC
Permalink
Post by Ronald S. Bultje
Hi,
---
 libavcodec/x86/cabac.h     |   10 +++++-----
 libavcodec/x86/h264_i386.h |   32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 20 deletions(-)
Ping for these 4 patches also (5/8-8/8), they touch a different part
of the code and can thus be done separately.
Ronald
What is this for? I can see it making code density a bit worse
(compiler can't keep c->bytestream in a register between asm
statements).
Ronald S. Bultje
2012-03-25 14:54:16 UTC
Permalink
Hi,

On Sat, Mar 24, 2012 at 11:17 PM, Alexander Strange
Post by Alexander Strange
Post by Ronald S. Bultje
---
 libavcodec/x86/cabac.h     |   10 +++++-----
 libavcodec/x86/h264_i386.h |   32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 20 deletions(-)
Ping for these 4 patches also (5/8-8/8), they touch a different part
of the code and can thus be done separately.
What is this for? I can see it making code density a bit worse
(compiler can't keep c->bytestream in a register between asm
statements).
Without this patch, some compilers (see other email) mess up register
allocation, and adding another "+m"(..) makes them run out of registers
and fail to compile. Yes, it's a compiler bug, but it's widespread enough
that a workaround is warranted, especially since it also helps
"bug-free" compilers generate better code (again, see other patch).

Ronald
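For reference, a rough standalone sketch of the addressing style the patches
switch to -- a single "r" operand for the context pointer plus "i" operands
carrying offsetof() values, with fields accessed as %cN(ptr) inside the
template -- could look like this (toy struct, not the real CABACContext):

#include <stddef.h>
#include <stdint.h>

struct ctx { int low; int range; const uint8_t *bytestream; };

static inline int low_plus_range(struct ctx *c)
{
    int sum;
    __asm__ volatile(
        "movl %c2(%1), %0           \n\t" /* sum  = c->low   */
        "addl %c3(%1), %0           \n\t" /* sum += c->range */
        : "=&r"(sum)
        : "r"(c),
          "i"(offsetof(struct ctx, low)),
          "i"(offsetof(struct ctx, range))
        : "memory");
    return sum;
}

However many fields the template touches, only the one pointer register is
needed, which is the benefit being claimed over one "m" operand per field.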
Ronald S. Bultje
2012-03-17 16:34:56 UTC
Permalink
---
libavcodec/x86/cabac.h | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 525ace6..78d8af7 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -27,7 +27,7 @@
#include "config.h"

#if HAVE_FAST_CMOV
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
@@ -37,7 +37,7 @@
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
@@ -57,7 +57,7 @@
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
- BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp) \
+ BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp) \
"movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
--
1.7.2.1
Benjamin Larsson
2012-03-20 11:58:32 UTC
Permalink
OK
Ronald S. Bultje
2012-03-17 16:34:57 UTC
Permalink
---
libavcodec/x86/h264_i386.h | 24 ++++++++++++------------
1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 86066db..131d29d 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -46,13 +46,13 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
int bit;
x86_reg coeff_count;
__asm__ volatile(
- "2: \n\t"
+ "3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
"%5", "%k0", "%b0", "%c11(%6)")

"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
@@ -65,19 +65,19 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"movl %%ecx, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
@@ -99,7 +99,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
x86_reg state;
__asm__ volatile(
"mov %1, %6 \n\t"
- "2: \n\t"
+ "3: \n\t"

"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
@@ -110,7 +110,7 @@ static int decode_significance_8x8_x86(CABACContext *c,

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"

"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"
@@ -123,18 +123,18 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movl %k6, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
--
1.7.2.1
Benjamin Larsson
2012-03-20 11:59:13 UTC
Permalink
OK
Ronald S. Bultje
2012-03-17 16:34:58 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */

-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -64,9 +64,12 @@
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
- " jnz 1f \n\t"\
+ " jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
@@ -79,7 +82,7 @@
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
- "1: \n\t"
+ "2: \n\t"

#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS)
#define get_cabac_inline get_cabac_inline_x86
@@ -90,10 +93,12 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,

__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
- "%2", "%3", "%b3", "%c6(%5)")
+ "%2", "%3", "%b3",
+ "%c6(%5)", "%c7(%5)")
: "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 131d29d..9fc05f4 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -49,14 +49,16 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c11(%6)")
+ "%5", "%k0", "%b0",
+ "%c11(%6)", "%c12(%6)")

"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c11(%6)")
+ "%5", "%k0", "%b0",
+ "%c11(%6)", "%c12(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -83,7 +85,8 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -106,7 +109,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %9, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c12(%7)")
+ "%5", "%k0", "%b0",
+ "%c12(%7)", "%c13(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -116,7 +120,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %11, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c12(%7)")
+ "%5", "%k0", "%b0",
+ "%c12(%7)", "%c13(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -141,7 +146,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
Benjamin Larsson
2012-03-20 12:02:37 UTC
Permalink
OK
Måns Rullgård
2012-03-20 12:44:41 UTC
Permalink
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -64,9 +64,12 @@
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
- " jnz 1f \n\t"\
+ " jnz 2f \n\t"\
Why do you renumber these? Number labels don't need to be in ascending
order or anything like that.
Post by Ronald S. Bultje
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
Is there no way of doing this with cmov instead of branching?
--
Måns Rullgård
***@mansr.com
Uoti Urpala
2012-03-20 13:38:50 UTC
Permalink
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
Is there no way of doing this with cmov instead of branching?
Branches don't have to be expensive if they're never actually taken and
are predicted correctly. It's not obvious whether cmov would be better.

BTW the code calling this would really benefit from using named asm
arguments.
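A small sketch of the named-operand style being suggested here (toy struct
again, not the actual CABAC code) -- symbolic [names] in place of the
positional %N references:

#include <stddef.h>

struct ctx { int low; };

static inline int load_low(struct ctx *c)
{
    int low;
    __asm__ volatile(
        "movl %c[off](%[ctx]), %[out]   \n\t" /* low = c->low */
        : [out]"=r"(low)
        : [ctx]"r"(c),
          [off]"i"(offsetof(struct ctx, low))
        : "memory");
    return low;
}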
Måns Rullgård
2012-03-20 13:45:42 UTC
Permalink
Post by Uoti Urpala
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
Is there no way of doing this with cmov instead of branching?
Branches don't have to be expensive if they're never actually taken and
are predicted correctly. It's not obvious whether cmov would be better.
Every branch puts extra pressure on the prediction resources, so
avoiding them can be beneficial even if they'd be predicted. If there
is a reasonable branch-free alternative, it's worth testing both.
--
Måns Rullgård
***@mansr.com
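For comparison, one conceivable branch-free form of the bounded pointer
advance, as a sketch only (assumes a CPU with cmov; the thread never
benchmarked or adopted this, and note it uses an unsigned compare via
cmovb where the patch's jge branch is signed):

#include <stdint.h>

static inline const uint8_t *advance_if_in_bounds(const uint8_t *p,
                                                  const uint8_t *end)
{
    const uint8_t *next;
    __asm__ (
        "lea    2(%[p]), %[next]    \n\t" /* next = p + 2          */
        "cmp    %[end], %[p]        \n\t" /* compare p against end */
        "cmovb  %[next], %[p]       \n\t" /* p = next if p < end   */
        : [p]"+r"(p), [next]"=&r"(next)
        : [end]"r"(end)
        : "cc");
    return p;
}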
Ronald S. Bultje
2012-03-20 18:55:20 UTC
Permalink
Hi,
Post by Uoti Urpala
BTW the code calling this would really benefit from using named asm
arguments.
I agree. Patch welcome.

(I have other things to finish first, I'd love to do it but it's not
high on my priority list.)

Ronald
Ronald S. Bultje
2012-03-22 13:53:14 UTC
Permalink
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.

Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.

Ping.

Ronald
Måns Rullgård
2012-03-22 13:55:01 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
The patch certainly is not.
--
Måns Rullgård
***@mansr.com
Diego Biurrun
2012-03-22 14:07:37 UTC
Permalink
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?

Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.

The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.

Diego
Måns Rullgård
2012-03-22 14:12:22 UTC
Permalink
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
--
Måns Rullgård
***@mansr.com
İsmail Dönmez
2012-03-22 14:14:51 UTC
Permalink
Hi;
Post by Ronald S. Bultje
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte, end) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"movzbl "statep" , "ret"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "range" , "tmp"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"and $0xC0 , "range"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
@@ -64,9 +64,12 @@
"shl %%cl , "low"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "tmpbyte" , "statep"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"test "lowword" , "lowword"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
- " jnz 1f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+ " jnz 2f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Why do you renumber these? Number labels don't need to be in
ascending
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
This is getting off topic. Let's concentrate on the patch itself.

Thanks,
ismail
Måns Rullgård
2012-03-22 14:23:12 UTC
Permalink
Post by İsmail Dönmez
Hi;
Post by Ronald S. Bultje
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte, end) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"movzbl "statep" , "ret"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "range" , "tmp"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"and $0xC0 , "range"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
@@ -64,9 +64,12 @@
"shl %%cl , "low"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "tmpbyte" , "statep"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"test "lowword" , "lowword"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
- " jnz 1f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+ " jnz 2f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Why do you renumber these? Number labels don't need to be in
ascending
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
This is getting off topic. Let's concentrate on the patch itself.
This is not off-topic. It is about Ronald going mental because I dared
question the quality of a patch he submitted on behalf of the almighty,
infallible Google.
--
Måns Rullgård
***@mansr.com
Diego Biurrun
2012-03-22 14:32:42 UTC
Permalink
Post by Måns Rullgård
Post by İsmail Dönmez
Post by Ronald S. Bultje
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte, end) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"movzbl "statep" , "ret"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "range" , "tmp"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"and $0xC0 , "range"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
@@ -64,9 +64,12 @@
"shl %%cl , "low"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "tmpbyte" , "statep"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"test "lowword" , "lowword"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
- " jnz 1f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+ " jnz 2f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Why do you renumber these? Number labels don't need to be in
ascending
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
This is getting off topic. Let's concentrate on the patch itself.
This is not off-topic. It is about Ronald going mental because I dared
question the quality of a patch he submitted on behalf of the almighty,
infallible Google.
We will talk about this and other things when all parties have cooled
down, but we will surely not have a trollfest here on this mailing list.

If you guys continue posting into this thread I will ask for the mailing
list to be set on moderation until the flamewar is over and for all flames
to be discarded.

Diego
Kostya Shishkov
2012-03-22 14:05:28 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.
Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.
Can we at least get output from several compilers and compiler versions for
the functions that use it? It's inline assembly, so compiler output for these
may vary greatly, and register allocation may fail for some other GCC version,
for instance. When we have that, we can discuss it further.
Ronald S. Bultje
2012-03-24 19:32:55 UTC
Permalink
Hi,

On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch

Complete disassembly attached in before.txt and after.txt with each of
the above compilers.

Ronald
Kostya Shishkov
2012-03-26 18:54:48 UTC
Permalink
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
Luca Barbato
2012-03-26 20:36:35 UTC
Permalink
Post by Kostya Shishkov
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
I tried to compare it, and it seems that the patch speeds up everything
noticeably on linux/gcc-4.6.2. (I ran it 30 times for each interesting patch
of the set; a few times it got worse, many times it got better. I threw away
the outliers and overall it seems better.)

lu
--
Luca Barbato
Gentoo/linux
http://dev.gentoo.org/~lu_zero
Ronald S. Bultje
2012-03-27 15:08:42 UTC
Permalink
Hi,
Post by Luca Barbato
Post by Kostya Shishkov
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
I tried to compare it and seems that the patch speeds up everything
sensibly on linux/gcc-4.6.2. (ran 30 times for each interesting patch of
the set, few times it got worse many times it got better I thrown away
outliers and seems overall better)
OK, so are there no more objections to the whole patchset then? I'd
like to push this.

Ronald
Måns Rullgård
2012-03-27 15:11:30 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Luca Barbato
Post by Kostya Shishkov
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
I tried to compare it and seems that the patch speeds up everything
sensibly on linux/gcc-4.6.2. (ran 30 times for each interesting patch of
the set, few times it got worse many times it got better I thrown away
outliers and seems overall better)
OK, so are there no more objections to the whole patchset then? I'd
like to push this.
You have not done anything to address the issue of why code that
_should_ be worse is actually compiling better. Allowing such anomalies
to go without investigation is irresponsible at best.

I realise, however, that you don't give a fuck about this and that you
are determined to push this hack of a patch no matter what. Enjoy your
victory.
--
Måns Rullgård
***@mansr.com
Luca Barbato
2012-03-27 17:04:34 UTC
Permalink
Post by Måns Rullgård
You have not done anything to address the issue of why code that
_should_ be worse is actually compiling better. Allowing such anomalies
to go without investigation is irresponsible at best.
I'll try to reduce that code myself soon; lately I have been too busy, and
the best I could do was to test it myself with the timer.
Post by Måns Rullgård
I realise, however, that you don't give a fuck about this and that you
are determined to push this hack of a patch no matter what. Enjoy your
victory.
As I said, had it not performed decently, I would have just had that bit
disabled for the non-working gcc.

Would going this route address your concern until I, Ronald, or whoever else
manages to find the time to investigate the issue in depth?

lu
--
Luca Barbato
Gentoo/linux
http://dev.gentoo.org/~lu_zero
Alexander Strange
2012-03-25 06:26:15 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.
Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.
Ping.
Ronald
Because this only changes the refill operation, I don't think it's
seriously performance-sensitive; that branch doesn't run often. But
how about a benchmark?
Jason Garrett-Glaser
2012-03-25 17:01:49 UTC
Permalink
On Sat, Mar 24, 2012 at 11:26 PM, Alexander Strange
Post by Alexander Strange
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.
Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.
Ping.
Ronald
Because this only changes the refill operation I don't think it's
seriously performance-sensitive; that branch doesn't run often. But
how about a benchmark?
I agree; if we're just adding a check in refill, the cost is
completely negligible.

Jason
Kostya Shishkov
2012-03-17 16:49:23 UTC
Permalink
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/cabac_functions.h | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index b150aab..4c74cf7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -47,7 +47,8 @@ static void refill(CABACContext *c){
c->low+= c->bytestream[0]<<1;
#endif
c->low -= CABAC_MASK;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS / 8;
}
static inline void renorm_cabac_decoder_once(CABACContext *c){
@@ -74,7 +75,8 @@ static void refill2(CABACContext *c){
#endif
c->low += x<<i;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS/8;
}
static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
--
probably OK
Ronald S. Bultje
2012-03-17 18:54:49 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-***@libav.org
---
libavcodec/cabac_functions.h | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index b150aab..4c74cf7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -47,7 +47,8 @@ static void refill(CABACContext *c){
c->low+= c->bytestream[0]<<1;
#endif
c->low -= CABAC_MASK;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS / 8;
}

static inline void renorm_cabac_decoder_once(CABACContext *c){
@@ -74,7 +75,8 @@ static void refill2(CABACContext *c){
#endif

c->low += x<<i;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS/8;
}

static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:50 UTC
Permalink
---
libavcodec/x86/cabac.h | 19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..7d8976c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %4, %k1 \n\t"
- "movl %2, %%eax \n\t"
+ "movl %a5(%2), %k1 \n\t"
+ "movl %a3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
@@ -117,20 +117,23 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
" jnz 1f \n\t"
- "mov %3, %1 \n\t"
+ "mov %a4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
- "mov %1, %3 \n\t"
+ "mov %1, %a4(%2) \n\t"
"1: \n\t"
- "movl %%eax, %2 \n\t"
+ "movl %%eax, %a3(%2) \n\t"

- :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
- :"m"(c->range)
- : "%eax", "%edx"
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
);
return val;
}
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:51 UTC
Permalink
---
libavcodec/x86/cabac.h | 44 ++++++++++++++++++++++----------------------
1 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 7d8976c..b00652b 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,28 +105,28 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %a5(%2), %k1 \n\t"
- "movl %a3(%2), %%eax \n\t"
- "shl $17, %k1 \n\t"
- "add %%eax, %%eax \n\t"
- "sub %k1, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %k1 \n\t"
- "add %k1, %%eax \n\t"
- "xor %%edx, %%ecx \n\t"
- "sub %%edx, %%ecx \n\t"
- "test %%ax, %%ax \n\t"
- " jnz 1f \n\t"
- "mov %a4(%2), %1 \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%1), %%edx \n\t"
- "bswap %%edx \n\t"
- "shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
- "addl %%edx, %%eax \n\t"
- "mov %1, %a4(%2) \n\t"
- "1: \n\t"
- "movl %%eax, %a3(%2) \n\t"
+ "movl %a5(%2), %k1 \n\t"
+ "movl %a3(%2), %%eax \n\t"
+ "shl $17, %k1 \n\t"
+ "add %%eax, %%eax \n\t"
+ "sub %k1, %%eax \n\t"
+ "cltd \n\t"
+ "and %%edx, %k1 \n\t"
+ "add %k1, %%eax \n\t"
+ "xor %%edx, %%ecx \n\t"
+ "sub %%edx, %%ecx \n\t"
+ "test %%ax, %%ax \n\t"
+ "jnz 1f \n\t"
+ "mov %a4(%2), %1 \n\t"
+ "subl $0xFFFF, %%eax \n\t"
+ "movzwl (%1), %%edx \n\t"
+ "bswap %%edx \n\t"
+ "shrl $15, %%edx \n\t"
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %a4(%2) \n\t"
+ "1: \n\t"
+ "movl %%eax, %a3(%2) \n\t"

: "+c"(val), "=&r"(tmp)
: "r"(c),
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:52 UTC
Permalink
---
libavcodec/x86/cabac.h | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index b00652b..adf4fc3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,7 +105,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %a5(%2), %k1 \n\t"
+ "movl %a6(%2), %k1 \n\t"
"movl %a3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
@@ -122,9 +122,10 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
- "mov %1, %a4(%2) \n\t"
+ "cmp %a5(%2), %1 \n\t"
+ "jge 1f \n\t"
+ "add"OPSIZE" $2, %a4(%2) \n\t"
"1: \n\t"
"movl %%eax, %a3(%2) \n\t"

@@ -132,6 +133,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
--
1.7.2.1
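[Editorial note: in C terms, the tail of get_cabac_bypass_sign_x86() now behaves roughly like the sketch below: the two freshly loaded bytes are still folded into low, but the bytestream pointer only moves forward while it is short of bytestream_end (the new cmp/jge pair guarding the add"OPSIZE" $2). Field names follow CABACContext; the surrounding sign/range arithmetic is omitted, so this is a rough rendering, not the real code.]

#include <stdint.h>

typedef struct {
    int            low;
    const uint8_t *bytestream;
    const uint8_t *bytestream_end;
} MiniCABAC;                                   /* stand-in for CABACContext */

static void bypass_refill_clamped(MiniCABAC *c)
{
    if (!(uint16_t)c->low) {                   /* test %%ax, %%ax        */
        c->low -= 0xFFFF;                      /* subl $0xFFFF, %%eax    */
        /* movzwl + bswap + shrl $15: big-endian 16-bit read, shifted by 1 */
        c->low += (c->bytestream[0] << 9) | (c->bytestream[1] << 1);
        if (c->bytestream < c->bytestream_end) /* the new cmp/jge guard  */
            c->bytestream += 2;                /* add"OPSIZE" $2         */
    }
}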
Ronald S. Bultje
2012-03-17 18:54:53 UTC
Permalink
---
libavcodec/x86/cabac.h | 10 +++++-----
libavcodec/x86/h264_i386.h | 32 +++++++++++++++++---------------
2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index adf4fc3..e03a4de 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -89,11 +89,11 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
int bit, tmp;

__asm__ volatile(
- BRANCHLESS_GET_CABAC("%0", "(%5)", "%1", "%w1", "%2",
- "%3", "%b3", "%4")
- :"=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp),
- "+m"(c->bytestream)
- :"r"(state)
+ BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
+ "%2", "%3", "%b3", "%a6(%5)")
+ : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "r"(state), "r"(c),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index e195e04..2cfcbdd 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -48,15 +48,15 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
__asm__ volatile(
"2: \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a11(%6)")

"test $1, %4 \n\t"
" jz 3f \n\t"
"add %10, %1 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a11(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -80,10 +80,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"4: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
- "+&r"(c->low), "=&r"(bit), "+&r"(c->range),
- "+m"(c->bytestream)
- :"m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
+ : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
+ "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
+ : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -105,8 +105,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a12(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -115,8 +115,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a12(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -137,9 +137,11 @@ static int decode_significance_8x8_x86(CABACContext *c,
"4: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count),"+m"(last), "+m"(index), "+&r"(c->low), "=&r"(bit),
- "+&r"(c->range), "=&r"(state), "+m"(c->bytestream)
- :"m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_coeff_ctx_base)
+ : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ "=&r"(bit), "+&r"(c->range), "=&r"(state)
+ : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
+ "m"(sig_off), "m"(last_coeff_ctx_base),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
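[Editorial note: with this change BRANCHLESS_GET_CABAC no longer receives c->bytestream as its own "+m" operand; the caller passes a ready-made addressing string such as "%a6(%5)", built from the context pointer and an offsetof() constant, and the macro pastes it verbatim into its template. A toy example of that convention follows; LOAD32, Demo and load_via_macro are invented for illustration.]

#include <stddef.h>

/* The memory operand arrives as a pre-formatted string ("%a2(%1)" below)
 * and is pasted verbatim into the template, the way BRANCHLESS_GET_CABAC
 * now treats its "byte" argument. */
#define LOAD32(dst, src_expr) \
    "movl " src_expr ", " dst " \n\t"

typedef struct { int pad, value; } Demo;        /* illustrative only */

static inline int load_via_macro(const Demo *d)
{
    int v;
    __asm__ volatile(
        LOAD32("%0", "%a2(%1)")
        : "=r"(v)
        : "r"(d), "i"(offsetof(Demo, value))
        : "memory");
    return v;
}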
Ronald S. Bultje
2012-03-17 18:54:54 UTC
Permalink
---
libavcodec/x86/cabac.h | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index e03a4de..ca8a1d5 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -27,7 +27,7 @@
#include "config.h"

#if HAVE_FAST_CMOV
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
@@ -37,7 +37,7 @@
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
@@ -57,7 +57,7 @@
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
- BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp) \
+ BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp) \
"movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:55 UTC
Permalink
---
libavcodec/x86/h264_i386.h | 24 ++++++++++++------------
1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 2cfcbdd..31ddaf6 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -46,13 +46,13 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
int bit;
x86_reg coeff_count;
__asm__ volatile(
- "2: \n\t"
+ "3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
"%5", "%k0", "%b0", "%a11(%6)")

"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
@@ -65,19 +65,19 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"movl %%ecx, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
@@ -99,7 +99,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
x86_reg state;
__asm__ volatile(
"mov %1, %6 \n\t"
- "2: \n\t"
+ "3: \n\t"

"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
@@ -110,7 +110,7 @@ static int decode_significance_8x8_x86(CABACContext *c,

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"

"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"
@@ -123,18 +123,18 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movl %k6, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
--
1.7.2.1
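[Editorial note: the renumbering above looks cosmetic but appears to prepare for the next patch. GNU as numeric labels may be defined any number of times, and a reference such as "2b" binds to the nearest preceding "2:"; once BRANCHLESS_GET_CABAC starts emitting its own "1:"/"2:" labels, a caller whose loop head is also "2:" would see its "jb 2b" captured by the label inside the expanded macro, so the caller's labels move to 3/4/5. A tiny standalone illustration of the nearest-definition rule, not lavc code:]

/* "3b" binds to the nearest preceding "3:"; if an expanded macro between
 * the loop head and the branch defined its own "3:", the branch would
 * silently retarget there -- which is what the renumbering avoids. */
static inline int spin_down(int n)
{
    __asm__ volatile(
        "3:              \n\t"   /* loop head (the caller-side label) */
        "sub $1, %0      \n\t"
        "jnz 3b          \n\t"   /* backward reference to the loop head */
        : "+r"(n));
    return n;
}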
Ronald S. Bultje
2012-03-17 18:54:56 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index ca8a1d5..a6ec228 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */

-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -64,9 +64,12 @@
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
- " jnz 1f \n\t"\
+ " jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
@@ -79,7 +82,7 @@
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
- "1: \n\t"
+ "2: \n\t"

#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS)
#define get_cabac_inline get_cabac_inline_x86
@@ -90,10 +93,12 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,

__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
- "%2", "%3", "%b3", "%a6(%5)")
+ "%2", "%3", "%b3",
+ "%a6(%5)", "%a7(%5)")
: "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 31ddaf6..e849a3d 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -49,14 +49,16 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a11(%6)")
+ "%5", "%k0", "%b0",
+ "%a11(%6)", "%a12(%6)")

"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a11(%6)")
+ "%5", "%k0", "%b0",
+ "%a11(%6)", "%a12(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -83,7 +85,8 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -106,7 +109,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %9, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a12(%7)")
+ "%5", "%k0", "%b0",
+ "%a12(%7)", "%a13(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -116,7 +120,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %11, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a12(%7)")
+ "%5", "%k0", "%b0",
+ "%a12(%7)", "%a13(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -141,7 +146,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
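[Editorial note: functionally this is the same clamp as in the C reader from patch 1, now applied inside the shared asm refill path: when the low word runs empty the macro still reads and merges two bytes, but the cmp/jge pair keeps c->bytestream from advancing past c->bytestream_end. A rough, self-contained C rendering of the changed part of the macro, with the renormalisation shift math elided:]

#include <stdint.h>

typedef struct {
    const uint8_t *bytestream;
    const uint8_t *bytestream_end;
} MiniCABAC;                                   /* stand-in for CABACContext */

/* What the changed part of BRANCHLESS_GET_CABAC does around the refill. */
static const uint8_t *refill_ptr_clamped(MiniCABAC *c)
{
    const uint8_t *p = c->bytestream;          /* mov  byte, %%REG_c          */
    if (p < c->bytestream_end)                 /* cmp  end,  %%REG_c / jge 1f */
        c->bytestream = p + 2;                 /* add"OPSIZE" $2, byte        */
    return p;                                  /* the bytes at p are still
                                                  merged into "low" afterwards */
}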