Discussion:
[libav-devel] [PATCH 1/8] h264: fix overreads in cabac reader.
Ronald S. Bultje
2012-03-17 16:34:51 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-***@libav.org
---
libavcodec/cabac_functions.h | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index b150aab..4c74cf7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -47,7 +47,8 @@ static void refill(CABACContext *c){
         c->low+= c->bytestream[0]<<1;
 #endif
     c->low -= CABAC_MASK;
-    c->bytestream+= CABAC_BITS/8;
+    if (c->bytestream < c->bytestream_end)
+        c->bytestream += CABAC_BITS / 8;
 }
 
 static inline void renorm_cabac_decoder_once(CABACContext *c){
@@ -74,7 +75,8 @@ static void refill2(CABACContext *c){
 #endif
 
     c->low += x<<i;
-    c->bytestream+= CABAC_BITS/8;
+    if (c->bytestream < c->bytestream_end)
+        c->bytestream += CABAC_BITS/8;
 }
 
 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
--
1.7.2.1
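For readers skimming the patch, a minimal, self-contained sketch of the pattern these hunks introduce (simplified struct and constants, not the actual libavcodec code): the bytes are still read on every refill, but the pointer only advances while it is below bytestream_end, so a decoder that keeps asking for bits after the input is exhausted stays pinned near the end of the buffer instead of walking arbitrarily far past it.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for CABACContext: only the fields the guard touches. */
typedef struct {
    const uint8_t *bytestream;
    const uint8_t *bytestream_end;
    unsigned       low;
} MiniCabac;

/* Guarded refill in the spirit of the patch above. */
static void refill_guarded(MiniCabac *c)
{
    c->low += (c->bytestream[0] << 9) + (c->bytestream[1] << 1);
    if (c->bytestream < c->bytestream_end)
        c->bytestream += 2;        /* CABAC_BITS / 8 with CABAC_BITS == 16 */
}

int main(void)
{
    /* bytestream_end is set two bytes before the physical end, playing the
     * role of input padding, so the clamped reads stay inside the array. */
    static const uint8_t buf[4] = { 0x12, 0x34, 0x56, 0x78 };
    MiniCabac c = { buf, buf + sizeof(buf) - 2, 0 };

    for (int i = 0; i < 8; i++)    /* many more refills than the data supports */
        refill_guarded(&c);
    printf("pointer offset after 8 refills: %td\n", c.bytestream - buf);
    return 0;
}

In the real decoder the input buffer is expected to carry FF_INPUT_BUFFER_PADDING_SIZE bytes of padding after its end, which plays the role of the two spare bytes in this toy example.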
Ronald S. Bultje
2012-03-17 16:34:52 UTC
Permalink
---
libavcodec/x86/cabac.h | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..c4832c3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
     x86_reg tmp;
     __asm__ volatile(
-        "movl %4, %k1                           \n\t"
-        "movl %2, %%eax                         \n\t"
+        "movl %c5(%2), %k1                      \n\t"
+        "movl %c3(%2), %%eax                    \n\t"
         "shl $17, %k1                           \n\t"
         "add %%eax, %%eax                       \n\t"
         "sub %k1, %%eax                         \n\t"
@@ -117,7 +117,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "sub %%edx, %%ecx                       \n\t"
         "test %%ax, %%ax                        \n\t"
         " jnz 1f                                \n\t"
-        "mov  %3, %1                            \n\t"
+        "mov  %c4(%2), %1                       \n\t"
         "subl $0xFFFF, %%eax                    \n\t"
         "movzwl (%1), %%edx                     \n\t"
         "bswap %%edx                            \n\t"
@@ -126,11 +126,14 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "addl %%edx, %%eax                      \n\t"
         "mov  %1, %3                            \n\t"
         "1:                                     \n\t"
-        "movl %%eax, %2                         \n\t"
+        "movl %%eax, %c4(%2)                    \n\t"
 
-        :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
-        :"m"(c->range)
-        : "%eax", "%edx"
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
     );
     return val;
 }
--
1.7.2.1
Måns Rullgård
2012-03-17 16:54:50 UTC
Permalink
Post by Ronald S. Bultje
---
libavcodec/x86/cabac.h | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..c4832c3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %4, %k1 \n\t"
- "movl %2, %%eax \n\t"
+ "movl %c5(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
%c5? Last I checked, the code to get a plain number was 'a'.

[...]
Post by Ronald S. Bultje
- :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
- :"m"(c->range)
- : "%eax", "%edx"
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
We changed this to use "m" operands to avoid the memory clobber. I know
why you're doing this, but I think it's the wrong approach.
--
Måns Rullgård
***@mansr.com
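For readers less familiar with gcc constraint syntax, here is a minimal side-by-side of the two styles the review is contrasting, written against a made-up two-field struct rather than the real CABACContext: each touched field as an explicit "+m" operand, versus a base register plus "i"(offsetof(...)) displacements and a blanket "memory" clobber, which is what the patch switches to.

#include <stddef.h>

typedef struct { int low; int range; } Ctx;   /* made-up mini context */

/* Style 1: the touched field is an explicit "+m" operand.  The compiler
 * knows exactly which memory the asm reads and writes, and may keep any
 * other value cached in a register across the statement. */
static inline void inc_low_m(Ctx *c)
{
    __asm__ ("addl $1, %0" : "+m"(c->low));
}

/* Style 2: only the base pointer is passed in; the field is reached through
 * an immediate offsetof() displacement printed with the %c modifier, and a
 * "memory" clobber tells the compiler that any memory may have changed. */
static inline void inc_low_offset(Ctx *c)
{
    __asm__ volatile ("addl $1, %c1(%0)"
                      :
                      : "r"(c), "i"(offsetof(Ctx, low))
                      : "memory");
}

The trade-off being argued: the first style gives the optimizer precise information but needs one operand per touched field, which is where the register-allocation trouble discussed later in the thread comes from; the second needs only a single register for the whole struct, but the "memory" clobber forces the compiler to assume everything in memory is dirty after the asm.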
Ronald S. Bultje
2012-03-17 18:57:34 UTC
Permalink
Hi,
[...]
+        "movl %c5(%2), %k1                      \n\t"
+        "movl %c3(%2), %%eax                    \n\t"
%c5?  Last I checked, the code to get a plain number was 'a'.
Fixed.
-        :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
-        :"m"(c->range)
-        : "%eax", "%edx"
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
We changed this to use "m" operands to avoid the memory clobber.  I know
why you're doing this, but I think it's the wrong approach.
It generates better code (fewer instructions for e.g.
decode_cabac_mb_mvd()) with gcc-4.2.1 (which is shipped with Xcode).
Does it generate worse code anywhere? (It's true that later on it adds
instructions for the overread protection again, but this commit in
isolation makes things better, not worse.)

Ronald
Ronald S. Bultje
2012-03-18 14:55:22 UTC
Permalink
Hi,
Post by Ronald S. Bultje
[...]
Ping, are there any practical concerns left? I want this patchset in,
it's our last big class of overread bugs (in addition to all decoders
that we haven't converted from bytestream to bytestream2 yet).

Ronald
Ronald S. Bultje
2012-03-19 13:40:46 UTC
Permalink
Hi,
Post by Ronald S. Bultje
[...]
Ping2.

Ronald
Måns Rullgård
2012-03-19 13:42:49 UTC
Permalink
Post by Ronald S. Bultje
[...]
Ping, are there any practical concerns left?
We're still no closer to understanding what really is going on here.
Post by Ronald S. Bultje
I want this patchset in,
That's not a valid argument.
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-19 13:51:04 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
Ping, are there any practical concerns left?
We're still no closer to understanding what really is going on here.
You'll have to be more practical than "I don't get it, so let's do
nothing". Do something to understand it. This patchset improves things
on my end (better code, compiler doesn't bomb out on adding an extra
argument such as bytestream_end), which is more than sufficient.

Ronald
Måns Rullgård
2012-03-19 13:54:21 UTC
Permalink
Post by Ronald S. Bultje
[...]
You'll have to be more practical than "I don't get it, so let's do
nothing". Do something to understand it. This patchset improves things
on my end (better code, compiler doesn't bomb out on adding an extra
argument such as bytestream_end), which is more than sufficient.
Your compiler seems to be the only one where it gives better code.
There is no guarantee that your compiler will keep doing this next time
you upgrade it. Since I can't reproduce the problem, I'm not in a very
good position to figure out why it happens. You can, and you're the one
pushing for these patches, so the work falls to you. Tough luck.
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-19 14:16:05 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
Your compiler seems to be the only one where it gives better code.
There is no guarantee that your compiler will keep doing this next time
you upgrade it.  Since I can't reproduce the problem, I'm not in a very
good position to figure out why it happens.
My compiler has been like that for years.
Post by Måns Rullgård
 You can, and you're the one
pushing for these patches, so the work falls to you.  Tough luck.
You're not very clear on what you want. You want the holy grail? You
want a time machine? You want a better pension? What falls on me? I've
written code that is (if I understand you correctly) the same for you,
and better for me. That's fantastic! So does that mean we agree I can
commit it? If not, what exactly is your problem with this code?

Ronald
Måns Rullgård
2012-03-19 15:10:05 UTC
Permalink
Post by Ronald S. Bultje
[...]
You're not very clear on what you want. You want the holy grail? You
want a time machine? You want a better pension? What falls on me? I've
written code that is (if I understand you correctly) the same for you,
and better for me. That's fantastic! So does that mean we agree I can
commit it? If not, what exactly is your problem with this code?
You've made changes that have very unexpected results. This is never a
good thing unless the reasons are understood.
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-19 15:34:32 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
You've made changes that have very unexpected results.  This is never a
good thing unless the reasons are understood.
Yes: the compiler screwed up, and I fixed it.

Now, this isn't going anywhere. What are you looking for? I need a
concrete thing that you intend me to do, else I'll simply have to
commit as-is.

Ronald
Måns Rullgård
2012-03-19 15:42:27 UTC
Permalink
Post by Ronald S. Bultje
[...]
Yes: the compiler screwed up, and I fixed it.
No, you did not fix it. You randomly hacked around until it by chance
did what you wanted.
Post by Ronald S. Bultje
Now, this isn't going anywhere. What are you looking for? I need a
concrete thing that you intend me to do,
I want to understand what is causing the compiler to screw up in the
first place. If we figure that out, we might find a clean solution.
Usually the first step is to reduce the problem to a smaller test case.
The function where this is happening isn't very large, so this should be
fairly easy.
Post by Ronald S. Bultje
else I'll simply have to commit as-is.
I don't like such threats.
--
Måns Rullgård
***@mansr.com
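As a sketch of the reduction being asked for here (file name, struct layout and the driver below are hypothetical, chosen only so the file compiles on its own): pull one inline-asm helper into a tiny translation unit together with a small caller, build it once per constraint style with -S, and diff the assembly generated for the caller.

/* cabac_reduce.c -- hypothetical reduced test case, not from the thread.
 * Example build for the 32-bit case the thread is about:
 *   gcc -m32 -O2 -S cabac_reduce.c -o offsetof_variant.s
 * then swap in a "+m"-operand version of the helper and diff the .s files. */
#include <stddef.h>
#include <stdint.h>

typedef struct {
    int low;
    int range;
    const uint8_t *bytestream;
} MiniCABACContext;                 /* stand-in for the real CABACContext */

/* offsetof/"memory"-clobber style helper, trimmed down to one load. */
static inline int load_low(MiniCABACContext *c)
{
    int v;
    __asm__ volatile ("movl %c2(%1), %0"
                      : "=r"(v)
                      : "r"(c), "i"(offsetof(MiniCABACContext, low))
                      : "memory");
    return v;
}

/* Small caller standing in for decode_cabac_mb_mvd(): just enough
 * surrounding code that the register allocator has work to do. */
int drive(MiniCABACContext *c, int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
        sum += load_low(c) + i;
    return sum;
}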
Ronald S. Bultje
2012-03-19 15:46:54 UTC
Permalink
Hi,
Post by Måns Rullgård
[...]
I want to understand what is causing the compiler to screw up in the
first place.  If we figure that out, we might find a clean solution.
Usually the first step is to reduce the problem to a smaller test case.
The function where this is happening isn't very large, so this should be
fairly easy.
I'm waiting. This is not concrete. What is your plan? You're currently
holding this up. I need an ETA.

Ronald
Jason Garrett-Glaser
2012-03-19 16:48:52 UTC
Permalink
[...]
-        :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
-        :"m"(c->range)
-        : "%eax", "%edx"
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
IMO clobbering memory looks very very hacky, and I don't like it. If
you need to clobber something, it'd be much better if we could clobber
exactly what needs clobbering, and nothing more.

Jason
Ronald S. Bultje
2012-03-19 17:14:31 UTC
Permalink
Hi,
[...]
IMO clobbering memory looks very very hacky, and I don't like it.  If
you need to clobber something, it'd be much better if we could clobber
exactly what needs clobbering, and nothing more.
Well, I don't think inline assembly supports explicitly clobbering
variables without marking them as "+m" or "+r", which messes up the
register allocator for at least gcc-4.2.1 (it uses a different
register for each "m"(c->...), thus running out of registers; yes,
there are many things wrong there).

Ronald
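To make the failure mode described above concrete, the constraint style in question looks roughly like this (hypothetical trimmed-down struct; the asm body is left empty because only the operand list matters). The report is that gcc 4.2.1 and clang 2.9 materialize a separate address register for every "+m"(c->field) operand instead of reusing one base register, so on 32-bit x86 a handful of such operands plus the asm's real register operands and clobbers exhausts the eight GPRs.

#include <stdint.h>

typedef struct {                        /* hypothetical mini CABACContext */
    int low, range, outstanding;
    const uint8_t *bytestream, *bytestream_end;
} MiniCtx;

/* One "+m" operand per touched field: precise, but per the report above the
 * older compilers burn an address register for each operand. */
static inline void touch_fields(MiniCtx *c)
{
    __asm__ volatile ("" /* body omitted; the constraints are the point */
                      : "+m"(c->low), "+m"(c->range), "+m"(c->outstanding),
                        "+m"(c->bytestream), "+m"(c->bytestream_end)
                      :
                      : "%eax", "%edx", "%ecx");
}

Whether that behaviour is simply a compiler bug to be reported, or something the libavcodec code should work around, is what the rest of the thread argues about.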
Jason Garrett-Glaser
2012-03-19 19:03:54 UTC
Permalink
Post by Ronald S. Bultje
[...]
You can clobber a memory location without referencing it in the asm,
and thus without allocating a register for it.

Jason
Ronald S. Bultje
2012-03-19 19:25:11 UTC
Permalink
Hi,
Post by Jason Garrett-Glaser
[...]
You can clobber a memory location without referencing it in the asm,
and thus without allocating a register for it.
That sounds useful, how do I do that?

Ronald
Jason Garrett-Glaser
2012-03-19 20:12:09 UTC
Permalink
Post by Ronald S. Bultje
[...]
That sounds useful, how do I do that?
Just add +m arguments and don't use them, that's all.

Here's an example from an unfinished patch of mine:

+static ALWAYS_INLINE void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
+{
+    asm(
+        "call %P8\n"
+        :"+S"(i_ctx),"+d"(b), "+D"(cb->i_range), "+m"(cb->i_low), "+m"(cb->i_queue), "+m"(cb->i_bytes_outstanding), "+m"(cb->p)
+        :"a"(cb),"X"(x264_cabac_encode_decision_asm)
+        :"%ecx"
+    );
+}

Jason
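A stand-alone restatement of the trick Jason is pointing at, separate from the x264 snippet above (struct and field names here are hypothetical): the extra "+m" operands are never mentioned in the template, they exist only so the compiler knows those specific fields may change, which is the narrower alternative to a blanket "memory" clobber.

typedef struct { int low; int bytes_outstanding; } Cb;   /* hypothetical */

static inline void clear_low(Cb *cb)
{
    /* The template only uses %2 (the pointer register); operands %0 and %1
     * are unused "+m" outputs that merely declare which fields may have
     * been modified by the asm. */
    __asm__ volatile ("movl $0, (%2)"
                      : "+m"(cb->low), "+m"(cb->bytes_outstanding)
                      : "r"(cb));
}

Ronald's follow-up below explains why this is exactly the style that was tripping up gcc 4.2.1's register allocator in the CABAC code.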
Ronald S. Bultje
2012-03-19 20:43:21 UTC
Permalink
Hi,
Post by Jason Garrett-Glaser
[...]
Just add +m arguments and don't use them, that's all.
[...]
That's how we got here in the first place. gcc-4.2.1 and clang-2.9
allocate a register for "+m"(struct->val) pairs, causing the compiler
to run out of registers. It simply won't compile, as silly as that
sounds.

Ronald
Jason Garrett-Glaser
2012-03-19 20:51:25 UTC
Permalink
Post by Ronald S. Bultje
[...]
That's how we got here in the first place. gcc-4.2.1 and clang-2.9
allocate a register for "+m"(struct->val) pairs, causing the compiler
to run out of registers. It simply won't compile, as silly as that
sounds.
That's a compiler bug, make them fix it.

Jason
Alex Converse
2012-03-19 23:08:54 UTC
Permalink
Post by Jason Garrett-Glaser
[...]
That's a compiler bug, make them fix it.
Måns Rullgård
2012-03-19 23:43:25 UTC
Permalink
Post by Ronald S. Bultje
Post by Ronald S. Bultje
Hi,
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
On Mon, Mar 19, 2012 at 10:14 AM, Ronald S. Bultje <
On Mon, Mar 19, 2012 at 9:48 AM, Jason Garrett-Glaser <
On Sat, Mar 17, 2012 at 9:34 AM, Ronald S. Bultje <
Post by Ronald S. Bultje
---
libavcodec/x86/cabac.h | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..c4832c3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int
get_cabac_bypass_sign_x86(CABACContext *c, int val)
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
Post by Ronald S. Bultje
{
x86_reg tmp;
__asm__ volatile(
- "movl %4, %k1 \n\t"
- "movl %2, %%eax \n\t"
+ "movl %c5(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
@@ -117,7 +117,7 @@ static av_always_inline int
get_cabac_bypass_sign_x86(CABACContext *c, int val)
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
Post by Ronald S. Bultje
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
" jnz 1f \n\t"
- "mov %3, %1 \n\t"
+ "mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
@@ -126,11 +126,14 @@ static av_always_inline int
get_cabac_bypass_sign_x86(CABACContext *c, int val)
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
Post by Ronald S. Bultje
"addl %%edx, %%eax \n\t"
"mov %1, %3 \n\t"
"1: \n\t"
- "movl %%eax, %2 \n\t"
+ "movl %%eax, %c4(%2) \n\t"
- :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
- :"m"(c->range)
- : "%eax", "%edx"
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
IMO clobbering memory looks very very hacky, and I don't like it.
If
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
you need to clobber something, it'd be much better if we could
clobber
Post by Ronald S. Bultje
Post by Jason Garrett-Glaser
Post by Ronald S. Bultje
exactly what needs clobbering, and nothing more.
Well, I don't think inline assembly supports explicitly clobbering
variables without marking them as "+m" or "+r", which messes up the
register allocator for at least gcc-4.2.1 (it uses a different
register for each "m"(c->...), thus running out of registers; yes,
there are many things wrong there).
You can clobber a memory location without referencing it in the asm,
and thus without allocating a register for it.
That sounds useful, how do I do that?
Just add +m arguments and don't use them, that's all.
+static ALWAYS_INLINE void x264_cabac_encode_decision( x264_cabac_t
*cb, int i_ctx, int b )
+{
+ asm(
+ "call %P8\n"
+ :"+S"(i_ctx),"+d"(b), "+D"(cb->i_range), "+m"(cb->i_low),
"+m"(cb->i_queue), "+m"(cb->i_bytes_outstanding), "+m"(cb->p)
+ :"a"(cb),"X"(x264_cabac_encode_decision_asm)
+ :"%ecx"
+ );
+}
That's how we got here in the first place. gcc-4.2.1 and clang-2.9
allocate a register for "+m"(struct->val) pairs, causing the compiler
to run out of registers. It simply won't compile, as silly as that
sounds.
That's a compiler bug, make them fix it.
Does it do that even if the operand is never used explicitly?
Luca Barbato
2012-03-20 00:37:27 UTC
Permalink
On 19/03/12 16:43, Måns Rullgård wrote:
Gil Pedersen
2012-03-20 08:33:21 UTC
Permalink
Martin Storsjö
2012-03-20 09:42:05 UTC
Permalink
Ronald S. Bultje
2012-03-22 03:25:06 UTC
Permalink
Hi,
Måns Rullgård
2012-03-22 11:17:11 UTC
Permalink
Hi,
Diego Biurrun
2012-03-22 11:37:30 UTC
Permalink
[...]
Cut it out already, you two!

Ronald: Don't respond.

Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.

Diego
Måns Rullgård
2012-03-22 11:39:42 UTC
Permalink
Post by Diego Biurrun
[...]
Cut it out already, you two!
Ronald: Don't respond.
Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.
That's not how it works.
--
Måns Rullgård
***@mansr.com
Diego Biurrun
2012-03-22 11:44:12 UTC
Permalink
Post by Måns Rullgård
Post by Diego Biurrun
[...]
Cut it out already, you two!
Ronald: Don't respond.
Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.
That's not how it works.
Blanket statements will not help us get anywhere at this point.

You told me you would debug it yourself if only you could reproduce the
problem. Would you or would you not? If yes, who can give you a shell,
if no, what specific steps do you suggest instead?

Diego
Måns Rullgård
2012-03-22 11:55:55 UTC
Permalink
Post by Diego Biurrun
Post by Måns Rullgård
Post by Diego Biurrun
Mans: If you wish to help debugging this problem properly as you told me,
get a shell on an affected system.
That's not how it works.
Blanket statements will not help us get anywhere at this point.
Ronald wants the patch applied. Thus it is his responsibility to prove
it correct. That's how it works.
Post by Diego Biurrun
You told me you would debug it yourself if only you could reproduce the
problem. Would you or would you not? If yes, who can give you a shell,
if no, what specific steps do you suggest instead?
The problem is the function decode_cabac_mb_mvd() from h264_cabac.c.
The first thing to do is to isolate just this function and see if the
problem remains. If it does, trim the function down to the bare minimum
that still exhibits the problem. It's about 30 lines to begin with, so
it really isn't that much work. I already tried to explain this once,
only to be told to "fuck off."
--
Måns Rullgård
***@mansr.com
Ronald S. Bultje
2012-03-17 16:34:53 UTC
Permalink
---
libavcodec/x86/cabac.h | 44 ++++++++++++++++++++++----------------------
1 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index c4832c3..6809309 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,28 +105,28 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %c5(%2), %k1 \n\t"
- "movl %c3(%2), %%eax \n\t"
- "shl $17, %k1 \n\t"
- "add %%eax, %%eax \n\t"
- "sub %k1, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %k1 \n\t"
- "add %k1, %%eax \n\t"
- "xor %%edx, %%ecx \n\t"
- "sub %%edx, %%ecx \n\t"
- "test %%ax, %%ax \n\t"
- " jnz 1f \n\t"
- "mov %c4(%2), %1 \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%1), %%edx \n\t"
- "bswap %%edx \n\t"
- "shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
- "addl %%edx, %%eax \n\t"
- "mov %1, %3 \n\t"
- "1: \n\t"
- "movl %%eax, %c4(%2) \n\t"
+ "movl %c5(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
+ "shl $17, %k1 \n\t"
+ "add %%eax, %%eax \n\t"
+ "sub %k1, %%eax \n\t"
+ "cltd \n\t"
+ "and %%edx, %k1 \n\t"
+ "add %k1, %%eax \n\t"
+ "xor %%edx, %%ecx \n\t"
+ "sub %%edx, %%ecx \n\t"
+ "test %%ax, %%ax \n\t"
+ "jnz 1f \n\t"
+ "mov %c4(%2), %1 \n\t"
+ "subl $0xFFFF, %%eax \n\t"
+ "movzwl (%1), %%edx \n\t"
+ "bswap %%edx \n\t"
+ "shrl $15, %%edx \n\t"
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %c4(%2) \n\t"
+ "1: \n\t"
+ "movl %%eax, %c3(%2) \n\t"

: "+c"(val), "=&r"(tmp)
: "r"(c),
--
1.7.2.1
Ronald S. Bultje
2012-03-17 16:34:54 UTC
Permalink
---
libavcodec/x86/cabac.h | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 6809309..acf1c46 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,7 +105,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %c5(%2), %k1 \n\t"
+ "movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
@@ -122,9 +122,10 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
- "mov %1, %c4(%2) \n\t"
+ "cmp %c5(%2), %1 \n\t"
+ "jge 1f \n\t"
+ "add"OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"

@@ -132,6 +133,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
--
1.7.2.1
Ronald S. Bultje
2012-03-17 16:34:55 UTC
Permalink
---
libavcodec/x86/cabac.h | 10 +++++-----
libavcodec/x86/h264_i386.h | 32 +++++++++++++++++---------------
2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index acf1c46..525ace6 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -89,11 +89,11 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
int bit, tmp;

__asm__ volatile(
- BRANCHLESS_GET_CABAC("%0", "(%5)", "%1", "%w1", "%2",
- "%3", "%b3", "%4")
- :"=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp),
- "+m"(c->bytestream)
- :"r"(state)
+ BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
+ "%2", "%3", "%b3", "%c6(%5)")
+ : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "r"(state), "r"(c),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index e195e04..86066db 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -48,15 +48,15 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
__asm__ volatile(
"2: \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c11(%6)")

"test $1, %4 \n\t"
" jz 3f \n\t"
"add %10, %1 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c11(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -80,10 +80,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"4: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
- "+&r"(c->low), "=&r"(bit), "+&r"(c->range),
- "+m"(c->bytestream)
- :"m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
+ : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
+ "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
+ : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -105,8 +105,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c12(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -115,8 +115,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%c12(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -137,9 +137,11 @@ static int decode_significance_8x8_x86(CABACContext *c,
"4: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count),"+m"(last), "+m"(index), "+&r"(c->low), "=&r"(bit),
- "+&r"(c->range), "=&r"(state), "+m"(c->bytestream)
- :"m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_coeff_ctx_base)
+ : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ "=&r"(bit), "+&r"(c->range), "=&r"(state)
+ : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
+ "m"(sig_off), "m"(last_coeff_ctx_base),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
Ronald S. Bultje
2012-03-19 14:18:56 UTC
Permalink
Hi,
---
 libavcodec/x86/cabac.h     |   10 +++++-----
 libavcodec/x86/h264_i386.h |   32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 20 deletions(-)
Ping for these 4 patches also (5/8-8/8), they touch a different part
of the code and can thus be done separately.

Ronald
Alexander Strange
2012-03-25 06:17:19 UTC
Permalink
Post by Ronald S. Bultje
Hi,
---
 libavcodec/x86/cabac.h     |   10 +++++-----
 libavcodec/x86/h264_i386.h |   32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 20 deletions(-)
Ping for these 4 patches also (5/8-8/8), they touch a different part
of the code and can thus be done separately.
Ronald
What is this for? I can see it making code density a bit worse
(compiler can't keep c->bytestream in a register between asm
statements).
Ronald S. Bultje
2012-03-25 14:54:16 UTC
Permalink
Hi,

On Sat, Mar 24, 2012 at 11:17 PM, Alexander Strange
Post by Alexander Strange
Post by Ronald S. Bultje
---
 libavcodec/x86/cabac.h     |   10 +++++-----
 libavcodec/x86/h264_i386.h |   32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 20 deletions(-)
Ping for these 4 patches also (5/8-8/8), they touch a different part
of the code and can thus be done separately.
What is this for? I can see it making code density a bit worse
(compiler can't keep c->bytestream in a register between asm
statements).
Without this patch, some compilers (see other email) mess up register
allocation, and adding another "+m"(..) makes them run out of registers
and fail to compile. Yes, it's a compiler bug, but it's widespread enough
that a workaround is warranted, especially since it also helps
"bug-free" compilers generate better code (again, see other patch).

Ronald
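For reference, a rough standalone sketch of the addressing style the patches
switch to -- a single "r" operand for the context pointer plus "i" operands
carrying offsetof() values, with fields accessed as %cN(ptr) inside the
template -- could look like this (toy struct, not the real CABACContext):

#include <stddef.h>
#include <stdint.h>

struct ctx { int low; int range; const uint8_t *bytestream; };

static inline int low_plus_range(struct ctx *c)
{
    int sum;
    __asm__ volatile(
        "movl %c2(%1), %0           \n\t" /* sum  = c->low   */
        "addl %c3(%1), %0           \n\t" /* sum += c->range */
        : "=&r"(sum)
        : "r"(c),
          "i"(offsetof(struct ctx, low)),
          "i"(offsetof(struct ctx, range))
        : "memory");
    return sum;
}

However many fields the template touches, only the one pointer register is
needed, which is the benefit being claimed over one "m" operand per field.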
Ronald S. Bultje
2012-03-17 16:34:56 UTC
Permalink
---
libavcodec/x86/cabac.h | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 525ace6..78d8af7 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -27,7 +27,7 @@
#include "config.h"

#if HAVE_FAST_CMOV
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
@@ -37,7 +37,7 @@
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
@@ -57,7 +57,7 @@
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
- BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp) \
+ BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp) \
"movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
--
1.7.2.1
Benjamin Larsson
2012-03-20 11:58:32 UTC
Permalink
OK
Ronald S. Bultje
2012-03-17 16:34:57 UTC
Permalink
---
libavcodec/x86/h264_i386.h | 24 ++++++++++++------------
1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 86066db..131d29d 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -46,13 +46,13 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
int bit;
x86_reg coeff_count;
__asm__ volatile(
- "2: \n\t"
+ "3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
"%5", "%k0", "%b0", "%c11(%6)")

"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
@@ -65,19 +65,19 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"movl %%ecx, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
@@ -99,7 +99,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
x86_reg state;
__asm__ volatile(
"mov %1, %6 \n\t"
- "2: \n\t"
+ "3: \n\t"

"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
@@ -110,7 +110,7 @@ static int decode_significance_8x8_x86(CABACContext *c,

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"

"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"
@@ -123,18 +123,18 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movl %k6, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
--
1.7.2.1
Benjamin Larsson
2012-03-20 11:59:13 UTC
Permalink
OK
Ronald S. Bultje
2012-03-17 16:34:58 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */

-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -64,9 +64,12 @@
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
- " jnz 1f \n\t"\
+ " jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
@@ -79,7 +82,7 @@
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
- "1: \n\t"
+ "2: \n\t"

#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS)
#define get_cabac_inline get_cabac_inline_x86
@@ -90,10 +93,12 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,

__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
- "%2", "%3", "%b3", "%c6(%5)")
+ "%2", "%3", "%b3",
+ "%c6(%5)", "%c7(%5)")
: "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 131d29d..9fc05f4 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -49,14 +49,16 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c11(%6)")
+ "%5", "%k0", "%b0",
+ "%c11(%6)", "%c12(%6)")

"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c11(%6)")
+ "%5", "%k0", "%b0",
+ "%c11(%6)", "%c12(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -83,7 +85,8 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -106,7 +109,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %9, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c12(%7)")
+ "%5", "%k0", "%b0",
+ "%c12(%7)", "%c13(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -116,7 +120,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %11, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%c12(%7)")
+ "%5", "%k0", "%b0",
+ "%c12(%7)", "%c13(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -141,7 +146,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
Benjamin Larsson
2012-03-20 12:02:37 UTC
Permalink
OK
Måns Rullgård
2012-03-20 12:44:41 UTC
Permalink
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -64,9 +64,12 @@
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
- " jnz 1f \n\t"\
+ " jnz 2f \n\t"\
Why do you renumber these? Number labels don't need to be in ascending
order or anything like that.
Post by Ronald S. Bultje
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
Is there no way of doing this with cmov instead of branching?
--
Måns Rullgård
***@mansr.com
Uoti Urpala
2012-03-20 13:38:50 UTC
Permalink
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
Is there no way of doing this with cmov instead of branching?
Branches don't have to be expensive if they're never actually taken and
are predicted correctly. It's not obvious whether cmov would be better.

BTW the code calling this would really benefit from using named asm
arguments.
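A small sketch of the named-operand style being suggested here (toy struct
again, not the actual CABAC code) -- symbolic [names] in place of the
positional %N references:

#include <stddef.h>

struct ctx { int low; };

static inline int load_low(struct ctx *c)
{
    int low;
    __asm__ volatile(
        "movl %c[off](%[ctx]), %[out]   \n\t" /* low = c->low */
        : [out]"=r"(low)
        : [ctx]"r"(c),
          [off]"i"(offsetof(struct ctx, low))
        : "memory");
    return low;
}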
Måns Rullgård
2012-03-20 13:45:42 UTC
Permalink
Post by Uoti Urpala
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
Is there no way of doing this with cmov instead of branching?
Branches don't have to be expensive if they're never actually taken and
are predicted correctly. It's not obvious whether cmov would be better.
Every branch puts extra pressure on the prediction resources, so
avoiding them can be beneficial even if they'd be predicted. If there
is a reasonable branch-free alternative, it's worth testing both.
--
Måns Rullgård
***@mansr.com
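For comparison, one conceivable branch-free form of the bounded pointer
advance, as a sketch only (assumes a CPU with cmov; the thread never
benchmarked or adopted this, and note it uses an unsigned compare via
cmovb where the patch's jge branch is signed):

#include <stdint.h>

static inline const uint8_t *advance_if_in_bounds(const uint8_t *p,
                                                  const uint8_t *end)
{
    const uint8_t *next;
    __asm__ (
        "lea    2(%[p]), %[next]    \n\t" /* next = p + 2          */
        "cmp    %[end], %[p]        \n\t" /* compare p against end */
        "cmovb  %[next], %[p]       \n\t" /* p = next if p < end   */
        : [p]"+r"(p), [next]"=&r"(next)
        : [end]"r"(end)
        : "cc");
    return p;
}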
Ronald S. Bultje
2012-03-20 18:55:20 UTC
Permalink
Hi,
Post by Uoti Urpala
BTW the code calling this would really benefit from using named asm
arguments.
I agree. Patch welcome.

(I have other things to finish first, I'd love to do it but it's not
high on my priority list.)

Ronald
Ronald S. Bultje
2012-03-22 13:53:14 UTC
Permalink
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.

Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.

Ping.

Ronald
Måns Rullgård
2012-03-22 13:55:01 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
The patch certainly is not.
--
Måns Rullgård
***@mansr.com
Diego Biurrun
2012-03-22 14:07:37 UTC
Permalink
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?

Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.

The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.

Diego
Måns Rullgård
2012-03-22 14:12:22 UTC
Permalink
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
--
Måns Rullgård
***@mansr.com
İsmail Dönmez
2012-03-22 14:14:51 UTC
Permalink
Hi;
Post by Ronald S. Bultje
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte, end) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"movzbl "statep" , "ret"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "range" , "tmp"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"and $0xC0 , "range"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
@@ -64,9 +64,12 @@
"shl %%cl , "low"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "tmpbyte" , "statep"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"test "lowword" , "lowword"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
- " jnz 1f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+ " jnz 2f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Why do you renumber these? Number labels don't need to be in
ascending
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
This is getting off topic. Let's concentrate on the patch itself.

Thanks,
ismail
Måns Rullgård
2012-03-22 14:23:12 UTC
Permalink
Post by İsmail Dönmez
Hi;
Post by Ronald S. Bultje
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte, end) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"movzbl "statep" , "ret"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "range" , "tmp"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"and $0xC0 , "range"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
@@ -64,9 +64,12 @@
"shl %%cl , "low"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "tmpbyte" , "statep"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"test "lowword" , "lowword"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
- " jnz 1f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+ " jnz 2f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Why do you renumber these? Number labels don't need to be in
ascending
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
This is getting off topic. Let's concentrate on the patch itself.
This is not off-topic. It is about Ronald going mental because I dared
question the quality of a patch he submitted on behalf of the almighty,
infallible Google.
--
Måns Rullgård
***@mansr.com
Diego Biurrun
2012-03-22 14:32:42 UTC
Permalink
Post by Måns Rullgård
Post by İsmail Dönmez
Post by Ronald S. Bultje
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range,
tmp, tmpbyte, byte, end) \
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"movzbl "statep" , "ret"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "range" , "tmp"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"and $0xC0 , "range"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
@@ -64,9 +64,12 @@
"shl %%cl , "low"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"mov "tmpbyte" , "statep"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
"test "lowword" , "lowword"
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
- " jnz 1f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Post by Ronald S. Bultje
+ " jnz 2f
\n\t"\
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
Why do you renumber these? Number labels don't need to be in
ascending
Post by Diego Biurrun
Post by Måns Rullgård
Post by Ronald S. Bultje
Post by Måns Rullgård
order or anything like that.
Because it's cleaner.
The patch certainly is not.
How does this comment help us move forward?
Thanks for sharing your opinion with us, but we heard you loud and clear
the first time around.
The labels get renumbered, so be it. Now let's move on towards solving
the problem at hand, which is the overread and the compiler magic.
Why are you so hostile? Are you also on google payroll now?
This is getting off topic. Let's concentrate on the patch itself.
This is not off-topic. It is about Ronald going mental because I dared
question the quality of a patch he submitted on behalf of the almighty,
infallible Google.
We will talk about this and other things when all parties have cooled
down, but we will surely not have a trollfest here on this mailing list.

If you guys continue posting into this thread I will ask for the mailing
list to be set on moderation until the flamewar is over and for all flames
to be discarded.

Diego
Kostya Shishkov
2012-03-22 14:05:28 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.
Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.
Can we at least get output from several compilers and compiler versions for
the functions that use it? It's inline assembly, so compiler output for these
may vary greatly, and register allocation may fail for some other GCC version,
for instance. When we have that, we can discuss it further.
Ronald S. Bultje
2012-03-24 19:32:55 UTC
Permalink
Hi,

On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch

Complete disassembly attached in before.txt and after.txt with each of
the above compilers.

Ronald
Kostya Shishkov
2012-03-26 18:54:48 UTC
Permalink
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
Luca Barbato
2012-03-26 20:36:35 UTC
Permalink
Post by Kostya Shishkov
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
I tried to compare it, and it seems that the patch speeds up everything
noticeably on linux/gcc-4.6.2. (I ran it 30 times for each interesting patch
of the set; a few times it got worse, many times it got better. I threw away
the outliers and overall it seems better.)

lu
--
Luca Barbato
Gentoo/linux
http://dev.gentoo.org/~lu_zero
Ronald S. Bultje
2012-03-27 15:08:42 UTC
Permalink
Hi,
Post by Luca Barbato
Post by Kostya Shishkov
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
I tried to compare it and seems that the patch speeds up everything
sensibly on linux/gcc-4.6.2. (ran 30 times for each interesting patch of
the set, few times it got worse many times it got better I thrown away
outliers and seems overall better)
OK, so are there no more objections to the whole patchset then? I'd
like to push this.

Ronald
Måns Rullgård
2012-03-27 15:11:30 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Luca Barbato
Post by Kostya Shishkov
Post by Ronald S. Bultje
Hi,
On Thu, Mar 22, 2012 at 7:05 AM, Kostya Shishkov
Post by Kostya Shishkov
Can we get at least several compilers and compiler version output for the
functions that use it? It's inline assembly, so compiler output for these
may vary greatly and fail register allocation for some other GCC version,
for instance. When we have it, then we can discuss it further.
gcc-4.2.1: better after patch (less and shorter instructions)
gcc-4.2.1/llvm: same number of instructions before/after, but shorter
instructions after patch
gcc-4.5.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.6.3: same number of instructions before/after, but shorter
instructions after patch
gcc-4.7.0: better after patch (less and shorter instructions)
clang-3.0: same number of instructions before/after, but shorter
instructions after patch
Complete disassembly attached in before.txt and after.txt with each of
the above compilers.
Looks legit, what do other people think?
I tried to compare it and seems that the patch speeds up everything
sensibly on linux/gcc-4.6.2. (ran 30 times for each interesting patch of
the set, few times it got worse many times it got better I thrown away
outliers and seems overall better)
OK, so are there no more objections to the whole patchset then? I'd
like to push this.
You have not done anything to address the issue of why code that
_should_ be worse is actually compiling better. Allowing such anomalies
to go without investigation is irresponsible at best.

I realise, however, that you don't give a fuck about this and that you
are determined to push this hack of a patch no matter what. Enjoy your
victory.
--
Måns Rullgård
***@mansr.com
Luca Barbato
2012-03-27 17:04:34 UTC
Permalink
Post by Måns Rullgård
You have not done anything to address the issue of why code that
_should_ be worse is actually compiling better. Allowing such anomalies
to go without investigation is irresponsible at best.
I'll try to reduce that code myself soon; lately I have been too busy, and
the best I could do was to test it myself with the timer.
Post by Måns Rullgård
I realise, however, that you don't give a fuck about this and that you
are determined to push this hack of a patch no matter what. Enjoy your
victory.
As I said, had it not performed decently, I would have just had that bit
disabled for the non-working gcc.

Would going this route address your concern until I, Ronald, or whoever else
manages to find the time to investigate the issue in depth?

lu
--
Luca Barbato
Gentoo/linux
http://dev.gentoo.org/~lu_zero
Alexander Strange
2012-03-25 06:26:15 UTC
Permalink
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.
Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.
Ping.
Ronald
Because this only changes the refill operation, I don't think it's
seriously performance-sensitive; that branch doesn't run often. But
how about a benchmark?
Jason Garrett-Glaser
2012-03-25 17:01:49 UTC
Permalink
On Sat, Mar 24, 2012 at 11:26 PM, Alexander Strange
Post by Alexander Strange
Post by Ronald S. Bultje
Hi,
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/x86/cabac.h     |   15 ++++++++++-----
 libavcodec/x86/h264_i386.h |   18 ++++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 78d8af7..082395c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
         "movzbl "statep"    , "ret"                                 \n\t"\
         "mov    "range"     , "tmp"                                 \n\t"\
         "and    $0xC0       , "range"                               \n\t"\
@@ -64,9 +64,12 @@
         "shl    %%cl        , "low"                                 \n\t"\
         "mov    "tmpbyte"   , "statep"                              \n\t"\
         "test   "lowword"   , "lowword"                             \n\t"\
-        " jnz   1f                                                  \n\t"\
+        " jnz   2f                                                  \n\t"\
Why do you renumber these?  Number labels don't need to be in ascending
order or anything like that.
Because it's cleaner.
Post by Ronald S. Bultje
         "mov    "byte"      , %%"REG_c"                             \n\t"\
+        "cmp    "end"       , %%"REG_c"                             \n\t"\
+        "jge    1f                                                  \n\t"\
         "add"OPSIZE" $2     , "byte"                                \n\t"\
+        "1:                                                         \n\t"\
Is there no way of doing this with cmov instead of branching?
No. If you have suggestions, I'm naturally very open to them.
Any other review? Again, this patch + other cabac ones depending on it
has been sitting here for days with no review, no progress and no
plan.
Ping.
Ronald
Because this only changes the refill operation I don't think it's
seriously performance-sensitive; that branch doesn't run often. But
how about a benchmark?
I agree; if we're just adding a check in refill, the cost is
completely negligible.

Jason
Kostya Shishkov
2012-03-17 16:49:23 UTC
Permalink
Post by Ronald S. Bultje
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/cabac_functions.h | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index b150aab..4c74cf7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -47,7 +47,8 @@ static void refill(CABACContext *c){
c->low+= c->bytestream[0]<<1;
#endif
c->low -= CABAC_MASK;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS / 8;
}
static inline void renorm_cabac_decoder_once(CABACContext *c){
@@ -74,7 +75,8 @@ static void refill2(CABACContext *c){
#endif
c->low += x<<i;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS/8;
}
static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
--
probably OK
Ronald S. Bultje
2012-03-17 18:54:49 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-***@libav.org
---
libavcodec/cabac_functions.h | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index b150aab..4c74cf7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -47,7 +47,8 @@ static void refill(CABACContext *c){
c->low+= c->bytestream[0]<<1;
#endif
c->low -= CABAC_MASK;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS / 8;
}

static inline void renorm_cabac_decoder_once(CABACContext *c){
@@ -74,7 +75,8 @@ static void refill2(CABACContext *c){
#endif

c->low += x<<i;
- c->bytestream+= CABAC_BITS/8;
+ if (c->bytestream < c->bytestream_end)
+ c->bytestream += CABAC_BITS/8;
}

static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:50 UTC
Permalink
---
libavcodec/x86/cabac.h | 19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 3c3652d..7d8976c 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,8 +105,8 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %4, %k1 \n\t"
- "movl %2, %%eax \n\t"
+ "movl %a5(%2), %k1 \n\t"
+ "movl %a3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
@@ -117,20 +117,23 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
" jnz 1f \n\t"
- "mov %3, %1 \n\t"
+ "mov %a4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
- "mov %1, %3 \n\t"
+ "mov %1, %a4(%2) \n\t"
"1: \n\t"
- "movl %%eax, %2 \n\t"
+ "movl %%eax, %a3(%2) \n\t"

- :"+c"(val), "=&r"(tmp), "+m"(c->low), "+m"(c->bytestream)
- :"m"(c->range)
- : "%eax", "%edx"
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
);
return val;
}
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:51 UTC
Permalink
---
libavcodec/x86/cabac.h | 44 ++++++++++++++++++++++----------------------
1 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 7d8976c..b00652b 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,28 +105,28 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %a5(%2), %k1 \n\t"
- "movl %a3(%2), %%eax \n\t"
- "shl $17, %k1 \n\t"
- "add %%eax, %%eax \n\t"
- "sub %k1, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %k1 \n\t"
- "add %k1, %%eax \n\t"
- "xor %%edx, %%ecx \n\t"
- "sub %%edx, %%ecx \n\t"
- "test %%ax, %%ax \n\t"
- " jnz 1f \n\t"
- "mov %a4(%2), %1 \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%1), %%edx \n\t"
- "bswap %%edx \n\t"
- "shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
- "addl %%edx, %%eax \n\t"
- "mov %1, %a4(%2) \n\t"
- "1: \n\t"
- "movl %%eax, %a3(%2) \n\t"
+ "movl %a5(%2), %k1 \n\t"
+ "movl %a3(%2), %%eax \n\t"
+ "shl $17, %k1 \n\t"
+ "add %%eax, %%eax \n\t"
+ "sub %k1, %%eax \n\t"
+ "cltd \n\t"
+ "and %%edx, %k1 \n\t"
+ "add %k1, %%eax \n\t"
+ "xor %%edx, %%ecx \n\t"
+ "sub %%edx, %%ecx \n\t"
+ "test %%ax, %%ax \n\t"
+ "jnz 1f \n\t"
+ "mov %a4(%2), %1 \n\t"
+ "subl $0xFFFF, %%eax \n\t"
+ "movzwl (%1), %%edx \n\t"
+ "bswap %%edx \n\t"
+ "shrl $15, %%edx \n\t"
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %a4(%2) \n\t"
+ "1: \n\t"
+ "movl %%eax, %a3(%2) \n\t"

: "+c"(val), "=&r"(tmp)
: "r"(c),
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:52 UTC
Permalink
---
libavcodec/x86/cabac.h | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index b00652b..adf4fc3 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -105,7 +105,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
- "movl %a5(%2), %k1 \n\t"
+ "movl %a6(%2), %k1 \n\t"
"movl %a3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
@@ -122,9 +122,10 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
- "add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
- "mov %1, %a4(%2) \n\t"
+ "cmp %a5(%2), %1 \n\t"
+ "jge 1f \n\t"
+ "add"OPSIZE" $2, %a4(%2) \n\t"
"1: \n\t"
"movl %%eax, %a3(%2) \n\t"

@@ -132,6 +133,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
--
1.7.2.1
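[Editorial note: in C terms, the tail of get_cabac_bypass_sign_x86() now behaves roughly like the sketch below: the two freshly loaded bytes are still folded into low, but the bytestream pointer only moves forward while it is short of bytestream_end (the new cmp/jge pair guarding the add"OPSIZE" $2). Field names follow CABACContext; the surrounding sign/range arithmetic is omitted, so this is a rough rendering, not the real code.]

#include <stdint.h>

typedef struct {
    int            low;
    const uint8_t *bytestream;
    const uint8_t *bytestream_end;
} MiniCABAC;                                   /* stand-in for CABACContext */

static void bypass_refill_clamped(MiniCABAC *c)
{
    if (!(uint16_t)c->low) {                   /* test %%ax, %%ax        */
        c->low -= 0xFFFF;                      /* subl $0xFFFF, %%eax    */
        /* movzwl + bswap + shrl $15: big-endian 16-bit read, shifted by 1 */
        c->low += (c->bytestream[0] << 9) | (c->bytestream[1] << 1);
        if (c->bytestream < c->bytestream_end) /* the new cmp/jge guard  */
            c->bytestream += 2;                /* add"OPSIZE" $2         */
    }
}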
Ronald S. Bultje
2012-03-17 18:54:53 UTC
Permalink
---
libavcodec/x86/cabac.h | 10 +++++-----
libavcodec/x86/h264_i386.h | 32 +++++++++++++++++---------------
2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index adf4fc3..e03a4de 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -89,11 +89,11 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
int bit, tmp;

__asm__ volatile(
- BRANCHLESS_GET_CABAC("%0", "(%5)", "%1", "%w1", "%2",
- "%3", "%b3", "%4")
- :"=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp),
- "+m"(c->bytestream)
- :"r"(state)
+ BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
+ "%2", "%3", "%b3", "%a6(%5)")
+ : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "r"(state), "r"(c),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index e195e04..2cfcbdd 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -48,15 +48,15 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
__asm__ volatile(
"2: \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a11(%6)")

"test $1, %4 \n\t"
" jz 3f \n\t"
"add %10, %1 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%1)", "%3",
- "%w3", "%5", "%k0", "%b0", "%6")
+ BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a11(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -80,10 +80,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"4: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
- "+&r"(c->low), "=&r"(bit), "+&r"(c->range),
- "+m"(c->bytestream)
- :"m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
+ : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
+ "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
+ : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -105,8 +105,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a12(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -115,8 +115,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"

- BRANCHLESS_GET_CABAC("%4", "(%6)", "%3",
- "%w3", "%5", "%k0", "%b0", "%7")
+ BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
+ "%5", "%k0", "%b0", "%a12(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -137,9 +137,11 @@ static int decode_significance_8x8_x86(CABACContext *c,
"4: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
- :"=&q"(coeff_count),"+m"(last), "+m"(index), "+&r"(c->low), "=&r"(bit),
- "+&r"(c->range), "=&r"(state), "+m"(c->bytestream)
- :"m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_coeff_ctx_base)
+ : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ "=&r"(bit), "+&r"(c->range), "=&r"(state)
+ : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
+ "m"(sig_off), "m"(last_coeff_ctx_base),
+ "i"(offsetof(CABACContext, bytestream))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
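[Editorial note: with this change BRANCHLESS_GET_CABAC no longer receives c->bytestream as its own "+m" operand; the caller passes a ready-made addressing string such as "%a6(%5)", built from the context pointer and an offsetof() constant, and the macro pastes it verbatim into its template. A toy example of that convention follows; LOAD32, Demo and load_via_macro are invented for illustration.]

#include <stddef.h>

/* The memory operand arrives as a pre-formatted string ("%a2(%1)" below)
 * and is pasted verbatim into the template, the way BRANCHLESS_GET_CABAC
 * now treats its "byte" argument. */
#define LOAD32(dst, src_expr) \
    "movl " src_expr ", " dst " \n\t"

typedef struct { int pad, value; } Demo;        /* illustrative only */

static inline int load_via_macro(const Demo *d)
{
    int v;
    __asm__ volatile(
        LOAD32("%0", "%a2(%1)")
        : "=r"(v)
        : "r"(d), "i"(offsetof(Demo, value))
        : "memory");
    return v;
}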
Ronald S. Bultje
2012-03-17 18:54:54 UTC
Permalink
---
libavcodec/x86/cabac.h | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index e03a4de..ca8a1d5 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -27,7 +27,7 @@
#include "config.h"

#if HAVE_FAST_CMOV
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
@@ -37,7 +37,7 @@
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp)\
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
@@ -57,7 +57,7 @@
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
- BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, lowword, range, tmp) \
+ BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp) \
"movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\
--
1.7.2.1
Ronald S. Bultje
2012-03-17 18:54:55 UTC
Permalink
---
libavcodec/x86/h264_i386.h | 24 ++++++++++++------------
1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 2cfcbdd..31ddaf6 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -46,13 +46,13 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
int bit;
x86_reg coeff_count;
__asm__ volatile(
- "2: \n\t"
+ "3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
"%5", "%k0", "%b0", "%a11(%6)")

"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
@@ -65,19 +65,19 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"movl %%ecx, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
@@ -99,7 +99,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
x86_reg state;
__asm__ volatile(
"mov %1, %6 \n\t"
- "2: \n\t"
+ "3: \n\t"

"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
@@ -110,7 +110,7 @@ static int decode_significance_8x8_x86(CABACContext *c,

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
- " jz 3f \n\t"
+ " jz 4f \n\t"

"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t"
"add %11, %6 \n\t"
@@ -123,18 +123,18 @@ static int decode_significance_8x8_x86(CABACContext *c,
"movl %k6, (%0) \n\t"

"test $1, %4 \n\t"
- " jnz 4f \n\t"
+ " jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

- "3: \n\t"
+ "4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
- " jb 2b \n\t"
+ " jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
- "4: \n\t"
+ "5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
--
1.7.2.1
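[Editorial note: the renumbering above looks cosmetic but appears to prepare for the next patch. GNU as numeric labels may be defined any number of times, and a reference such as "2b" binds to the nearest preceding "2:"; once BRANCHLESS_GET_CABAC starts emitting its own "1:"/"2:" labels, a caller whose loop head is also "2:" would see its "jb 2b" captured by the label inside the expanded macro, so the caller's labels move to 3/4/5. A tiny standalone illustration of the nearest-definition rule, not lavc code:]

/* "3b" binds to the nearest preceding "3:"; if an expanded macro between
 * the loop head and the branch defined its own "3:", the branch would
 * silently retarget there -- which is what the renumbering avoids. */
static inline int spin_down(int n)
{
    __asm__ volatile(
        "3:              \n\t"   /* loop head (the caller-side label) */
        "sub $1, %0      \n\t"
        "jnz 3b          \n\t"   /* backward reference to the loop head */
        : "+r"(n));
    return n;
}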
Ronald S. Bultje
2012-03-17 18:54:56 UTC
Permalink
Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
libavcodec/x86/cabac.h | 15 ++++++++++-----
libavcodec/x86/h264_i386.h | 18 ++++++++++++------
2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index ca8a1d5..a6ec228 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -51,7 +51,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */

-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte) \
+#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -64,9 +64,12 @@
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
- " jnz 1f \n\t"\
+ " jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"\
"add"OPSIZE" $2 , "byte" \n\t"\
+ "1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
@@ -79,7 +82,7 @@
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
- "1: \n\t"
+ "2: \n\t"

#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS)
#define get_cabac_inline get_cabac_inline_x86
@@ -90,10 +93,12 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,

__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
- "%2", "%3", "%b3", "%a6(%5)")
+ "%2", "%3", "%b3",
+ "%a6(%5)", "%a7(%5)")
: "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return bit & 1;
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 31ddaf6..e849a3d 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -49,14 +49,16 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
"3: \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a11(%6)")
+ "%5", "%k0", "%b0",
+ "%a11(%6)", "%a12(%6)")

"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a11(%6)")
+ "%5", "%k0", "%b0",
+ "%a11(%6)", "%a12(%6)")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
@@ -83,7 +85,8 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
@@ -106,7 +109,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %9, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a12(%7)")
+ "%5", "%k0", "%b0",
+ "%a12(%7)", "%a13(%7)")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
@@ -116,7 +120,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add %11, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
- "%5", "%k0", "%b0", "%a12(%7)")
+ "%5", "%k0", "%b0",
+ "%a12(%7)", "%a13(%7)")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
@@ -141,7 +146,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
- "i"(offsetof(CABACContext, bytestream))
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
: "%"REG_c, "memory"
);
return coeff_count;
--
1.7.2.1
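[Editorial note: functionally this is the same clamp as in the C reader from patch 1, now applied inside the shared asm refill path: when the low word runs empty the macro still reads and merges two bytes, but the cmp/jge pair keeps c->bytestream from advancing past c->bytestream_end. A rough, self-contained C rendering of the changed part of the macro, with the renormalisation shift math elided:]

#include <stdint.h>

typedef struct {
    const uint8_t *bytestream;
    const uint8_t *bytestream_end;
} MiniCABAC;                                   /* stand-in for CABACContext */

/* What the changed part of BRANCHLESS_GET_CABAC does around the refill. */
static const uint8_t *refill_ptr_clamped(MiniCABAC *c)
{
    const uint8_t *p = c->bytestream;          /* mov  byte, %%REG_c          */
    if (p < c->bytestream_end)                 /* cmp  end,  %%REG_c / jge 1f */
        c->bytestream = p + 2;                 /* add"OPSIZE" $2, byte        */
    return p;                                  /* the bytes at p are still
                                                  merged into "low" afterwards */
}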