[3/4] AArch64 Add SVE2 implementation for pow2 bitmask division

Message ID Yy19es5TOyWlHsnk@arm.com
State New, archived
Series [1/4] middle-end Support not decomposing specific divisions during vectorization.

Commit Message

Tamar Christina Sept. 23, 2022, 9:33 a.m. UTC
  Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

This patch adds a named function to allow us to emit an optimized sequence
when doing an unsigned division that is equivalent to:

   x = y / (2 ^ (bitsize (y) / 2) - 1)

For SVE2 this means we generate for:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] = (pixel[i] * level) / 0xff;
}

the following:

        mov     z3.b, #1
.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        addhnb  z1.b, z0.h, z3.h
        addhnb  z0.b, z0.h, z1.h
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3

instead of:

.L3:
        ld1b    z0.h, p1/z, [x0, x3]
        mul     z0.h, p0/m, z0.h, z1.h
        umulh   z0.h, p0/m, z0.h, z2.h
        lsr     z0.h, z0.h, #7
        st1b    z0.h, p1, [x0, x3]
        inch    x3
        whilelo p1.h, w3, w2
        b.any   .L3

This results in significantly faster code.
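
As a side note, the correctness of the addhnb pair can be checked with a small
scalar model (an illustration only, not part of the patch).  It follows the
identity stated in the comment of the .md hunk below, i.e.
x / 0xff == (x + ((x + 257) >> 8)) >> 8, with the arithmetic done in double
the precision of the narrow type so the additions cannot wrap:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* Scalar model of the two ADDHNB steps for the halfword case: the
     addend 257 (0x0101) is what "mov z3.b, #1" looks like when the byte
     pattern is reinterpreted as halfwords.  Products of two uint8_t
     values never exceed 0xff * 0xff, which is the range the vectorized
     loop feeds into the division.  */
  for (uint32_t x = 0; x <= 0xffu * 0xffu; x++)
    {
      uint32_t t = (x + 257) >> 8;   /* first addhnb  */
      uint32_t q = (x + t) >> 8;     /* second addhnb */
      assert (q == x / 0xff);
    }
  return 0;
}

The same shape applies to the wider element sizes handled by the expander,
with the shift being half the element width and the addend being
(1 << shift) + 1.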

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
  

Comments

Tamar Christina Oct. 31, 2022, 11:34 a.m. UTC | #1
Ping

  
Tamar Christina Nov. 9, 2022, 8:33 a.m. UTC | #2
ping

  
Richard Sandiford Nov. 12, 2022, 12:17 p.m. UTC | #3
Sorry for the slow review, been snowed under with stage1 stuff.

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> In plenty of image and video processing code it's common to modify pixel values
> by a widening operation and then scale them back into range by dividing by 255.
>
> This patch adds a named function to allow us to emit an optimized sequence
> when doing an unsigned division that is equivalent to:
>
>    x = y / (2 ^ (bitsize (y) / 2) - 1)
>
> For SVE2 this means we generate for:
>
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] = (pixel[i] * level) / 0xff;
> }
>
> the following:
>
>         mov     z3.b, #1
> .L3:
>         ld1b    z0.h, p0/z, [x0, x3]
>         mul     z0.h, p1/m, z0.h, z2.h
>         addhnb  z1.b, z0.h, z3.h
>         addhnb  z0.b, z0.h, z1.h
>         st1b    z0.h, p0, [x0, x3]
>         inch    x3
>         whilelo p0.h, w3, w2
>         b.any   .L3
>
> instead of:
>
> .L3:
>         ld1b    z0.h, p1/z, [x0, x3]
>         mul     z0.h, p0/m, z0.h, z1.h
>         umulh   z0.h, p0/m, z0.h, z2.h
>         lsr     z0.h, z0.h, #7
>         st1b    z0.h, p1, [x0, x3]
>         inch    x3
>         whilelo p1.h, w3, w2
>         b.any   .L3
>
> This results in significantly faster code.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -71,6 +71,7 @@
>  ;; ---- [INT] Reciprocal approximation
>  ;; ---- [INT<-FP] Base-2 logarithm
>  ;; ---- [INT] Polynomial multiplication
> +;; ---- [INT] Misc optab implementations
>  ;;
>  ;; == Permutation
>  ;; ---- [INT,FP] General permutes
> @@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
>    "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
>  )
>  
> +;; -------------------------------------------------------------------------
> +;; ---- [INT] Misc optab implementations
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - aarch64_bitmask_udiv
> +;; -------------------------------------------------------------------------
> +
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it as
> +;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> +;; double the precision of x.
> +;;
> +;; See aarch64-simd.md for bigger explanation.
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
> +   (match_operand:SVE_FULL_HSDI 1 "register_operand")
> +   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
> +  "TARGET_SVE2"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;

A slightly simpler way to write this, without the direct use of the
encoding, is:

  rtx elt = unwrap_const_vec_duplicate (operands[2]);
  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
    FAIL;

OK with that change, thanks.

Richard

  

Patch

--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@ 
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@  (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}