[V2] RISC-V: Support POPCOUNT auto-vectorization

Message ID 20230731141349.1188774-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series [V2] RISC-V: Support POPCOUNT auto-vectorization |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai July 31, 2023, 2:13 p.m. UTC
  This patch is inspired by "lowerCTPOP" in LLVM.
Support popcount auto-vectorization by LLVM approach.

Before this patch:

<source>:7:21: missed: couldn't vectorize loop
<source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4);

After this patch:

popcount_32:
	ble	a2,zero,.L5
	li	t3,1431654400
	li	a7,858992640
	li	t1,252645376
	li	a6,16711680
	li	a3,65536
	addiw	t3,t3,1365
	addiw	a7,a7,819
	addiw	t1,t1,-241
	addiw	a6,a6,255
	addiw	a3,a3,-1
.L3:
	vsetvli	a5,a2,e8,mf4,ta,ma
	vle32.v	v1,0(a1)
	vsetivli	zero,4,e32,m1,ta,ma
	vsrl.vi	v2,v1,1
	vand.vx	v2,v2,t3
	vsub.vv	v1,v1,v2
	vsrl.vi	v2,v1,2
	vand.vx	v2,v2,a7
	vand.vx	v1,v1,a7
	vadd.vv	v1,v1,v2
	vsrl.vi	v2,v1,4
	vadd.vv	v1,v1,v2
	vand.vx	v1,v1,t1
	vsrl.vi	v2,v1,8
	vand.vx	v2,v2,a6
	slli	a4,a5,2
	vand.vx	v1,v1,a6
	vadd.vv	v1,v1,v2
	vsrl.vi	v2,v1,16
	vand.vx	v1,v1,a3
	vand.vx	v2,v2,a3
	vadd.vv	v1,v1,v2
	vmv.v.v	v1,v1
	vsetvli	zero,a2,e32,m1,ta,ma
	sub	a2,a2,a5
	vse32.v	v1,0(a0)
	add	a1,a1,a4
	add	a0,a0,a4
	bne	a2,zero,.L3
.L5:
	ret

gcc/ChangeLog:

	* config/riscv/autovec.md (popcount<mode>2): New pattern.
	* config/riscv/riscv-protos.h (expand_popcount): New function.
	* config/riscv/riscv-v.cc (expand_popcount): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test.
	* gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test.

---
 gcc/config/riscv/autovec.md                   | 13 +++
 gcc/config/riscv/riscv-protos.h               |  1 +
 gcc/config/riscv/riscv-v.cc                   | 81 +++++++++++++++++++
 .../riscv/rvv/autovec/widen/popcount-1.c      | 23 ++++++
 .../riscv/rvv/autovec/widen/popcount_run-1.c  | 50 ++++++++++++
 5 files changed, 168 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
  

Comments

Robin Dapp July 31, 2023, 7:38 p.m. UTC | #1
Hi Juzhe,

> +/* Expand Vector POPCOUNT by parallel popcnt:
> +
> +   int parallel_popcnt(uint32_t n) {
> +   #define POW2(c)      (1U << (c))
> +   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
> +   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
> +	n = COUNT(n, 0);
> +	n = COUNT(n, 1);
> +	n = COUNT(n, 2);
> +	n = COUNT(n, 3);
> +	n = COUNT(n, 4);
> +   //	n = COUNT(n, 5);  // uncomment this line for 64-bit integers
> +	return n;
> +   #undef COUNT
> +   #undef MASK
> +   #undef POW2
> +   }

That's quite a heavy implementation but I suppose with the proper cost
function it can still be worth it.  Did you also try some alternatives?
WWG comes to mind:

uint64_t c1 = 0x5555555555555555;
uint64_t c2 = 0x3333333333333333;
uint64_t c4 = 0x0F0F0F0F0F0F0F0F;

uint64_t wwg (uint64_t x) {
    x -= (x >> 1) & c1;
    x = ((x >> 2) & c2) + (x & c2);
    x = (x + (x >> 4) ) & c4;
    x *= 0x0101010101010101;
    return x >> 56;
}

From my recollection this is usually 30-40% faster than the naive tree
adder and also amenable to vectorization.  As long as the multiplication
is not terribly slow, that is.  Mula's algorithm should be significantly
faster even, another 30% IIRC.

I'm not against continuing with the more well-known approach for now
but we should keep in mind that there might still be potential for improvement.

>  } // namespace riscv_vector
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c

Any particular reason why the tests are in widen?

> +extern void abort (void) __attribute__ ((noreturn));

Why no __builtin_unreachable as in the other tests? 

> +      asm volatile ("" ::: "memory");

Is this necessary?  It doesn't hurt of course, just wondering.

All in all LGTM in case you'd rather get this upstream now.  We can
always improve later.

Regards
 Robin
  
Robin Dapp July 31, 2023, 8:27 p.m. UTC | #2
> +/* FIXME: We don't allow vectorizing "__builtin_popcountll" yet since it needs "vec_pack_trunc" support
> +          and such a pattern may cause inferior codegen.
> +	  We will enable "vec_pack_trunc" when we support a reasonable vector cost model.  */

Wait, why do we need vec_pack_trunc for popcountll?  For me vectorizing
it "just works" when the output is a uint64_t just like the standard
name demands.

If you're referring to something else, please detail in the comment.

Regards
 Robin
  
juzhe.zhong@rivai.ai July 31, 2023, 10:03 p.m. UTC | #3
>> From my recollection this is usually 30-40% faster than the naive tree
>> adder and also amenable to vectorization.  As long as the multiplication
>> is not terribly slow, that is.  Mula's algorithm should be significantly
>> faster even, another 30% IIRC.
 
>> I'm not against continuing with the more well-known approach for now
>> but we should keep in mind that there might still be potential for improvement.

No. I don't think it's faster.

>> Wait, why do we need vec_pack_trunc for popcountll?  For me vectorizing
>> it "just works" when the output is a uint64_t just like the standard
>> name demands.

>> If you're referring to something else, please detail in the comment.

I have no idea. I saw ARM SVE generate:
POP_COUNT
POP_COUNT
VEC_PACK_TRUNC.

I am gonna drop this patch since it's meaningless.

Thanks.


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-08-01 03:38
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng
Subject: Re: [PATCH V2] RISC-V: Support POPCOUNT auto-vectorization
Hi Juzhe,
 
> +/* Expand Vector POPCOUNT by parallel popcnt:
> +
> +   int parallel_popcnt(uint32_t n) {
> +   #define POW2(c)      (1U << (c))
> +   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
> +   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
> + n = COUNT(n, 0);
> + n = COUNT(n, 1);
> + n = COUNT(n, 2);
> + n = COUNT(n, 3);
> + n = COUNT(n, 4);
> +   // n = COUNT(n, 5);  // uncomment this line for 64-bit integers
> + return n;
> +   #undef COUNT
> +   #undef MASK
> +   #undef POW2
> +   }
 
That's quite a heavy implementation but I suppose with the proper cost
function it can still be worth it.  Did you also try some alternatives?
WWG comes to mind:
 
uint64_t c1 = 0x5555555555555555;
uint64_t c2 = 0x3333333333333333;
uint64_t c4 = 0x0F0F0F0F0F0F0F0F;
 
uint64_t wwg (uint64_t x) {
    x -= (x >> 1) & c1;
    x = ((x >> 2) & c2) + (x & c2);
    x = (x + (x >> 4) ) & c4;
    x *= 0x0101010101010101;
    return x >> 56;
}
 
From my recollection this is usually 30-40% faster than the naive tree
adder and also amenable to vectorization.  As long as the multiplication
is not terribly slow, that is.  Mula's algorithm should be significantly
faster even, another 30% IIRC.
 
I'm not against continuing with the more well-known approach for now
but we should keep in mind that there might still be potential for improvement.
 
>  } // namespace riscv_vector
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
 
Any particular reason why the tests are in widen?
 
> +extern void abort (void) __attribute__ ((noreturn));
 
Why no __builtin_unreachable as in the other tests? 
 
> +      asm volatile ("" ::: "memory");
 
Is this necessary?  It doesn't hurt of course, just wondering.
 
All in all LGTM in case you'd rather get this upstream now.  We can
always improve later.
 
Regards
Robin
  
Robin Dapp Aug. 1, 2023, 6:47 a.m. UTC | #4
>>> I'm not against continuing with the more well-known approach for now
>>> but we should keep in mind that there might still be potential for improvement.
> 
> No. I don't think it's faster.

I did a quick check on my x86 laptop and it's roughly 25% faster there.
That's consistent with the literature.  RISC-V qemu only shows 5-10%
improvement, though.

> I have no ideal. I saw ARM SVE generate:
> POP_COUNT
> POP_COUNT
> VEC_PACK_TRUNC.

I'd strongly suspect this happens because it's converting to int.
If you change dst to uint64_t there won't be any vec_pack_trunc.

> I am gonna drop this patch since it's meaningless.

But why?  It can still help even if we can improve on the sequence.
IMHO you can go ahead with it and just change int -> uint64_t in the
tests.

Regards
 Robin
  
Jeff Law Aug. 4, 2023, 8:56 p.m. UTC | #5
On 8/1/23 00:47, Robin Dapp via Gcc-patches wrote:
>>>>   I'm not against continuing with the more well-known approach for now
>>>>   but we should keep in mind that there might still be potential for improvement.
>>
>> No. I don't think it's faster.
> 
> I did a quick check on my x86 laptop and it's roughly 25% faster there.
> That's consistent with the literature.  RISC-V qemu only shows 5-10%
> improvement, though.
> 
>> I have no idea. I saw ARM SVE generate:
>> POP_COUNT
>> POP_COUNT
>> VEC_PACK_TRUNC.
> 
> I'd strongly suspect this happens because it's converting to int.
> If you change dst to uint64_t there won't be any vec_pack_trunc.
> 
>> I am gonna drop this patch since it's meaningless.
> 
> But why?  It can still help even if we can improve on the sequence.
> IMHO you can go ahead with it and just change int -> uint64_t in the
> tests.
It'd also be interesting to see if those popcounts in deepsjeng are 
vectorizable.  We got a major boost in deepsjeng at a prior employer, 
but I can't remember if it was from getting the popcounts vectorized or 
just not doing stupid stuff with them on the scalar side.


jeff
  

Patch

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 2094a77a9a7..7babc9756a1 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -922,6 +922,19 @@ 
   DONE;
 })
 
+;; -------------------------------------------------------------------------------
+;; - [INT] POPCOUNT.
+;; -------------------------------------------------------------------------------
+
+(define_expand "popcount<mode>2"
+  [(match_operand:VI 0 "register_operand")
+   (match_operand:VI 1 "register_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_popcount (operands);
+  DONE;
+})
+
 ;; -------------------------------------------------------------------------------
 ;; ---- [FP] Unary operations
 ;; -------------------------------------------------------------------------------
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 9e2c3d3e2cc..446ba7b559e 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -321,6 +321,7 @@  void expand_select_vl (rtx *);
 void expand_load_store (rtx *, bool);
 void expand_gather_scatter (rtx *, bool);
 void expand_cond_len_ternop (unsigned, rtx *);
+void expand_popcount (rtx *);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index aa8a6763716..ac7dae952be 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3610,4 +3610,85 @@  expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type)
   emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
 }
 
+/* Expand Vector POPCOUNT by parallel popcnt:
+
+   int parallel_popcnt(uint32_t n) {
+   #define POW2(c)      (1U << (c))
+   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
+   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
+	n = COUNT(n, 0);
+	n = COUNT(n, 1);
+	n = COUNT(n, 2);
+	n = COUNT(n, 3);
+	n = COUNT(n, 4);
+   //	n = COUNT(n, 5);  // uncomment this line for 64-bit integers
+	return n;
+   #undef COUNT
+   #undef MASK
+   #undef POW2
+   }
+*/
+void
+expand_popcount (rtx *ops)
+{
+  rtx dst = ops[0];
+  rtx src = ops[1];
+  machine_mode mode = GET_MODE (dst);
+  scalar_mode smode = GET_MODE_INNER (mode);
+  static const uint64_t mask_values[6]
+    = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+       0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
+
+  unsigned bit_size = GET_MODE_BITSIZE (smode);
+  rtx count = CONST0_RTX (mode);
+
+  rtx part_value = src;
+  /* Currently we don't have TI vector modes so bit_size is always <= 64.  */
+  for (unsigned i = 1, ct = 0; i < bit_size; i <<= 1, ++ct)
+    {
+      rtx mask_cst = gen_int_mode (mask_values[ct], smode);
+
+      rtx vshift
+	= expand_binop (mode, lshr_optab, part_value, gen_int_mode (i, smode),
+			NULL_RTX, true, OPTAB_DIRECT);
+
+      if (i == 4)
+	{
+	  /* Optimize ((X & MASK) + ((X >> 4) & MASK))
+
+	     -> (X + (X >> 4)) & MASK  */
+	  rtx rhs = expand_binop (mode, add_optab, part_value, vshift, NULL_RTX,
+				  false, OPTAB_DIRECT);
+	  part_value = gen_reg_rtx (mode);
+	  rtx part_ops[] = {part_value, rhs, mask_cst};
+	  emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
+			   part_ops);
+	}
+      else
+	{
+	  rtx rhs = gen_reg_rtx (mode);
+	  rtx rhs_ops[] = {rhs, vshift, mask_cst};
+	  emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
+			   rhs_ops);
+	  if (i == 1)
+	    part_value = expand_binop (mode, sub_optab, part_value, rhs,
+				       NULL_RTX, false, OPTAB_DIRECT);
+	  else
+	    {
+	      rtx lhs = gen_reg_rtx (mode);
+	      rtx lhs_ops[] = {lhs, part_value, mask_cst};
+	      emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
+			       lhs_ops);
+
+	      part_value = expand_binop (mode, add_optab, lhs, rhs, NULL_RTX,
+					 false, OPTAB_DIRECT);
+	    }
+	}
+    }
+
+  count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false,
+			OPTAB_DIRECT);
+  emit_move_insn (dst, count);
+}
+
 } // namespace riscv_vector
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
new file mode 100644
index 00000000000..bcb4a6e8571
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+void __attribute__ ((noinline, noclone))
+popcount_32 (unsigned int *restrict dst, uint32_t *restrict src, int size)
+{
+  for (int i = 0; i < size; ++i)
+    dst[i] = __builtin_popcount (src[i]);
+}
+
+void __attribute__ ((noinline, noclone))
+popcount_64 (unsigned int *restrict dst, uint64_t *restrict src, int size)
+{
+  for (int i = 0; i < size; ++i)
+    dst[i] = __builtin_popcountll (src[i]);
+}
+
+/* FIXME: We don't allow vectorizing "__builtin_popcountll" yet since it needs "vec_pack_trunc" support
+          and such a pattern may cause inferior codegen.
+	  We will enable "vec_pack_trunc" when we support a reasonable vector cost model.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
new file mode 100644
index 00000000000..f6be709639a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
@@ -0,0 +1,50 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include "popcount-1.c"
+
+extern void abort (void) __attribute__ ((noreturn));
+
+unsigned int data[] = {
+  0x11111100, 6,
+  0xe0e0f0f0, 14,
+  0x9900aab3, 13,
+  0x00040003, 3,
+  0x000e000c, 5,
+  0x22227777, 16,
+  0x12341234, 10,
+  0x0, 0
+};
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  unsigned int count = sizeof (data) / sizeof (data[0]) / 2;
+
+  uint32_t in32[count];
+  unsigned int out32[count];
+  for (unsigned int i = 0; i < count; ++i)
+    {
+      in32[i] = data[i * 2];
+      asm volatile ("" ::: "memory");
+    }
+  popcount_32 (out32, in32, count);
+  for (unsigned int i = 0; i < count; ++i)
+    if (out32[i] != data[i * 2 + 1])
+      abort ();
+
+  count /= 2;
+  uint64_t in64[count];
+  unsigned int out64[count];
+  for (unsigned int i = 0; i < count; ++i)
+    {
+      in64[i] = ((uint64_t) data[i * 4] << 32) | data[i * 4 + 2];
+      asm volatile ("" ::: "memory");
+    }
+  popcount_64 (out64, in64, count);
+  for (unsigned int i = 0; i < count; ++i)
+    if (out64[i] != data[i * 4 + 1] + data[i * 4 + 3])
+      abort ();
+
+  return 0;
+}