vect: Add a popcount fallback.
Commit Message
Hi,
This patch adds a fallback when the backend does not provide a popcount
implementation. The algorithm is the same one libgcc uses and the one
match.pd recognizes as a popcount idiom. __builtin_ctz and __builtin_ffs
can also rely on popcount, so I used the fallback for them as well.
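(For illustration only, not part of the patch: the classic identities that
let ctz/ffs be computed via popcount look roughly like the C below; the
vectorizer patterns need not use exactly this form.)

#include <stdint.h>

static inline int
ctz_via_popcount (uint32_t x)
{
  /* x & -x isolates the lowest set bit; subtracting 1 leaves a mask of
     the trailing zeros, whose popcount equals ctz (for x != 0).  */
  return __builtin_popcount ((x & -x) - 1);
}

static inline int
ffs_via_popcount (uint32_t x)
{
  /* ffs is ctz + 1 for nonzero inputs and 0 for zero.  */
  return x ? __builtin_popcount ((x & -x) - 1) + 1 : 0;
}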
Bootstrapped and regtested on x86, aarch64 and power10. Unfortunately
I don't have access to any architecture other than riscv that vectorizes
but does not have a vectorized popcount. I added to the selector all
vect_int targets for which a cursory grep for "expand.*popcount" yielded
no result.
Regards
Robin
gcc/ChangeLog:
* tree-vect-patterns.cc (vect_have_popcount_fallback): New
function.
(vect_generate_popcount_fallback): New function to emit
vectorized popcount fallback.
(vect_recog_ctz_ffs_pattern): Use fallback.
(vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-popcount-fallback.c: New test.
---
.../gcc.dg/vect/vect-popcount-fallback.c | 106 +++++++++++
gcc/tree-vect-patterns.cc | 172 ++++++++++++++++--
2 files changed, 267 insertions(+), 11 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
Comments
On Mon, Aug 7, 2023 at 10:20 PM Robin Dapp via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> This patch adds a fallback when the backend does not provide a popcount
> implementation. The algorithm is the same one libgcc uses, as well as
> match.pd for recognizing a popcount idiom. __builtin_ctz and __builtin_ffs
> can also rely on popcount so I used the fallback for them as well.
>
> Bootstrapped and regtested on x86, aarch64 and power10. Unfortunately
> I don't have access to any architecture other than riscv that vectorizes
> but does not have a vectorized popcount. I added all vect_int targets
> to the selector where a cursory grep "expand.*popcount" would yield no
> result.
Looks reasonable to me - I couldn't read from above whether you did
testing on riscv and thus verified the runtime correctness of the fallback?
If not may I suggest to force matching the pattern on a target you can
test for this purpose?
One note below ...
Thanks,
Richard.
> Regards
> Robin
>
> gcc/ChangeLog:
>
> * tree-vect-patterns.cc (vect_have_popcount_fallback): New
> function.
> (vect_generate_popcount_fallback): New function to emit
> vectorized popcount fallback.
> (vect_recog_ctz_ffs_pattern): Use fallback.
> (vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-popcount-fallback.c: New test.
> ---
> .../gcc.dg/vect/vect-popcount-fallback.c | 106 +++++++++++
> gcc/tree-vect-patterns.cc | 172 ++++++++++++++++--
> 2 files changed, 267 insertions(+), 11 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
> new file mode 100644
> index 00000000000..f6300f4ab35
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
> @@ -0,0 +1,106 @@
> +/* Check if we vectorize popcount when no expander is available. */
> +/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
> +/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } } */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdlib.h>
> +#include <assert.h>
> +#include <stdint-gcc.h>
> +
> +__attribute__ ((noipa))
> +void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_popcount (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ctz (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ffs (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_popcountll (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ctzll (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ffsll (a[i]);
> +}
> +
> +#define SZ 512
> +
> +__attribute__ ((optimize ("0")))
> +int main ()
> +{
> + uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
> + uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
> + uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
> + uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
> + uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
> + uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
> +
> + uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
> + uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
> + uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
> + uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
> + uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
> + uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
> +
> + for (int i = 0; i < SZ; i++)
> + {
> + int ia = i + 1;
> + a32pc[i] = ia * 1234567;
> + b32pc[i] = 0;
> + a32ctz[i] = ia * 1234567;
> + b32ctz[i] = 0;
> + a32ffs[i] = ia * 1234567;
> + b32ffs[i] = 0;
> + a64pc[i] = ia * 123456789ull;
> + b64pc[i] = 0;
> + a64ctz[i] = ia * 123456789ull;
> + b64ctz[i] = 0;
> + a64ffs[i] = ia * 123456789ull;
> + b64ffs[i] = 0;
> + }
> +
> + popc32 (b32pc, a32pc, SZ);
> + ctz32 (b32ctz, a32ctz, SZ);
> + ffs32 (b32ffs, a32ffs, SZ);
> + popc64 (b64pc, a64pc, SZ);
> + ctz64 (b64ctz, a64ctz, SZ);
> + ffs64 (b64ffs, a64ffs, SZ);
> +
> + for (int i = 0; i < SZ; i++)
> + {
> + assert (b32pc[i] == __builtin_popcount (a32pc[i]));
> + assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
> + assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
> + assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
> + assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
> + assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
> + }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index ef806e2346e..b812354b986 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -1782,6 +1782,122 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> return widen_abd_stmt;
> }
>
> +/* Return TRUE if we have the necessary operations to create a vectorized
> + popcount for type VEC_TYPE. */
> +
> +static bool
> +vect_have_popcount_fallback (tree vec_type)
> +{
> + return (optab_for_tree_code (RSHIFT_EXPR, vec_type, optab_scalar)
optab_vector would also work (vectorizable_shift will try both)
> + && optab_for_tree_code (PLUS_EXPR, vec_type, optab_default)
> + && optab_for_tree_code (MINUS_EXPR, vec_type, optab_default)
> + && optab_for_tree_code (BIT_AND_EXPR, vec_type, optab_default)
> + && optab_for_tree_code (MULT_EXPR, vec_type, optab_default));
... note this doesn't actually check the target can do these operations,
you'd have to look whether optab_handler (optab, TYPE_MODE (vec_type))
isn't CODE_FOR_nothing. I see we don't do this consistently though,
and the alternative is a known unsupported popcount.
Did you check whether we try popcount with DImode before using the
fallback for SImode? Or whether we try V2nSImode before falling
back to VnDImode? Note that if the target has popcountqi or hi then
we can end up pattern matching popcount for those modes, not sure
whether targets usually support vectorized those.
> +}
> +
> +/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
> + does (and match.pd recognizes). There are only 32-bit and 64-bit
> + variants.
> + It returns the generated gimple sequence of vector instructions with
> + type VEC_TYPE which is being attached to STMT_VINFO.
> + RHS is the unpromoted input value and LHS_TYPE is the output type.
> + RET_VAR is the address of an SSA variable that holds the result
> + of the last operation. It needs to be created before calling
> + this function and must have LHS_TYPE.
> +
> + int popcount64c (uint64_t x)
> + {
> + x -= (x >> 1) & 0x5555555555555555ULL;
> + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
> + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
> + return (x * 0x0101010101010101ULL) >> 56;
> + }
> +
> + int popcount32c (uint32_t x)
> + {
> + x -= (x >> 1) & 0x55555555;
> + x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
> + x = (x + (x >> 4)) & 0x0f0f0f0f;
> + return (x * 0x01010101) >> 24;
> + }
> + */
> +
> +static gimple*
> +vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> + vect_unpromoted_value rhs, tree lhs_type,
> + tree vec_type, tree *ret_var)
> +{
> + int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
> + bool is64 = bitsize == 64;
> +
> + tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
> + lhs_type, &rhs, vec_type);
> +
> + tree one_cst = build_one_cst (lhs_type);
> + tree two_cst = build_int_cst (lhs_type, 2);
> + tree four_cst = build_int_cst (lhs_type, 4);
> + tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
> +
> + tree c1 = build_int_cst (lhs_type,
> + is64 ? 0x5555555555555555ull : 0x55555555);
Hmm, looks like we miss a useful helper to produce an
integer constant with a repeated byte sequence? A
unsigned char buf[8];
memset (buf, val, 8);
c1 = native_interpret (...);
would do the trick but I guess we can have it cheaper using wide-int
directly? This must have come up before ...
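A minimal sketch of such a helper, in case it proves useful (the name
build_replicated_byte_cst is made up; it assumes native_interpret_expr
from fold-const.h and only handles single-byte repeats like 0x55, 0x33,
0x0f and 0x01):

static tree
build_replicated_byte_cst (tree type, unsigned char val)
{
  /* Fill a buffer of the type's size with the repeated byte and
     reinterpret it as a constant of TYPE.  */
  unsigned int len = GET_MODE_SIZE (SCALAR_INT_TYPE_MODE (type));
  unsigned char buf[16];
  gcc_assert (len <= sizeof (buf));
  memset (buf, val, len);
  return native_interpret_expr (type, buf, len);
}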
> + tree c2 = build_int_cst (lhs_type,
> + is64 ? 0x3333333333333333ull : 0x33333333);
> + tree c4 = build_int_cst (lhs_type,
> + is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
> + tree cm = build_int_cst (lhs_type,
> + is64 ? 0x0101010101010101ull : 0x01010101);
> +
> + gassign *g;
> +
> + tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
> +
> + return g;
> +}
> +
> /* Function vect_recog_ctz_ffs_pattern
>
> Try to find the following pattern:
> @@ -1894,8 +2010,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> }
> if ((ifnnew == IFN_LAST
> || (defined_at_zero && !defined_at_zero_new))
> - && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
> - OPTIMIZE_FOR_SPEED))
> + && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
> + OPTIMIZE_FOR_SPEED)
> + || vect_have_popcount_fallback (vec_rhs_type)))
> {
> ifnnew = IFN_POPCOUNT;
> defined_at_zero_new = true;
> @@ -1996,9 +2113,22 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
> /* Create B = .IFNNEW (A). */
> new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> - pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
> - gimple_call_set_lhs (pattern_stmt, new_var);
> - gimple_set_location (pattern_stmt, loc);
> + if (ifnnew == IFN_POPCOUNT
> + && !direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED))
> + {
> + gcc_assert (vect_have_popcount_fallback (vec_type));
> + vect_unpromoted_value un_rhs;
> + vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
> + pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs,
> + lhs_type, vec_type,
> + &new_var);
> + }
> + else
> + {
> + pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
> + gimple_call_set_lhs (pattern_stmt, new_var);
> + gimple_set_location (pattern_stmt, loc);
> + }
> *type_out = vec_type;
>
> if (sub)
> @@ -2042,6 +2172,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> return pattern_stmt;
> }
>
> +
> /* Function vect_recog_popcount_clz_ctz_ffs_pattern
>
> Try to find the following pattern:
> @@ -2226,12 +2357,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> if (!vec_type)
> return NULL;
>
> + bool popcount_fallback_p = false;
> +
> bool supported
> = direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
> if (!supported)
> switch (ifn)
> {
> case IFN_POPCOUNT:
> + if (vect_have_popcount_fallback (vec_type))
> + popcount_fallback_p = true;
> + break;
> case IFN_CLZ:
> return NULL;
> case IFN_FFS:
> @@ -2247,7 +2383,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> OPTIMIZE_FOR_SPEED))
> break;
> if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
> - OPTIMIZE_FOR_SPEED))
> + OPTIMIZE_FOR_SPEED)
> + || vect_have_popcount_fallback (vec_type))
> break;
> return NULL;
> default:
> @@ -2257,12 +2394,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
> call_stmt);
>
> - /* Create B = .POPCOUNT (A). */
> new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> - pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
> - gimple_call_set_lhs (pattern_stmt, new_var);
> - gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> - *type_out = vec_type;
> + if (!popcount_fallback_p)
> + {
> + /* Create B = .POPCOUNT (A). */
> + pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
> + gimple_call_set_lhs (pattern_stmt, new_var);
> + gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> + *type_out = vec_type;
> + }
> + else
> + {
> + pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
> + unprom_diff,
> + lhs_type, vec_type,
> + &new_var);
> + *type_out = vec_type;
> + /* For popcount we're done here. */
> + return pattern_stmt;
> + }
>
> if (dump_enabled_p ())
> dump_printf_loc (MSG_NOTE, vect_location,
> --
> 2.41.0
>
> Looks reasonable to me - I couldn't read from above whether you did
> testing on riscv and thus verified the runtime correctness of the fallback?
> If not may I suggest to force matching the pattern on a target you can
> test for this purpose?
I tested on riscv (manually and verified the run test) but didn't bootstrap.
The vector test suite (or autovec) is not yet enabled by default anyway but
that's going to change soon.
> ... note this doesn't actually check the target can do these operations,
> you'd have to look whether optab_handler (optab, TYPE_MODE (vec_type))
> isn't CODE_FOR_nothing. I see we don't do this consistently though,
> and the alternative is a known unsupported popcount.
Yes, agreed. I changed it to
static bool
vect_have_popcount_fallback (tree vec_type)
{
return ((target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_scalar)
|| target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_vector))
&& target_has_op_for_code (PLUS_EXPR, vec_type, optab_default)
&& target_has_op_for_code (MINUS_EXPR, vec_type, optab_default)
&& target_has_op_for_code (BIT_AND_EXPR, vec_type, optab_default)
&& target_has_op_for_code (MULT_EXPR, vec_type, optab_default));
}
target_has_vecop_for_code was already there further down, so I
repurposed it into the new helper:
+/* Return true iff the target has an optab implementing the operation
+ CODE on type VECTYPE using the optab subtype OPTAB_TYPE. */
+
+static bool
+target_has_op_for_code (tree_code code, tree vectype,
+ enum optab_subtype optab_type)
+{
+ optab optab = optab_for_tree_code (code, vectype, optab_type);
+ return optab
+ && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
+}
Changes attached.
> Did you check whether we try popcount with DImode before using the
> fallback for SImode? Or whether we try V2nSImode before falling
> back to VnDImode? Note that if the target has popcountqi or hi then
> we can end up pattern matching popcount for those modes, not sure
> whether targets usually support vectorized those.
I haven't observed cases where we vectorize a "worse" mode now.
At least aarch64 tries all modes for vectorization and compares costs
(starting with the widest mode IIRC) so I would expect the fallback
version to always have higher costs and not be selected if there
is a real popcount available. riscv also has VECT_COMPARE_COSTS.
Power has QImode and HImode vector popcounts, no VECT_COMPARE_COSTS
but the testsuite is unchanged FWIW. s390 is similar but I couldn't
test it. A problem would probably occur if a target provides
e.g. only popcountv16qi but we would emit a fallback for popcountv2di?
I'd hope there is no such target :D and if so it should use
VECT_COMPARE_COSTS?
> Hmm, looks like we miss a useful helper to produce an
> integer constant with a repeated byte sequence? A
>
> unsigned char buf[8];
> memset (buf, val, 8);
> c1 = native_interpret (...);
>
> would do the trick but I guess we can have it cheaper using wide-int
> directly? This must have come up before ...
I didn't find something comparable and that's probably due to the
lack of a proper search term. Also, I figured the 2-byte repeating
sequences might be trickier anyway and therefore kept it as is.
If you find it too cumbersome I can look for an alternative.
Right now it closely matches what the example C code says which
is not too bad IMHO.
Regards
Robin
From 03d7e953346b763bc3d0359d7d77b1f65ca05d46 Mon Sep 17 00:00:00 2001
From: Robin Dapp <rdapp@ventanamicro.com>
Date: Tue, 1 Aug 2023 22:05:09 +0200
Subject: [PATCH] vect: Add a popcount fallback.
This patch adds a fallback when the backend does not provide a popcount
implementation. The algorithm is the same one libgcc uses, as well as
match.pd for recognizing a popcount idiom.
gcc/ChangeLog:
* tree-vect-patterns.cc (vect_have_popcount_fallback): New
function.
(vect_generate_popcount_fallback): New function to emit
vectorized popcount fallback.
(vect_recog_ctz_ffs_pattern): Use fallback.
(vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-popcount-fallback.c: New test.
---
.../gcc.dg/vect/vect-popcount-fallback.c | 106 +++++++++
gcc/tree-vect-patterns.cc | 205 +++++++++++++++---
2 files changed, 286 insertions(+), 25 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
new file mode 100644
index 00000000000..c1d23257b8f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
@@ -0,0 +1,106 @@
+/* Check if we vectorize popcount when no expander is available. */
+/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
+/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } } */
+/* { dg-require-effective-target vect_int } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint-gcc.h>
+
+__attribute__ ((noipa))
+void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_popcount (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ctz (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ffs (a[i]);
+}
+
+__attribute__ ((noipa))
+void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_popcountll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ctzll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ffsll (a[i]);
+}
+
+#define SZ 512
+
+__attribute__ ((optimize ("0")))
+int main ()
+{
+ uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
+ uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
+ uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
+ uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
+ uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
+ uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
+
+ uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
+ uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
+ uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
+ uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
+ uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
+ uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
+
+ for (int i = 0; i < SZ; i++)
+ {
+ int ia = i + 1;
+ a32pc[i] = ia * 1234567;
+ b32pc[i] = 0;
+ a32ctz[i] = ia * 1234567;
+ b32ctz[i] = 0;
+ a32ffs[i] = ia * 1234567;
+ b32ffs[i] = 0;
+ a64pc[i] = ia * 123456789ull;
+ b64pc[i] = 0;
+ a64ctz[i] = ia * 123456789ull;
+ b64ctz[i] = 0;
+ a64ffs[i] = ia * 123456789ull;
+ b64ffs[i] = 0;
+ }
+
+ popc32 (b32pc, a32pc, SZ);
+ ctz32 (b32ctz, a32ctz, SZ);
+ ffs32 (b32ffs, a32ffs, SZ);
+ popc64 (b64pc, a64pc, SZ);
+ ctz64 (b64ctz, a64ctz, SZ);
+ ffs64 (b64ffs, a64ffs, SZ);
+
+ for (int i = 0; i < SZ; i++)
+ {
+ assert (b32pc[i] == __builtin_popcount (a32pc[i]));
+ assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
+ assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
+ assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
+ assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
+ assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index ef806e2346e..fdb0849fc3e 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1782,6 +1782,135 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return widen_abd_stmt;
}
+/* Return true iff the target has an optab implementing the operation
+ CODE on type VECTYPE using the optab subtype OPTAB_TYPE. */
+
+static bool
+target_has_op_for_code (tree_code code, tree vectype,
+ enum optab_subtype optab_type)
+{
+ optab optab = optab_for_tree_code (code, vectype, optab_type);
+ return optab
+ && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
+}
+
+/* Return TRUE if we have the necessary operations to create a vectorized
+ popcount for type VEC_TYPE. */
+
+static bool
+vect_have_popcount_fallback (tree vec_type)
+{
+ return ((target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_scalar)
+ || target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_vector))
+ && target_has_op_for_code (PLUS_EXPR, vec_type, optab_default)
+ && target_has_op_for_code (MINUS_EXPR, vec_type, optab_default)
+ && target_has_op_for_code (BIT_AND_EXPR, vec_type, optab_default)
+ && target_has_op_for_code (MULT_EXPR, vec_type, optab_default));
+}
+
+/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
+ does (and match.pd recognizes). There are only 32-bit and 64-bit
+ variants.
+ It returns the generated gimple sequence of vector instructions with
+ type VEC_TYPE which is being attached to STMT_VINFO.
+ RHS is the unpromoted input value and LHS_TYPE is the output type.
+ RET_VAR is the address of an SSA variable that holds the result
+ of the last operation. It needs to be created before calling
+ this function and must have LHS_TYPE.
+
+ int popcount64c (uint64_t x)
+ {
+ x -= (x >> 1) & 0x5555555555555555ULL;
+ x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+ x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+ return (x * 0x0101010101010101ULL) >> 56;
+ }
+
+ int popcount32c (uint32_t x)
+ {
+ x -= (x >> 1) & 0x55555555;
+ x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+ x = (x + (x >> 4)) & 0x0f0f0f0f;
+ return (x * 0x01010101) >> 24;
+ }
+ */
+
+static gimple*
+vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
+ vect_unpromoted_value rhs, tree lhs_type,
+ tree vec_type, tree *ret_var)
+{
+ int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
+ bool is64 = bitsize == 64;
+
+ tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
+ lhs_type, &rhs, vec_type);
+
+ tree one_cst = build_one_cst (lhs_type);
+ tree two_cst = build_int_cst (lhs_type, 2);
+ tree four_cst = build_int_cst (lhs_type, 4);
+ tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
+
+ tree c1 = build_int_cst (lhs_type,
+ is64 ? 0x5555555555555555ull : 0x55555555);
+ tree c2 = build_int_cst (lhs_type,
+ is64 ? 0x3333333333333333ull : 0x33333333);
+ tree c4 = build_int_cst (lhs_type,
+ is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
+ tree cm = build_int_cst (lhs_type,
+ is64 ? 0x0101010101010101ull : 0x01010101);
+
+ gassign *g;
+
+ tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
+
+ return g;
+}
+
/* Function vect_recog_ctz_ffs_pattern
Try to find the following pattern:
@@ -1894,8 +2023,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
}
if ((ifnnew == IFN_LAST
|| (defined_at_zero && !defined_at_zero_new))
- && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
- OPTIMIZE_FOR_SPEED))
+ && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
+ OPTIMIZE_FOR_SPEED)
+ || vect_have_popcount_fallback (vec_rhs_type)))
{
ifnnew = IFN_POPCOUNT;
defined_at_zero_new = true;
@@ -1996,9 +2126,23 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
/* Create B = .IFNNEW (A). */
new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
- pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
- gimple_call_set_lhs (pattern_stmt, new_var);
- gimple_set_location (pattern_stmt, loc);
+ if (ifnnew == IFN_POPCOUNT
+ && !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
+ OPTIMIZE_FOR_SPEED))
+ {
+ gcc_assert (vect_have_popcount_fallback (vec_type));
+ vect_unpromoted_value un_rhs;
+ vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
+ pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs,
+ lhs_type, vec_type,
+ &new_var);
+ }
+ else
+ {
+ pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
+ gimple_call_set_lhs (pattern_stmt, new_var);
+ gimple_set_location (pattern_stmt, loc);
+ }
*type_out = vec_type;
if (sub)
@@ -2042,6 +2186,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return pattern_stmt;
}
+
/* Function vect_recog_popcount_clz_ctz_ffs_pattern
Try to find the following pattern:
@@ -2226,12 +2371,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
if (!vec_type)
return NULL;
+ bool popcount_fallback_p = false;
+
bool supported
= direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
if (!supported)
switch (ifn)
{
case IFN_POPCOUNT:
+ if (vect_have_popcount_fallback (vec_type))
+ popcount_fallback_p = true;
+ break;
case IFN_CLZ:
return NULL;
case IFN_FFS:
@@ -2247,7 +2397,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
OPTIMIZE_FOR_SPEED))
break;
if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
- OPTIMIZE_FOR_SPEED))
+ OPTIMIZE_FOR_SPEED)
+ || vect_have_popcount_fallback (vec_type))
break;
return NULL;
default:
@@ -2257,12 +2408,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
call_stmt);
- /* Create B = .POPCOUNT (A). */
new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
- pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
- gimple_call_set_lhs (pattern_stmt, new_var);
- gimple_set_location (pattern_stmt, gimple_location (last_stmt));
- *type_out = vec_type;
+ if (!popcount_fallback_p)
+ {
+ /* Create B = .POPCOUNT (A). */
+ pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
+ gimple_call_set_lhs (pattern_stmt, new_var);
+ gimple_set_location (pattern_stmt, gimple_location (last_stmt));
+ *type_out = vec_type;
+ }
+ else
+ {
+ pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
+ unprom_diff,
+ lhs_type, vec_type,
+ &new_var);
+ *type_out = vec_type;
+ /* For popcount we're done here. */
+ return pattern_stmt;
+ }
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -4066,17 +4230,6 @@ vect_recog_vector_vector_shift_pattern (vec_info *vinfo,
return pattern_stmt;
}
-/* Return true iff the target has a vector optab implementing the operation
- CODE on type VECTYPE. */
-
-static bool
-target_has_vecop_for_code (tree_code code, tree vectype)
-{
- optab voptab = optab_for_tree_code (code, vectype, optab_vector);
- return voptab
- && optab_handler (voptab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
-}
-
/* Verify that the target has optabs of VECTYPE to perform all the steps
needed by the multiplication-by-immediate synthesis algorithm described by
ALG and VAR. If SYNTH_SHIFT_P is true ensure that vector addition is
@@ -4089,11 +4242,13 @@ target_supports_mult_synth_alg (struct algorithm *alg, mult_variant var,
if (alg->op[0] != alg_zero && alg->op[0] != alg_m)
return false;
- bool supports_vminus = target_has_vecop_for_code (MINUS_EXPR, vectype);
- bool supports_vplus = target_has_vecop_for_code (PLUS_EXPR, vectype);
+ bool supports_vminus = target_has_op_for_code (MINUS_EXPR, vectype,
+ optab_vector);
+ bool supports_vplus = target_has_op_for_code (PLUS_EXPR, vectype,
+ optab_vector);
if (var == negate_variant
- && !target_has_vecop_for_code (NEGATE_EXPR, vectype))
+ && !target_has_op_for_code (NEGATE_EXPR, vectype, optab_vector))
return false;
/* If we must synthesize shifts with additions make sure that vector
On Tue, Aug 8, 2023 at 10:55 AM Robin Dapp <rdapp.gcc@gmail.com> wrote:
>
> > Looks reasonable to me - I couldn't read from above whether you did
> > testing on riscv and thus verified the runtime correctness of the fallback?
> > If not may I suggest to force matching the pattern on a target you can
> > test for this purpose?
>
> I tested on riscv (manually and verified the run test) but didn't bootstrap.
> The vector test suite (or autovec) is not yet enabled by default anyway but
> that's going to change soon.
>
> > ... note this doesn't actually check the target can do these operations,
> > you'd have to look whether optab_handler (optab, TYPE_MODE (vec_type))
> > isn't CODE_FOR_nothing. I see we don't do this consistently though,
> > and the alternative is a known unsupported popcount.
>
> Yes, agreed. I changed it to
>
> static bool
> vect_have_popcount_fallback (tree vec_type)
> {
> return ((target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_scalar)
> || target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_vector))
> && target_has_op_for_code (PLUS_EXPR, vec_type, optab_default)
> && target_has_op_for_code (MINUS_EXPR, vec_type, optab_default)
> && target_has_op_for_code (BIT_AND_EXPR, vec_type, optab_default)
> && target_has_op_for_code (MULT_EXPR, vec_type, optab_default));
> }
>
> target_has_vecop_for_code was already there further down so I
> repurposed it that one
>
> +/* Return true iff the target has an optab implementing the operation
> + CODE on type VECTYPE using the optab subtype OPTAB_TYPE. */
> +
> +static bool
> +target_has_op_for_code (tree_code code, tree vectype,
> + enum optab_subtype optab_type)
> +{
> + optab optab = optab_for_tree_code (code, vectype, optab_type);
> + return optab
> + && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
> +}
>
> Changes attached.
>
> > Did you check whether we try popcount with DImode before using the
> > fallback for SImode? Or whether we try V2nSImode before falling
> > back to VnDImode? Note that if the target has popcountqi or hi then
> > we can end up pattern matching popcount for those modes, not sure
> > whether targets usually support vectorized those.
>
> I haven't observed cases where we vectorize a "worse" mode now.
> At least aarch64 tries all modes for vectorization and compares costs
> (starting with the widest mode IIRC) so I would expect the fallback
> version to always have higher costs and not be selected if there
> is a real popcount available. riscv also has VECT_COMPARE_COSTS.
>
> Power has QImode and HImode vector popcounts, no VECT_COMPARE_COSTS
> but the testsuite is unchanged FWIW. s390 is similar but I couldn't
> test it. A problem would probably occur if a target provides
> e.g. only popcountv16qi but we would emit a fallback for popcountv2di?
> I'd hope there is no such target :D and if so it should use
> VECT_COMPARE_COSTS?
Well, not sure how VECT_COMPARE_COSTS can help here, we either
get the pattern or vectorize the original function. There's no special handling
for popcount in vectorizable_call so all special cases are handled via patterns.
I was thinking of popcounthi via popcountsi and zero-extend / truncate but
also popcountdi via popcountsi and reducing even/odd SI results via a plus
to a single DI result. Targets without DI/TI popcount support but with
SI popcount support might exist, and for them this might be cheaper than
the generic open-coded scheme. But of course such a target could then
implement the DImode version with that trick itself.
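A scalar sketch of that idea (illustrative only; the vectorized variant
would popcount the same bits as V2nSImode and add even/odd SI lanes into
DI results):

#include <stdint.h>

static inline int
popcountdi_via_si (uint64_t x)
{
  /* Compose a 64-bit popcount from two 32-bit popcounts.  */
  return __builtin_popcount ((uint32_t) x)
	 + __builtin_popcount ((uint32_t) (x >> 32));
}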
>
> > Hmm, looks like we miss a useful helper to produce an
> > integer constant with a repeated byte sequence? A
> >
> > unsigned char buf[8];
> > memset (buf, val, 8);
> > c1 = native_interpret (...);
> >
> > would do the trick but I guess we can have it cheaper using wide-int
> > directly? This must have come up before ...
>
> I didn't find something comparable and that's probably due to the
> lack of a proper search term. Also, I figured the 2-byte repeating
> sequences might be trickier anyway and therefore kept it as is.
> If you find it too cumbersome I can look for an alternative.
> Right now it closely matches what the example C code says which
> is not too bad IMHO.
I agree with two cases it isn't too bad, note you probably get away
with using the full 64bit constant for both 64bit and 32bit, we simply
truncate it. Note rather than 'ull' we have the HOST_WIDE_INT_UC
macro which appends the appropriate suffix.
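Illustratively, with both suggestions the constant setup in
vect_generate_popcount_fallback could become (a sketch; it relies on
build_int_cst truncating to the precision of lhs_type in the 32-bit case,
so the is64 selection would go away):

  tree c1 = build_int_cst (lhs_type, HOST_WIDE_INT_UC (0x5555555555555555));
  tree c2 = build_int_cst (lhs_type, HOST_WIDE_INT_UC (0x3333333333333333));
  tree c4 = build_int_cst (lhs_type, HOST_WIDE_INT_UC (0x0f0f0f0f0f0f0f0f));
  tree cm = build_int_cst (lhs_type, HOST_WIDE_INT_UC (0x0101010101010101));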
The patch is OK with or without changing this detail.
Thanks,
Richard.
> Regards
> Robin
>
> From 03d7e953346b763bc3d0359d7d77b1f65ca05d46 Mon Sep 17 00:00:00 2001
> From: Robin Dapp <rdapp@ventanamicro.com>
> Date: Tue, 1 Aug 2023 22:05:09 +0200
> Subject: [PATCH] vect: Add a popcount fallback.
>
> This patch adds a fallback when the backend does not provide a popcount
> implementation. The algorithm is the same one libgcc uses, as well as
> match.pd for recognizing a popcount idiom.
>
> gcc/ChangeLog:
>
> * tree-vect-patterns.cc (vect_have_popcount_fallback): New
> function.
> (vect_generate_popcount_fallback): New function to emit
> vectorized popcount fallback.
> (vect_recog_ctz_ffs_pattern): Use fallback.
> (vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-popcount-fallback.c: New test.
> ---
> .../gcc.dg/vect/vect-popcount-fallback.c | 106 +++++++++
> gcc/tree-vect-patterns.cc | 205 +++++++++++++++---
> 2 files changed, 286 insertions(+), 25 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
> new file mode 100644
> index 00000000000..c1d23257b8f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
> @@ -0,0 +1,106 @@
> +/* Check if we vectorize popcount when no expander is available. */
> +/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
> +/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } } */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdlib.h>
> +#include <assert.h>
> +#include <stdint-gcc.h>
> +
> +__attribute__ ((noipa))
> +void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_popcount (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ctz (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ffs (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_popcountll (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ctzll (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = __builtin_ffsll (a[i]);
> +}
> +
> +#define SZ 512
> +
> +__attribute__ ((optimize ("0")))
> +int main ()
> +{
> + uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
> + uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
> + uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
> + uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
> + uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
> + uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
> +
> + uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
> + uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
> + uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
> + uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
> + uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
> + uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
> +
> + for (int i = 0; i < SZ; i++)
> + {
> + int ia = i + 1;
> + a32pc[i] = ia * 1234567;
> + b32pc[i] = 0;
> + a32ctz[i] = ia * 1234567;
> + b32ctz[i] = 0;
> + a32ffs[i] = ia * 1234567;
> + b32ffs[i] = 0;
> + a64pc[i] = ia * 123456789ull;
> + b64pc[i] = 0;
> + a64ctz[i] = ia * 123456789ull;
> + b64ctz[i] = 0;
> + a64ffs[i] = ia * 123456789ull;
> + b64ffs[i] = 0;
> + }
> +
> + popc32 (b32pc, a32pc, SZ);
> + ctz32 (b32ctz, a32ctz, SZ);
> + ffs32 (b32ffs, a32ffs, SZ);
> + popc64 (b64pc, a64pc, SZ);
> + ctz64 (b64ctz, a64ctz, SZ);
> + ffs64 (b64ffs, a64ffs, SZ);
> +
> + for (int i = 0; i < SZ; i++)
> + {
> + assert (b32pc[i] == __builtin_popcount (a32pc[i]));
> + assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
> + assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
> + assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
> + assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
> + assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
> + }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } } */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index ef806e2346e..fdb0849fc3e 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -1782,6 +1782,135 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> return widen_abd_stmt;
> }
>
> +/* Return true iff the target has an optab implementing the operation
> + CODE on type VECTYPE using the optab subtype OPTAB_TYPE. */
> +
> +static bool
> +target_has_op_for_code (tree_code code, tree vectype,
> + enum optab_subtype optab_type)
> +{
> + optab optab = optab_for_tree_code (code, vectype, optab_type);
> + return optab
> + && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
> +}
> +
> +/* Return TRUE if we have the necessary operations to create a vectorized
> + popcount for type VEC_TYPE. */
> +
> +static bool
> +vect_have_popcount_fallback (tree vec_type)
> +{
> + return ((target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_scalar)
> + || target_has_op_for_code (RSHIFT_EXPR, vec_type, optab_vector))
> + && target_has_op_for_code (PLUS_EXPR, vec_type, optab_default)
> + && target_has_op_for_code (MINUS_EXPR, vec_type, optab_default)
> + && target_has_op_for_code (BIT_AND_EXPR, vec_type, optab_default)
> + && target_has_op_for_code (MULT_EXPR, vec_type, optab_default));
> +}
> +
> +/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
> + does (and match.pd recognizes). There are only 32-bit and 64-bit
> + variants.
> + It returns the generated gimple sequence of vector instructions with
> + type VEC_TYPE which is being attached to STMT_VINFO.
> + RHS is the unpromoted input value and LHS_TYPE is the output type.
> + RET_VAR is the address of an SSA variable that holds the result
> + of the last operation. It needs to be created before calling
> + this function and must have LHS_TYPE.
> +
> + int popcount64c (uint64_t x)
> + {
> + x -= (x >> 1) & 0x5555555555555555ULL;
> + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
> + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
> + return (x * 0x0101010101010101ULL) >> 56;
> + }
> +
> + int popcount32c (uint32_t x)
> + {
> + x -= (x >> 1) & 0x55555555;
> + x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
> + x = (x + (x >> 4)) & 0x0f0f0f0f;
> + return (x * 0x01010101) >> 24;
> + }
> + */
> +
> +static gimple*
> +vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> + vect_unpromoted_value rhs, tree lhs_type,
> + tree vec_type, tree *ret_var)
> +{
> + int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
> + bool is64 = bitsize == 64;
> +
> + tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
> + lhs_type, &rhs, vec_type);
> +
> + tree one_cst = build_one_cst (lhs_type);
> + tree two_cst = build_int_cst (lhs_type, 2);
> + tree four_cst = build_int_cst (lhs_type, 4);
> + tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
> +
> + tree c1 = build_int_cst (lhs_type,
> + is64 ? 0x5555555555555555ull : 0x55555555);
> + tree c2 = build_int_cst (lhs_type,
> + is64 ? 0x3333333333333333ull : 0x33333333);
> + tree c4 = build_int_cst (lhs_type,
> + is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
> + tree cm = build_int_cst (lhs_type,
> + is64 ? 0x0101010101010101ull : 0x01010101);
> +
> + gassign *g;
> +
> + tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
> + g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
> + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> + g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
> +
> + return g;
> +}
> +
> /* Function vect_recog_ctz_ffs_pattern
>
> Try to find the following pattern:
> @@ -1894,8 +2023,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> }
> if ((ifnnew == IFN_LAST
> || (defined_at_zero && !defined_at_zero_new))
> - && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
> - OPTIMIZE_FOR_SPEED))
> + && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
> + OPTIMIZE_FOR_SPEED)
> + || vect_have_popcount_fallback (vec_rhs_type)))
> {
> ifnnew = IFN_POPCOUNT;
> defined_at_zero_new = true;
> @@ -1996,9 +2126,23 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>
> /* Create B = .IFNNEW (A). */
> new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> - pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
> - gimple_call_set_lhs (pattern_stmt, new_var);
> - gimple_set_location (pattern_stmt, loc);
> + if (ifnnew == IFN_POPCOUNT
> + && !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
> + OPTIMIZE_FOR_SPEED))
> + {
> + gcc_assert (vect_have_popcount_fallback (vec_type));
> + vect_unpromoted_value un_rhs;
> + vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
> + pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs,
> + lhs_type, vec_type,
> + &new_var);
> + }
> + else
> + {
> + pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
> + gimple_call_set_lhs (pattern_stmt, new_var);
> + gimple_set_location (pattern_stmt, loc);
> + }
> *type_out = vec_type;
>
> if (sub)
> @@ -2042,6 +2186,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> return pattern_stmt;
> }
>
> +
> /* Function vect_recog_popcount_clz_ctz_ffs_pattern
>
> Try to find the following pattern:
> @@ -2226,12 +2371,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> if (!vec_type)
> return NULL;
>
> + bool popcount_fallback_p = false;
> +
> bool supported
> = direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
> if (!supported)
> switch (ifn)
> {
> case IFN_POPCOUNT:
> + if (vect_have_popcount_fallback (vec_type))
> + popcount_fallback_p = true;
> + break;
> case IFN_CLZ:
> return NULL;
> case IFN_FFS:
> @@ -2247,7 +2397,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> OPTIMIZE_FOR_SPEED))
> break;
> if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
> - OPTIMIZE_FOR_SPEED))
> + OPTIMIZE_FOR_SPEED)
> + || vect_have_popcount_fallback (vec_type))
> break;
> return NULL;
> default:
> @@ -2257,12 +2408,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
> call_stmt);
>
> - /* Create B = .POPCOUNT (A). */
> new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> - pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
> - gimple_call_set_lhs (pattern_stmt, new_var);
> - gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> - *type_out = vec_type;
> + if (!popcount_fallback_p)
> + {
> + /* Create B = .POPCOUNT (A). */
> + pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
> + gimple_call_set_lhs (pattern_stmt, new_var);
> + gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> + *type_out = vec_type;
> + }
> + else
> + {
> + pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
> + unprom_diff,
> + lhs_type, vec_type,
> + &new_var);
> + *type_out = vec_type;
> + /* For popcount we're done here. */
> + return pattern_stmt;
> + }
>
> if (dump_enabled_p ())
> dump_printf_loc (MSG_NOTE, vect_location,
> @@ -4066,17 +4230,6 @@ vect_recog_vector_vector_shift_pattern (vec_info *vinfo,
> return pattern_stmt;
> }
>
> -/* Return true iff the target has a vector optab implementing the operation
> - CODE on type VECTYPE. */
> -
> -static bool
> -target_has_vecop_for_code (tree_code code, tree vectype)
> -{
> - optab voptab = optab_for_tree_code (code, vectype, optab_vector);
> - return voptab
> - && optab_handler (voptab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
> -}
> -
> /* Verify that the target has optabs of VECTYPE to perform all the steps
> needed by the multiplication-by-immediate synthesis algorithm described by
> ALG and VAR. If SYNTH_SHIFT_P is true ensure that vector addition is
> @@ -4089,11 +4242,13 @@ target_supports_mult_synth_alg (struct algorithm *alg, mult_variant var,
> if (alg->op[0] != alg_zero && alg->op[0] != alg_m)
> return false;
>
> - bool supports_vminus = target_has_vecop_for_code (MINUS_EXPR, vectype);
> - bool supports_vplus = target_has_vecop_for_code (PLUS_EXPR, vectype);
> + bool supports_vminus = target_has_op_for_code (MINUS_EXPR, vectype,
> + optab_vector);
> + bool supports_vplus = target_has_op_for_code (PLUS_EXPR, vectype,
> + optab_vector);
>
> if (var == negate_variant
> - && !target_has_vecop_for_code (NEGATE_EXPR, vectype))
> + && !target_has_op_for_code (NEGATE_EXPR, vectype, optab_vector))
> return false;
>
> /* If we must synthesize shifts with additions make sure that vector
> --
> 2.41.0
>
> Well, not sure how VECT_COMPARE_COSTS can help here, we either
> get the pattern or vectorize the original function. There's no special handling
> for popcount in vectorizable_call so all special cases are handled via patterns.
> I was thinking of popcounthi via popcountsi and zero-extend / truncate but
> also popcountdi via popcountsi and reducing even/odd SI results via a plus
> to a single DI result. It might be that targets without DI/TI popcount support
> but SI popcount support might exist and that this might be cheaper than
> the generic open-coded scheme. But of course such target could then
> implement the DImode version with that trick itself.
Ah, then I misunderstood. Yes, that would be a better fallback option.
A thing for my "spare time" pile :)
Btw another thing I noticed:
/* Input and output of .POPCOUNT should be same-precision integer. */
if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
return NULL;
This prevents us from vectorizing e.g.
(uint64_t)__builtin_popcount(uint32_t). It appears to be an
unnecessary restriction as all types should be able to hold a popcount
result (as long as TYPE_PRECISION > 6) if the result is properly
converted? Maybe it complicates the fallback handling but in general
we should be fine?
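For illustration, the shape in question (hypothetical function; with the
current check the pattern is rejected because the uint32_t input and the
uint64_t lhs differ in precision):

#include <stdint.h>

void
popc_widen (uint64_t *restrict dst, uint32_t *restrict a, int n)
{
  for (int i = 0; i < n; i++)
    /* popcount returns int; widening the result to uint64_t makes
       lhs_type 64 bits while the unpromoted input is 32 bits.  */
    dst[i] = (uint64_t) __builtin_popcount (a[i]);
}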
> I agree with two cases it isn't too bad, note you probably get away
> with using the full 64bit constant for both 64bit and 32bit, we simply
> truncate it. Note rather than 'ull' we have the HOST_WIDE_INT_UC
> macro which appends the appropriate suffix.
>
> The patch is OK with or without changing this detail.
Thanks, changed to the full constant. Going to push after bootstrap
and testsuite runs.
Regards
Robin
On Tue, Aug 8, 2023 at 1:37 PM Robin Dapp <rdapp.gcc@gmail.com> wrote:
>
> > Well, not sure how VECT_COMPARE_COSTS can help here, we either
> > get the pattern or vectorize the original function. There's no special handling
> > for popcount in vectorizable_call so all special cases are handled via patterns.
> > I was thinking of popcounthi via popcountsi and zero-extend / truncate but
> > also popcountdi via popcountsi and reducing even/odd SI results via a plus
> > to a single DI result. It might be that targets without DI/TI popcount support
> > but SI popcount support might exist and that this might be cheaper than
> > the generic open-coded scheme. But of course such target could then
> > implement the DImode version with that trick itself.
>
> Ah, then I misunderstood. Yes, that would be a better fallback option.
> A thing for my "spare time" pile :)
>
> Btw another thing I noticed:
>
> /* Input and output of .POPCOUNT should be same-precision integer. */
> if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
> return NULL;
>
> This prevents us from vectorizing i.e.
> (uint64_t)__builtin_popcount(uint32_t). It appears like an
> unnecessary restriction as all types should be able to hold a popcount
> result (as long as TYPE_PRECISION > 6) if the result is properly
> converted? Maybe it complicates the fallback handling but in general
> we should be fine?
Hmm, the conversion should be a separate statement so I wonder
why it would go wrong?
Richard.
>
> > I agree with two cases it isn't too bad, note you probably get away
> > with using the full 64bit constant for both 64bit and 32bit, we simply
> > truncate it. Note rather than 'ull' we have the HOST_WIDE_INT_UC
> > macro which appends the appropriate suffix.
> >
> > The patch is OK with or without changing this detail.
>
> Thanks, changed to the full constant. Going to push after bootstrap
> and testsuite runs.
>
> Regards
> Robin
> Hmm, the conversion should be a separate statement so I wonder
> why it would go wrong?
It is indeed. Yet, lhs_type is the lhs type of the conversion
and not the call and consequently we compare the precision of
the converted type with the popcount input.
So we should probably rather do something like:
+ tree call_lhs = gimple_call_lhs (call_stmt);
+
/* Input and output of .POPCOUNT should be same-precision integer. */
- if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
+ if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (TREE_TYPE (call_lhs)))
return NULL;
Regards
Robin
On Tue, Aug 8, 2023 at 3:06 PM Robin Dapp <rdapp.gcc@gmail.com> wrote:
>
> > Hmm, the conversion should be a separate statement so I wonder
> > why it would go wrong?
>
> It is indeed. Yet, lhs_type is the lhs type of the conversion
> and not the call and consequently we compare the precision of
> the converted type with the popcount input.
>
> So we should probably rather do something like:
>
> + tree call_lhs = gimple_call_lhs (call_stmt);
> +
> /* Input and output of .POPCOUNT should be same-precision integer. */
> - if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
> + if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (TREE_TYPE (call_lhs)))
> return NULL;
We seem to be looking at promotions of the call argument, lhs_type
is the same as the type of the call LHS. But the comment mentions .POPCOUNT
and the following code also handles others, so maybe handling should be
moved. Also when we look to vectorize popcount (x) instead of popcount((T)x)
we can simply promote the result accordingly.
It looks like vect_recog_popcount_clz_ctz_ffs_pattern is specifically for
the conversions, so your fallback should possibly apply even when not
matching them.
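For concreteness, the kind of loop I mean (purely illustrative, not
from the testsuite): the result stays an int, so there is no
conversion statement for the pattern to anchor on and the fallback is
never reached:

  #include <stdint.h>

  /* Illustrative only: no cast of the popcount result, so
     vect_recog_popcount_clz_ctz_ffs_pattern does not match.  */
  void popc_noconv (int *restrict dst, uint32_t *restrict a, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = __builtin_popcount (a[i]);
  }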
> Regards
> Robin
On 8/8/23 02:55, Robin Dapp via Gcc-patches wrote:
>> Looks reasonable to me - I couldn't read from above whether you did
>> testing on riscv and thus verified the runtime correctness of the fallback?
>> If not may I suggest to force matching the pattern on a target you can
>> test for this purpose?
>
> I tested on riscv (manually and verified the run test) but didn't bootstrap.
> The vector test suite (or autovec) is not yet enabled by default anyway but
> that's going to change soon.
Presumably this is an alternative to the approach Juzhe posted a week or
two ago and ultimately dropped?
Jeff
> Presumably this is an alternative to the approach Juzhe posted a week
> or two ago and ultimately dropped?
Yeah, I figured having a generic fallback could help more targets.
We can still have a better expander if we see the need.
Regards
Robin
> We seem to be looking at promotions of the call argument, lhs_type
> is the same as the type of the call LHS. But the comment mentions .POPCOUNT
> and the following code also handles others, so maybe handling should be
> moved. Also when we look to vectorize popcount (x) instead of popcount((T)x)
> we can simply promote the result accordingly.
IMHO lhs_type is the type of the conversion
lhs_oprnd = gimple_assign_lhs (last_stmt);
lhs_type = TREE_TYPE (lhs_oprnd);
and rhs/unprom_diff has the type of the call's input argument
rhs_oprnd = gimple_call_arg (call_stmt, 0);
vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
So we can potentially have
T0 arg
T1 in = (T1)arg
T2 ret = __builtin_popcount (in)
T3 lhs = (T3)ret
and we're checking if precision (T0) == precision (T3).
This will never be true for a proper __builtin_popcountll except if
the return value is cast to uint64_t (which I just happened to do
in my test...). Therefore it still doesn't really make sense to me.
Interestingly though, it helps for an aarch64 __builtin_popcountll
testcase where we abort here and then manage to vectorize via
vectorizable_call. When we skip this check, recognition succeeds
and replaces the call with the pattern. Then scalar costs are lower
than in the vectorizable_call case because __builtin_popcountll is
not STMT_VINFO_RELEVANT_P anymore (not live or so?).
Then, vectorization costs are too high compared to the wrong scalar
costs and we don't vectorize... Odd, might require fixing separately.
We might need to calculate the scalar costs in advance?
> It looks like vect_recog_popcount_clz_ctz_ffs_pattern is specifically for
> the conversions, so your fallback should possibly apply even when not
> matching them.
Mhm, yes, it appears to only match when the return value is cast to
something other than an int.  So we'd need a fallback in vectorizable_call?
It would potentially look a bit out of place there, handling only
popcount and not ctz, clz, ...  Not sure it is worth it then?
Regards
Robin
On Wed, Aug 9, 2023 at 12:23 PM Robin Dapp <rdapp.gcc@gmail.com> wrote:
>
> > We seem to be looking at promotions of the call argument, lhs_type
> > is the same as the type of the call LHS. But the comment mentions .POPCOUNT
> > and the following code also handles others, so maybe handling should be
> > moved. Also when we look to vectorize popcount (x) instead of popcount((T)x)
> > we can simply promote the result accordingly.
>
> IMHO lhs_type is the type of the conversion
>
> lhs_oprnd = gimple_assign_lhs (last_stmt);
> lhs_type = TREE_TYPE (lhs_oprnd);
>
> and rhs/unprom_diff has the type of the call's input argument
>
> rhs_oprnd = gimple_call_arg (call_stmt, 0);
> vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
>
> So we can potentially have
> T0 arg
> T1 in = (T1)arg
> T2 ret = __builtin_popcount (in)
> T3 lhs = (T3)ret
>
> and we're checking if precision (T0) == precision (T3).
Looks like so.  Note T1 == T2.  What we're really after is moving
T1/T2, and with it the popcount variant actually used, closer to
T0/T3: if, say, T0 were 'char' and T3 were 'long' we could still use
popcountqi and then widen to T3 (or the other way around).  So yes,
I don't think requiring that T0 and T3 have equal precision is
necessary.
> This will never be true for a proper __builtin_popcountll except if
> the return value is cast to uint64_t (which I just happened to do
> in my test...). Therefore it still doesn't really make sense to me.
>
> Interestingly though, it helps for an aarch64 __builtin_popcountll
> testcase where we abort here and then manage to vectorize via
> vectorizable_call. When we skip this check, recognition succeeds
> and replaces the call with the pattern. Then scalar costs are lower
> than in the vectorizable_call case because __builtin_popcountll is
> not STMT_VINFO_RELEVANT_P anymore (not live or so?).
> Then, vectorization costs are too high compared to the wrong scalar
> costs and we don't vectorize... Odd, might require fixing separately.
> We might need to calculate the scalar costs in advance?
>
> > It looks like vect_recog_popcount_clz_ctz_ffs_pattern is specifically for
> > the conversions, so your fallback should possibly apply even when not
> > matching them.
>
> Mhm, yes, it appears to only match when the return value is cast to
> something other than an int.  So we'd need a fallback in vectorizable_call?
> It would potentially look a bit out of place there, handling only
> popcount and not ctz, clz, ...  Not sure it is worth it then?
I'd keep the handling as pattern just also match on popcount directly
when not converted.
>
> Regards
> Robin
>
new file mode 100644
@@ -0,0 +1,106 @@
+/* Check if we vectorize popcount when no expander is available. */
+/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
+/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } } */
+/* { dg-require-effective-target vect_int } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint-gcc.h>
+
+__attribute__ ((noipa))
+void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_popcount (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ctz (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ffs (a[i]);
+}
+
+__attribute__ ((noipa))
+void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_popcountll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ctzll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ffsll (a[i]);
+}
+
+#define SZ 512
+
+__attribute__ ((optimize ("0")))
+int main ()
+{
+ uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
+ uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
+ uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
+ uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
+ uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
+ uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
+
+ uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
+ uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
+ uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
+ uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
+ uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
+ uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
+
+ for (int i = 0; i < SZ; i++)
+ {
+ int ia = i + 1;
+ a32pc[i] = ia * 1234567;
+ b32pc[i] = 0;
+ a32ctz[i] = ia * 1234567;
+ b32ctz[i] = 0;
+ a32ffs[i] = ia * 1234567;
+ b32ffs[i] = 0;
+ a64pc[i] = ia * 123456789ull;
+ b64pc[i] = 0;
+ a64ctz[i] = ia * 123456789ull;
+ b64ctz[i] = 0;
+ a64ffs[i] = ia * 123456789ull;
+ b64ffs[i] = 0;
+ }
+
+ popc32 (b32pc, a32pc, SZ);
+ ctz32 (b32ctz, a32ctz, SZ);
+ ffs32 (b32ffs, a32ffs, SZ);
+ popc64 (b64pc, a64pc, SZ);
+ ctz64 (b64ctz, a64ctz, SZ);
+ ffs64 (b64ffs, a64ffs, SZ);
+
+ for (int i = 0; i < SZ; i++)
+ {
+ assert (b32pc[i] == __builtin_popcount (a32pc[i]));
+ assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
+ assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
+ assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
+ assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
+ assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
@@ -1782,6 +1782,122 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return widen_abd_stmt;
}
+/* Return TRUE if we have the necessary operations to create a vectorized
+ popcount for type VEC_TYPE. */
+
+static bool
+vect_have_popcount_fallback (tree vec_type)
+{
+ return (optab_for_tree_code (RSHIFT_EXPR, vec_type, optab_scalar)
+ && optab_for_tree_code (PLUS_EXPR, vec_type, optab_default)
+ && optab_for_tree_code (MINUS_EXPR, vec_type, optab_default)
+ && optab_for_tree_code (BIT_AND_EXPR, vec_type, optab_default)
+ && optab_for_tree_code (MULT_EXPR, vec_type, optab_default));
+}
+
+/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
+ does (and match.pd recognizes). There are only 32-bit and 64-bit
+ variants.
+ It returns the generated gimple sequence of vector instructions with
+ type VEC_TYPE which is being attached to STMT_VINFO.
+ RHS is the unpromoted input value and LHS_TYPE is the output type.
+ RET_VAR is the address of an SSA variable that holds the result
+ of the last operation. It needs to be created before calling
+ this function and must have LHS_TYPE.
+
+ int popcount64c (uint64_t x)
+ {
+ x -= (x >> 1) & 0x5555555555555555ULL;
+ x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+ x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+ return (x * 0x0101010101010101ULL) >> 56;
+ }
+
+ int popcount32c (uint32_t x)
+ {
+ x -= (x >> 1) & 0x55555555;
+ x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+ x = (x + (x >> 4)) & 0x0f0f0f0f;
+ return (x * 0x01010101) >> 24;
+ }
+ */
+
+static gimple*
+vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
+ vect_unpromoted_value rhs, tree lhs_type,
+ tree vec_type, tree *ret_var)
+{
+ int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
+ bool is64 = bitsize == 64;
+
+ tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
+ lhs_type, &rhs, vec_type);
+
+ tree one_cst = build_one_cst (lhs_type);
+ tree two_cst = build_int_cst (lhs_type, 2);
+ tree four_cst = build_int_cst (lhs_type, 4);
+ tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
+
+ tree c1 = build_int_cst (lhs_type,
+ is64 ? 0x5555555555555555ull : 0x55555555);
+ tree c2 = build_int_cst (lhs_type,
+ is64 ? 0x3333333333333333ull : 0x33333333);
+ tree c4 = build_int_cst (lhs_type,
+ is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
+ tree cm = build_int_cst (lhs_type,
+ is64 ? 0x0101010101010101ull : 0x01010101);
+
+ gassign *g;
+
+ tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
+
+ return g;
+}
+
/* Function vect_recog_ctz_ffs_pattern
Try to find the following pattern:
@@ -1894,8 +2010,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
}
if ((ifnnew == IFN_LAST
|| (defined_at_zero && !defined_at_zero_new))
- && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
- OPTIMIZE_FOR_SPEED))
+ && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
+ OPTIMIZE_FOR_SPEED)
+ || vect_have_popcount_fallback (vec_rhs_type)))
{
ifnnew = IFN_POPCOUNT;
defined_at_zero_new = true;
@@ -1996,9 +2113,22 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
/* Create B = .IFNNEW (A). */
new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
- pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
- gimple_call_set_lhs (pattern_stmt, new_var);
- gimple_set_location (pattern_stmt, loc);
+ if (ifnnew == IFN_POPCOUNT
+ && !direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED))
+ {
+ gcc_assert (vect_have_popcount_fallback (vec_type));
+ vect_unpromoted_value un_rhs;
+ vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
+ pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs,
+ lhs_type, vec_type,
+ &new_var);
+ }
+ else
+ {
+ pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
+ gimple_call_set_lhs (pattern_stmt, new_var);
+ gimple_set_location (pattern_stmt, loc);
+ }
*type_out = vec_type;
if (sub)
@@ -2042,6 +2172,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return pattern_stmt;
}
+
/* Function vect_recog_popcount_clz_ctz_ffs_pattern
Try to find the following pattern:
@@ -2226,12 +2357,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
if (!vec_type)
return NULL;
+ bool popcount_fallback_p = false;
+
bool supported
= direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
if (!supported)
switch (ifn)
{
case IFN_POPCOUNT:
+ if (vect_have_popcount_fallback (vec_type))
+ popcount_fallback_p = true;
+ break;
case IFN_CLZ:
return NULL;
case IFN_FFS:
@@ -2247,7 +2383,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
OPTIMIZE_FOR_SPEED))
break;
if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
- OPTIMIZE_FOR_SPEED))
+ OPTIMIZE_FOR_SPEED)
+ || vect_have_popcount_fallback (vec_type))
break;
return NULL;
default:
@@ -2257,12 +2394,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
call_stmt);
- /* Create B = .POPCOUNT (A). */
new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
- pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
- gimple_call_set_lhs (pattern_stmt, new_var);
- gimple_set_location (pattern_stmt, gimple_location (last_stmt));
- *type_out = vec_type;
+ if (!popcount_fallback_p)
+ {
+ /* Create B = .POPCOUNT (A). */
+ pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
+ gimple_call_set_lhs (pattern_stmt, new_var);
+ gimple_set_location (pattern_stmt, gimple_location (last_stmt));
+ *type_out = vec_type;
+ }
+ else
+ {
+ pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
+ unprom_diff,
+ lhs_type, vec_type,
+ &new_var);
+ *type_out = vec_type;
+ /* For popcount we're done here. */
+ return pattern_stmt;
+ }
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,