[5/8] middle-end: Add cltz_complement idiom recognition
Checks
Commit Message
This recognises patterns of the form:
while (n) { n >>= 1 }
This patch results in improved (but still suboptimal) codegen:
foo (unsigned int b) {
int c = 0;
while (b) {
b >>= 1;
c++;
}
return c;
}
foo:
.LFB11:
.cfi_startproc
cbz w0, .L3
clz w1, w0
tst x0, 1
mov w0, 32
sub w0, w0, w1
csel w0, w0, wzr, ne
ret
The conditional is unnecessary. phiopt could recognise a redundant csel
(using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
clz call, but it cannot recognise the redunancy when the input is (e.g.)
(32 - clz).
I could perhaps extend this function to recognise this pattern in a later
patch, if this is a good place to recognise more patterns.
gcc/ChangeLog:
* tree-scalar-evolution.cc (expression_expensive_p): Add checks
for c[lt]z optabs.
* tree-ssa-loop-niter.cc (build_cltz_expr): New.
(number_of_iterations_cltz_complement): New.
(number_of_iterations_bitcount): Add call to the above.
gcc/testsuite/ChangeLog:
* lib/target-supports.exp (check_effective_target_clz)
(check_effective_target_clzl, check_effective_target_clzll)
(check_effective_target_ctz, check_effective_target_clzl)
(check_effective_target_ctzll): New.
* gcc.dg/tree-ssa/cltz-complement-max.c: New test.
* gcc.dg/tree-ssa/clz-complement-char.c: New test.
* gcc.dg/tree-ssa/clz-complement-int.c: New test.
* gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
* gcc.dg/tree-ssa/clz-complement-long.c: New test.
* gcc.dg/tree-ssa/ctz-complement-char.c: New test.
* gcc.dg/tree-ssa/ctz-complement-int.c: New test.
* gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
* gcc.dg/tree-ssa/ctz-complement-long.c: New test.
--
Comments
On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This recognises patterns of the form:
> while (n) { n >>= 1 }
>
> This patch results in improved (but still suboptimal) codegen:
>
> foo (unsigned int b) {
> int c = 0;
>
> while (b) {
> b >>= 1;
> c++;
> }
>
> return c;
> }
>
> foo:
> .LFB11:
> .cfi_startproc
> cbz w0, .L3
> clz w1, w0
> tst x0, 1
> mov w0, 32
> sub w0, w0, w1
> csel w0, w0, wzr, ne
> ret
>
> The conditional is unnecessary. phiopt could recognise a redundant csel
> (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> clz call, but it cannot recognise the redunancy when the input is (e.g.)
> (32 - clz).
>
> I could perhaps extend this function to recognise this pattern in a later
> patch, if this is a good place to recognise more patterns.
>
> gcc/ChangeLog:
>
> * tree-scalar-evolution.cc (expression_expensive_p): Add checks
> for c[lt]z optabs.
> * tree-ssa-loop-niter.cc (build_cltz_expr): New.
> (number_of_iterations_cltz_complement): New.
> (number_of_iterations_bitcount): Add call to the above.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp (check_effective_target_clz)
> (check_effective_target_clzl, check_effective_target_clzll)
> (check_effective_target_ctz, check_effective_target_clzl)
> (check_effective_target_ctzll): New.
> * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
> * gcc.dg/tree-ssa/clz-complement-char.c: New test.
> * gcc.dg/tree-ssa/clz-complement-int.c: New test.
> * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
> * gcc.dg/tree-ssa/clz-complement-long.c: New test.
> * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
> * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
> * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
> * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
>
>
> --
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1a29ca52e42e50822e4e3213b2cb008b766d0318
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
> @@ -0,0 +1,60 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int clz_complement_count1 (unsigned char b) {
> + int c = 0;
> +
> + while (b) {
> + b >>= 1;
> + c++;
> + }
> + if (c <= PREC)
> + return 0;
> + else
> + return 34567;
> +}
> +
> +int clz_complement_count2 (unsigned char b) {
> + int c = 0;
> +
> + while (b) {
> + b >>= 1;
> + c++;
> + }
> + if (c <= PREC - 1)
> + return 0;
> + else
> + return 76543;
> +}
> +
> +int ctz_complement_count1 (unsigned char b) {
> + int c = 0;
> +
> + while (b) {
> + b <<= 1;
> + c++;
> + }
> + if (c <= PREC)
> + return 0;
> + else
> + return 23456;
> +}
> +
> +int ctz_complement_count2 (unsigned char b) {
> + int c = 0;
> +
> + while (b) {
> + b <<= 1;
> + c++;
> + }
> + if (c <= PREC - 1)
> + return 0;
> + else
> + return 65432;
> +}
> +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> + int c = 0;
> +
> + while (b) {
> + b >>= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(5) != 3)
> + __builtin_abort ();
> + if (foo(255) != 8)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> + int c = 0;
> +
> + while (b) {
> + b >>= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(5) != 3)
> + __builtin_abort ();
> + if (foo(1 << (PREC - 1)) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..7f7793f0efac1f0d793e6e99b84988e5cc5221c9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzll } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> + int c = 0;
> +
> + while (b) {
> + b >>= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(5) != 3)
> + __builtin_abort ();
> + if (foo(1LL << (PREC - 1)) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..97161bb7a74260bea20e325ebab64acb33a2b696
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> + int c = 0;
> +
> + while (b) {
> + b >>= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(5) != 3)
> + __builtin_abort ();
> + if (foo(1L << (PREC - 1)) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b9afe8852d8ffbc7ee9a0760cf04b8f98af293a2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> + int c = 0;
> +
> + while (b) {
> + b <<= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(96) != PREC - 5)
> + __builtin_abort ();
> + if (foo(35) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d2702a65daf34db66550d2255395db68a29a4797
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> + int c = 0;
> +
> + while (b) {
> + b <<= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(96) != PREC - 5)
> + __builtin_abort ();
> + if (foo(35) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1ea0d5d7d9f8be1824c4177c33edd91e66b4ddab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzll } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> + int c = 0;
> +
> + while (b) {
> + b <<= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(96) != PREC - 5)
> + __builtin_abort ();
> + if (foo(35) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80fb02dcfa68bc022ae69b26fb189323e01fc6fc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> + int c = 0;
> +
> + while (b) {
> + b <<= 1;
> + c++;
> + }
> +
> + return c;
> +}
> +
> +int main()
> +{
> + if (foo(0) != 0)
> + __builtin_abort ();
> + if (foo(96) != PREC - 5)
> + __builtin_abort ();
> + if (foo(35) != PREC)
> + __builtin_abort ();
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index c7f583d6d1498401a7c106ed3f539dcd04f95451..325f12d62324793d6b2cf55b074ef6cc9cf4dd4d 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -8687,6 +8687,72 @@ proc check_effective_target_popcount { } {
> } "" ]
> }
>
> +# Return 1 if the target supports clz on int.
> +
> +proc check_effective_target_clz { } {
> + return [check_no_messages_and_pattern clz "!\\(call" rtl-expand {
> + int foo (int b)
> + {
> + return __builtin_clz (b);
> + }
> + } "" ]
> +}
> +
> +# Return 1 if the target supports clz on long.
> +
> +proc check_effective_target_clzl { } {
> + return [check_no_messages_and_pattern clzl "!\\(call" rtl-expand {
> + int foo (long b)
> + {
> + return __builtin_clzl (b);
> + }
> + } "" ]
> +}
> +
> +# Return 1 if the target supports clz on long long.
> +
> +proc check_effective_target_clzll { } {
> + return [check_no_messages_and_pattern clzll "!\\(call" rtl-expand {
> + int foo (long long b)
> + {
> + return __builtin_clzll (b);
> + }
> + } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on int.
> +
> +proc check_effective_target_ctz { } {
> + return [check_no_messages_and_pattern ctz "!\\(call" rtl-expand {
> + int foo (int b)
> + {
> + return __builtin_ctz (b);
> + }
> + } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on long.
> +
> +proc check_effective_target_ctzl { } {
> + return [check_no_messages_and_pattern ctzl "!\\(call" rtl-expand {
> + int foo (long b)
> + {
> + return __builtin_ctzl (b);
> + }
> + } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on long long.
> +
> +proc check_effective_target_ctzll { } {
> + return [check_no_messages_and_pattern ctzll "!\\(call" rtl-expand {
> + int foo (long long b)
> + {
> + return __builtin_ctzll (b);
> + }
> + } "" ]
> +}
> +
> # Return 1 if the target supports atomic operations on "long long"
> # and can execute them.
> #
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> library call for popcount when backend does not have an instruction
> to do so. We consider this to be expensive and generate
> __builtin_popcount only when backend defines it. */
> + optab optab;
> combined_fn cfn = get_call_combined_fn (expr);
> switch (cfn)
> {
> CASE_CFN_POPCOUNT:
> + optab = popcount_optab;
> + goto bitcount_call;
> + CASE_CFN_CLZ:
> + optab = clz_optab;
> + goto bitcount_call;
> + CASE_CFN_CTZ:
> + optab = ctz_optab;
> +bitcount_call:
> /* Check if opcode for popcount is available in the mode required. */
> - if (optab_handler (popcount_optab,
> + if (optab_handler (optab,
> TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
> == CODE_FOR_nothing)
> {
> @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> instructions. */
> if (is_a <scalar_int_mode> (mode, &int_mode)
> && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> - && (optab_handler (popcount_optab, word_mode)
> + && (optab_handler (optab, word_mode)
> != CODE_FOR_nothing))
> break;
> return true;
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
> return true;
> }
>
> +/* Return an expression that counts the leading/trailing zeroes of src. */
Can you expand the comment on how you handle defined_at_zero and how
that relates to the C[LT]Z_DEFINED_VALUE_AT_ZERO target macros?
The loop examples you gave above all have a defined value for zero, I'm
not sure how you'd write a C loop which has that undefined.
> +static tree
> +build_cltz_expr (tree src, bool leading, bool defined_at_zero)
> +{
> + tree fn;
> + int prec = TYPE_PRECISION (TREE_TYPE (src));
> + int i_prec = TYPE_PRECISION (integer_type_node);
> + int li_prec = TYPE_PRECISION (long_integer_type_node);
> + int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> + if (prec <= i_prec)
I don't think we can use <= for both CLZ and CTZ, no? You probably
need a GIMPLE testcase or a language that doesn't promote char/short
to int for a testcase that fails though.
> + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> + : builtin_decl_implicit (BUILT_IN_CTZ);
> + else if (prec == li_prec)
> + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> + : builtin_decl_implicit (BUILT_IN_CTZL);
> + else if (prec == lli_prec || prec == 2 * lli_prec)
> + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> + : builtin_decl_implicit (BUILT_IN_CTZLL);
> + else
> + return NULL_TREE;
> +
> + tree utype = unsigned_type_for (TREE_TYPE (src));
> + src = fold_convert (utype, src);
> + if (prec < i_prec)
> + src = fold_convert (unsigned_type_node, src);
> +
> + tree call;
> + if (prec == 2 * lli_prec)
> + {
> + tree src1 = fold_convert (long_long_unsigned_type_node,
> + fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> + unshare_expr (src),
> + build_int_cst (integer_type_node,
> + lli_prec)));
> + tree src2 = fold_convert (long_long_unsigned_type_node, src);
> + /* We count the zeroes in src1, and add the number in src2 when src1
> + is 0. */
> + if (!leading)
> + std::swap(src1, src2);
> + tree call1 = build_call_expr (fn, 1, src1);
> + tree call2 = build_call_expr (fn, 1, src2);
> + if (defined_at_zero)
> + {
> + tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> + build_zero_cst (TREE_TYPE (src2)));
> + call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> + build_int_cst (integer_type_node, lli_prec));
> + }
> + tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> + build_zero_cst (TREE_TYPE (src1)));
> + call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> + fold_build2 (PLUS_EXPR, integer_type_node, call2,
> + build_int_cst (integer_type_node,
> + lli_prec)));
> + }
> + else
> + {
> + call = build_call_expr (fn, 1, src);
> + if (defined_at_zero)
> + {
> + tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> + build_zero_cst (TREE_TYPE (src)));
> + call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> + build_int_cst (integer_type_node, prec));
> + }
> + }
> +
> + if (leading && prec < i_prec)
> + call = fold_build2(MINUS_EXPR, integer_type_node, call,
> + build_int_cst (integer_type_node,
> + i_prec - prec));
> +
> + return call;
> +}
> +
> +/* See comment below for number_of_iterations_bitcount.
> + For c[lt]z complement, we have:
> +
> + modify:
> + iv_2 = iv_1 >> 1 OR iv_1 << 1
> +
> + test:
> + if (iv != 0)
> +
> + modification count:
> + src precision - c[lt]z (src)
> +
> + */
> +
> +static bool
> +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> + enum tree_code code,
> + class tree_niter_desc *niter)
> +{
> + bool modify_before_test = true;
> + HOST_WIDE_INT max;
> +
> + /* Check that condition for staying inside the loop is like
> + if (iv != 0). */
> + gimple *cond_stmt = last_stmt (exit->src);
> + if (!cond_stmt
> + || gimple_code (cond_stmt) != GIMPLE_COND
> + || code != NE_EXPR
> + || !integer_zerop (gimple_cond_rhs (cond_stmt))
> + || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> + return false;
> +
> + tree iv_2 = gimple_cond_lhs (cond_stmt);
> + gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +
> + /* If the test comes before the iv modification, then these will actually be
> + iv_1 and a phi node. */
> + if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> + && gimple_bb (iv_2_stmt) == loop->header
> + && gimple_phi_num_args (iv_2_stmt) == 2
> + && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> + loop_latch_edge (loop)->dest_idx))
> + == SSA_NAME))
> + {
> + /* iv_2 is actually one of the inputs to the phi. */
> + iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> + iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> + modify_before_test = false;
> + }
> +
> + /* Make sure iv_2_stmt is a logical shift by one stmt:
> + iv_2 = iv_1 {>>|<<} 1 */
> + if (!is_gimple_assign (iv_2_stmt)
> + || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> + && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> + || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> + || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> + return false;
> +
> + bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> +
> + tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> +
> + /* Check the recurrence. */
> + gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> + if (gimple_code (phi) != GIMPLE_PHI
> + || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> + || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> + return false;
> +
> + /* We found a match. */
> + tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> + int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> +
> + /* Get the corresponding c[lt]z builtin. */
> + tree expr = build_cltz_expr (src, !left_shift, true);
So we always have defined_at_zero == true?
> +
> + if (!expr)
> + return false;
> +
> + expr = fold_build2 (MINUS_EXPR, integer_type_node,
> + build_int_cst (integer_type_node, src_precision),
> + expr);
> +
> + max = src_precision;
> +
> + tree may_be_zero = boolean_false_node;
> +
> + if (modify_before_test)
> + {
> + expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> + integer_one_node);
> + max = max - 1;
> + may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> + build_zero_cst (TREE_TYPE (src)));
> + }
> +
> + expr = fold_convert (unsigned_type_node, expr);
> +
> + niter->assumptions = boolean_true_node;
> + niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> + niter->niter = simplify_using_initial_conditions (loop, expr);
> +
> + if (TREE_CODE (niter->niter) == INTEGER_CST)
> + niter->max = tree_to_uhwi (niter->niter);
> + else
> + niter->max = max;
> +
> + niter->bound = NULL_TREE;
> + niter->cmp = ERROR_MARK;
> + return true;
> +}
> +
> /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
> 1. A modification to the induction variabler;.
> 2. A test to determine whether or not to exit the loop.
> @@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
> enum tree_code code,
> class tree_niter_desc *niter)
> {
> - return number_of_iterations_popcount (loop, exit, code, niter);
> + return (number_of_iterations_popcount (loop, exit, code, niter)
> + || number_of_iterations_cltz_complement (loop, exit, code, niter));
I'm kind-of missing the non-complement variant ;)
Otherwise looks OK to me.
Thanks,
Richard.
> }
>
> /* Substitute NEW_TREE for OLD in EXPR and fold the result.
On Mon, Nov 14, 2022 at 04:10:22PM +0100, Richard Biener wrote:
> On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > This recognises patterns of the form:
> > while (n) { n >>= 1 }
> >
> > This patch results in improved (but still suboptimal) codegen:
> >
> > foo (unsigned int b) {
> > int c = 0;
> >
> > while (b) {
> > b >>= 1;
> > c++;
> > }
> >
> > return c;
> > }
> >
> > foo:
> > .LFB11:
> > .cfi_startproc
> > cbz w0, .L3
> > clz w1, w0
> > tst x0, 1
> > mov w0, 32
> > sub w0, w0, w1
> > csel w0, w0, wzr, ne
> > ret
> >
> > The conditional is unnecessary. phiopt could recognise a redundant csel
> > (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> > clz call, but it cannot recognise the redunancy when the input is (e.g.)
> > (32 - clz).
> >
> > I could perhaps extend this function to recognise this pattern in a later
> > patch, if this is a good place to recognise more patterns.
> >
> > gcc/ChangeLog:
> >
+ PR tree-optimization/94793
> > * tree-scalar-evolution.cc (expression_expensive_p): Add checks
> > for c[lt]z optabs.
> > * tree-ssa-loop-niter.cc (build_cltz_expr): New.
> > (number_of_iterations_cltz_complement): New.
> > (number_of_iterations_bitcount): Add call to the above.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * lib/target-supports.exp (check_effective_target_clz)
> > (check_effective_target_clzl, check_effective_target_clzll)
> > (check_effective_target_ctz, check_effective_target_clzl)
> > (check_effective_target_ctzll): New.
> > * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-char.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-int.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-long.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
> >
> >
> > --
> >
> >
[snip test diffs]
> > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
> > --- a/gcc/tree-scalar-evolution.cc
> > +++ b/gcc/tree-scalar-evolution.cc
> > @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> > library call for popcount when backend does not have an instruction
> > to do so. We consider this to be expensive and generate
> > __builtin_popcount only when backend defines it. */
> > + optab optab;
> > combined_fn cfn = get_call_combined_fn (expr);
> > switch (cfn)
> > {
> > CASE_CFN_POPCOUNT:
> > + optab = popcount_optab;
> > + goto bitcount_call;
> > + CASE_CFN_CLZ:
> > + optab = clz_optab;
> > + goto bitcount_call;
> > + CASE_CFN_CTZ:
> > + optab = ctz_optab;
> > +bitcount_call:
> > /* Check if opcode for popcount is available in the mode required. */
> > - if (optab_handler (popcount_optab,
> > + if (optab_handler (optab,
> > TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
> > == CODE_FOR_nothing)
> > {
> > @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> > instructions. */
> > if (is_a <scalar_int_mode> (mode, &int_mode)
> > && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> > - && (optab_handler (popcount_optab, word_mode)
> > + && (optab_handler (optab, word_mode)
> > != CODE_FOR_nothing))
> > break;
> > return true;
> > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
> > --- a/gcc/tree-ssa-loop-niter.cc
> > +++ b/gcc/tree-ssa-loop-niter.cc
> > @@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
> > return true;
> > }
> >
> > +/* Return an expression that counts the leading/trailing zeroes of src. */
>
> Can you expand the comment on how you handle defined_at_zero and how
> that relates to the C[LT]Z_DEFINED_VALUE_AT_ZERO target macros?
> The loop examples you gave above all have a defined value for zero, I'm
> not sure how you'd write a C loop which has that undefined.
>
How about:
/* Return an expression that counts the leading/trailing zeroes of src.
If defined_at_zero is true, then the built expression uses a conditonal
expression to return the precision of src when src == 0.
Otherwise, we can elide the conditional expression and let src = 0 invoke
undefined behaviour. */
> > +static tree
> > +build_cltz_expr (tree src, bool leading, bool defined_at_zero)
> > +{
> > + tree fn;
> > + int prec = TYPE_PRECISION (TREE_TYPE (src));
> > + int i_prec = TYPE_PRECISION (integer_type_node);
> > + int li_prec = TYPE_PRECISION (long_integer_type_node);
> > + int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> > + if (prec <= i_prec)
>
> I don't think we can use <= for both CLZ and CTZ, no? You probably
> need a GIMPLE testcase or a language that doesn't promote char/short
> to int for a testcase that fails though.
I don't see an issue here. I've ensured that src is the mode used in
the iterator, and if a longer mode is used for the __builtin_clz
call then I account for the difference below...
> > + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> > + : builtin_decl_implicit (BUILT_IN_CTZ);
> > + else if (prec == li_prec)
> > + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> > + : builtin_decl_implicit (BUILT_IN_CTZL);
> > + else if (prec == lli_prec || prec == 2 * lli_prec)
> > + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> > + : builtin_decl_implicit (BUILT_IN_CTZLL);
> > + else
> > + return NULL_TREE;
> > +
> > + tree utype = unsigned_type_for (TREE_TYPE (src));
> > + src = fold_convert (utype, src);
> > + if (prec < i_prec)
> > + src = fold_convert (unsigned_type_node, src);
> > +
> > + tree call;
> > + if (prec == 2 * lli_prec)
> > + {
> > + tree src1 = fold_convert (long_long_unsigned_type_node,
> > + fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> > + unshare_expr (src),
> > + build_int_cst (integer_type_node,
> > + lli_prec)));
> > + tree src2 = fold_convert (long_long_unsigned_type_node, src);
> > + /* We count the zeroes in src1, and add the number in src2 when src1
> > + is 0. */
> > + if (!leading)
> > + std::swap(src1, src2);
> > + tree call1 = build_call_expr (fn, 1, src1);
> > + tree call2 = build_call_expr (fn, 1, src2);
> > + if (defined_at_zero)
> > + {
> > + tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> > + build_zero_cst (TREE_TYPE (src2)));
> > + call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> > + build_int_cst (integer_type_node, lli_prec));
> > + }
> > + tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> > + build_zero_cst (TREE_TYPE (src1)));
> > + call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> > + fold_build2 (PLUS_EXPR, integer_type_node, call2,
> > + build_int_cst (integer_type_node,
> > + lli_prec)));
> > + }
> > + else
> > + {
> > + call = build_call_expr (fn, 1, src);
> > + if (defined_at_zero)
> > + {
> > + tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> > + build_zero_cst (TREE_TYPE (src)));
> > + call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> > + build_int_cst (integer_type_node, prec));
> > + }
> > + }
> > +
...with this code:
> > + if (leading && prec < i_prec)
> > + call = fold_build2(MINUS_EXPR, integer_type_node, call,
> > + build_int_cst (integer_type_node,
> > + i_prec - prec));
> > +
> > + return call;
> > +}
> > +
> > +/* See comment below for number_of_iterations_bitcount.
> > + For c[lt]z complement, we have:
> > +
> > + modify:
> > + iv_2 = iv_1 >> 1 OR iv_1 << 1
> > +
> > + test:
> > + if (iv != 0)
> > +
> > + modification count:
> > + src precision - c[lt]z (src)
> > +
> > + */
> > +
> > +static bool
> > +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> > + enum tree_code code,
> > + class tree_niter_desc *niter)
> > +{
> > + bool modify_before_test = true;
> > + HOST_WIDE_INT max;
> > +
> > + /* Check that condition for staying inside the loop is like
> > + if (iv != 0). */
> > + gimple *cond_stmt = last_stmt (exit->src);
> > + if (!cond_stmt
> > + || gimple_code (cond_stmt) != GIMPLE_COND
> > + || code != NE_EXPR
> > + || !integer_zerop (gimple_cond_rhs (cond_stmt))
> > + || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> > + return false;
> > +
> > + tree iv_2 = gimple_cond_lhs (cond_stmt);
> > + gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > +
> > + /* If the test comes before the iv modification, then these will actually be
> > + iv_1 and a phi node. */
> > + if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> > + && gimple_bb (iv_2_stmt) == loop->header
> > + && gimple_phi_num_args (iv_2_stmt) == 2
> > + && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> > + loop_latch_edge (loop)->dest_idx))
> > + == SSA_NAME))
> > + {
> > + /* iv_2 is actually one of the inputs to the phi. */
> > + iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> > + iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > + modify_before_test = false;
> > + }
> > +
> > + /* Make sure iv_2_stmt is a logical shift by one stmt:
> > + iv_2 = iv_1 {>>|<<} 1 */
> > + if (!is_gimple_assign (iv_2_stmt)
> > + || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> > + && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> > + || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> > + || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> > + return false;
> > +
> > + bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> > +
> > + tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> > +
> > + /* Check the recurrence. */
> > + gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> > + if (gimple_code (phi) != GIMPLE_PHI
> > + || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> > + || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> > + return false;
> > +
> > + /* We found a match. */
> > + tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> > + int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> > +
> > + /* Get the corresponding c[lt]z builtin. */
> > + tree expr = build_cltz_expr (src, !left_shift, true);
>
> So we always have defined_at_zero == true?
In the patch: yes. The next patch uses defined_at_zero == false, and I
don't think there's any point in submitting a simpler build_cltz_expr
for just this patch.
> > +
> > + if (!expr)
> > + return false;
> > +
> > + expr = fold_build2 (MINUS_EXPR, integer_type_node,
> > + build_int_cst (integer_type_node, src_precision),
> > + expr);
> > +
> > + max = src_precision;
> > +
> > + tree may_be_zero = boolean_false_node;
> > +
> > + if (modify_before_test)
> > + {
> > + expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> > + integer_one_node);
> > + max = max - 1;
> > + may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> > + build_zero_cst (TREE_TYPE (src)));
> > + }
> > +
> > + expr = fold_convert (unsigned_type_node, expr);
> > +
> > + niter->assumptions = boolean_true_node;
> > + niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> > + niter->niter = simplify_using_initial_conditions (loop, expr);
> > +
> > + if (TREE_CODE (niter->niter) == INTEGER_CST)
> > + niter->max = tree_to_uhwi (niter->niter);
> > + else
> > + niter->max = max;
> > +
> > + niter->bound = NULL_TREE;
> > + niter->cmp = ERROR_MARK;
> > + return true;
> > +}
> > +
> > /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
> > 1. A modification to the induction variabler;.
> > 2. A test to determine whether or not to exit the loop.
> > @@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
> > enum tree_code code,
> > class tree_niter_desc *niter)
> > {
> > - return number_of_iterations_popcount (loop, exit, code, niter);
> > + return (number_of_iterations_popcount (loop, exit, code, niter)
> > + || number_of_iterations_cltz_complement (loop, exit, code, niter));
>
> I'm kind-of missing the non-complement variant ;)
See next patch :)
> Otherwise looks OK to me.
>
> Thanks,
> Richard.
>
> > }
> >
> > /* Substitute NEW_TREE for OLD in EXPR and fold the result.
On Mon, Nov 21, 2022 at 4:53 PM Andrew Carlotti <andrew.carlotti@arm.com> wrote:
>
> On Mon, Nov 14, 2022 at 04:10:22PM +0100, Richard Biener wrote:
> > On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > This recognises patterns of the form:
> > > while (n) { n >>= 1 }
> > >
> > > This patch results in improved (but still suboptimal) codegen:
> > >
> > > foo (unsigned int b) {
> > > int c = 0;
> > >
> > > while (b) {
> > > b >>= 1;
> > > c++;
> > > }
> > >
> > > return c;
> > > }
> > >
> > > foo:
> > > .LFB11:
> > > .cfi_startproc
> > > cbz w0, .L3
> > > clz w1, w0
> > > tst x0, 1
> > > mov w0, 32
> > > sub w0, w0, w1
> > > csel w0, w0, wzr, ne
> > > ret
> > >
> > > The conditional is unnecessary. phiopt could recognise a redundant csel
> > > (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> > > clz call, but it cannot recognise the redunancy when the input is (e.g.)
> > > (32 - clz).
> > >
> > > I could perhaps extend this function to recognise this pattern in a later
> > > patch, if this is a good place to recognise more patterns.
> > >
> > > gcc/ChangeLog:
> > >
> + PR tree-optimization/94793
> > > * tree-scalar-evolution.cc (expression_expensive_p): Add checks
> > > for c[lt]z optabs.
> > > * tree-ssa-loop-niter.cc (build_cltz_expr): New.
> > > (number_of_iterations_cltz_complement): New.
> > > (number_of_iterations_bitcount): Add call to the above.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > * lib/target-supports.exp (check_effective_target_clz)
> > > (check_effective_target_clzl, check_effective_target_clzll)
> > > (check_effective_target_ctz, check_effective_target_clzl)
> > > (check_effective_target_ctzll): New.
> > > * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
> > > * gcc.dg/tree-ssa/clz-complement-char.c: New test.
> > > * gcc.dg/tree-ssa/clz-complement-int.c: New test.
> > > * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
> > > * gcc.dg/tree-ssa/clz-complement-long.c: New test.
> > > * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
> > > * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
> > > * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
> > > * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
> > >
> > >
> > > --
> > >
> > >
>
> [snip test diffs]
>
> > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > > index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
> > > --- a/gcc/tree-scalar-evolution.cc
> > > +++ b/gcc/tree-scalar-evolution.cc
> > > @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> > > library call for popcount when backend does not have an instruction
> > > to do so. We consider this to be expensive and generate
> > > __builtin_popcount only when backend defines it. */
> > > + optab optab;
> > > combined_fn cfn = get_call_combined_fn (expr);
> > > switch (cfn)
> > > {
> > > CASE_CFN_POPCOUNT:
> > > + optab = popcount_optab;
> > > + goto bitcount_call;
> > > + CASE_CFN_CLZ:
> > > + optab = clz_optab;
> > > + goto bitcount_call;
> > > + CASE_CFN_CTZ:
> > > + optab = ctz_optab;
> > > +bitcount_call:
> > > /* Check if opcode for popcount is available in the mode required. */
> > > - if (optab_handler (popcount_optab,
> > > + if (optab_handler (optab,
> > > TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
> > > == CODE_FOR_nothing)
> > > {
> > > @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> > > instructions. */
> > > if (is_a <scalar_int_mode> (mode, &int_mode)
> > > && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> > > - && (optab_handler (popcount_optab, word_mode)
> > > + && (optab_handler (optab, word_mode)
> > > != CODE_FOR_nothing))
> > > break;
> > > return true;
> > > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > > index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
> > > --- a/gcc/tree-ssa-loop-niter.cc
> > > +++ b/gcc/tree-ssa-loop-niter.cc
> > > @@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
> > > return true;
> > > }
> > >
> > > +/* Return an expression that counts the leading/trailing zeroes of src. */
> >
> > Can you expand the comment on how you handle defined_at_zero and how
> > that relates to the C[LT]Z_DEFINED_VALUE_AT_ZERO target macros?
> > The loop examples you gave above all have a defined value for zero, I'm
> > not sure how you'd write a C loop which has that undefined.
> >
>
> How about:
>
> /* Return an expression that counts the leading/trailing zeroes of src.
>
> If defined_at_zero is true, then the built expression uses a conditonal
> expression to return the precision of src when src == 0.
> Otherwise, we can elide the conditional expression and let src = 0 invoke
> undefined behaviour. */
Ah, yes - that makes things clearer.
>
> > > +static tree
> > > +build_cltz_expr (tree src, bool leading, bool defined_at_zero)
> > > +{
> > > + tree fn;
> > > + int prec = TYPE_PRECISION (TREE_TYPE (src));
> > > + int i_prec = TYPE_PRECISION (integer_type_node);
> > > + int li_prec = TYPE_PRECISION (long_integer_type_node);
> > > + int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> > > + if (prec <= i_prec)
> >
> > I don't think we can use <= for both CLZ and CTZ, no? You probably
> > need a GIMPLE testcase or a language that doesn't promote char/short
> > to int for a testcase that fails though.
>
> I don't see an issue here. I've ensured that src is the mode used in
> the iterator, and if a longer mode is used for the __builtin_clz
> call then I account for the difference below...
>
> > > + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> > > + : builtin_decl_implicit (BUILT_IN_CTZ);
> > > + else if (prec == li_prec)
> > > + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> > > + : builtin_decl_implicit (BUILT_IN_CTZL);
> > > + else if (prec == lli_prec || prec == 2 * lli_prec)
> > > + fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> > > + : builtin_decl_implicit (BUILT_IN_CTZLL);
> > > + else
> > > + return NULL_TREE;
> > > +
> > > + tree utype = unsigned_type_for (TREE_TYPE (src));
> > > + src = fold_convert (utype, src);
> > > + if (prec < i_prec)
> > > + src = fold_convert (unsigned_type_node, src);
> > > +
but if you have an unsigned short variable you promote
that to unsigned int here, no?
> > > + tree call;
> > > + if (prec == 2 * lli_prec)
> > > + {
> > > + tree src1 = fold_convert (long_long_unsigned_type_node,
> > > + fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> > > + unshare_expr (src),
> > > + build_int_cst (integer_type_node,
> > > + lli_prec)));
> > > + tree src2 = fold_convert (long_long_unsigned_type_node, src);
> > > + /* We count the zeroes in src1, and add the number in src2 when src1
> > > + is 0. */
> > > + if (!leading)
> > > + std::swap(src1, src2);
> > > + tree call1 = build_call_expr (fn, 1, src1);
> > > + tree call2 = build_call_expr (fn, 1, src2);
> > > + if (defined_at_zero)
> > > + {
> > > + tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> > > + build_zero_cst (TREE_TYPE (src2)));
> > > + call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> > > + build_int_cst (integer_type_node, lli_prec));
> > > + }
> > > + tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> > > + build_zero_cst (TREE_TYPE (src1)));
> > > + call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> > > + fold_build2 (PLUS_EXPR, integer_type_node, call2,
> > > + build_int_cst (integer_type_node,
> > > + lli_prec)));
> > > + }
> > > + else
> > > + {
> > > + call = build_call_expr (fn, 1, src);
> > > + if (defined_at_zero)
> > > + {
> > > + tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> > > + build_zero_cst (TREE_TYPE (src)));
> > > + call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> > > + build_int_cst (integer_type_node, prec));
> > > + }
> > > + }
> > > +
>
> ...with this code:
>
> > > + if (leading && prec < i_prec)
> > > + call = fold_build2(MINUS_EXPR, integer_type_node, call,
> > > + build_int_cst (integer_type_node,
> > > + i_prec - prec));
... ah, OK. Indeed, that should work. Note we do have CTZ and CLZ
optabs and internal functions - in case there's a HImode CLZ this
could be elided. More general you can avoid using the __builtin_
functions with their fixed types in favor of using IFN_C[TL]Z which
are type agnostic (but require optab support - you should be able
to check this via direct_internal_fn_supported_p).
> > > + return call;
> > > +}
> > > +
> > > +/* See comment below for number_of_iterations_bitcount.
> > > + For c[lt]z complement, we have:
> > > +
> > > + modify:
> > > + iv_2 = iv_1 >> 1 OR iv_1 << 1
> > > +
> > > + test:
> > > + if (iv != 0)
> > > +
> > > + modification count:
> > > + src precision - c[lt]z (src)
> > > +
> > > + */
> > > +
> > > +static bool
> > > +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> > > + enum tree_code code,
> > > + class tree_niter_desc *niter)
> > > +{
> > > + bool modify_before_test = true;
> > > + HOST_WIDE_INT max;
> > > +
> > > + /* Check that condition for staying inside the loop is like
> > > + if (iv != 0). */
> > > + gimple *cond_stmt = last_stmt (exit->src);
> > > + if (!cond_stmt
> > > + || gimple_code (cond_stmt) != GIMPLE_COND
> > > + || code != NE_EXPR
> > > + || !integer_zerop (gimple_cond_rhs (cond_stmt))
> > > + || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> > > + return false;
> > > +
> > > + tree iv_2 = gimple_cond_lhs (cond_stmt);
> > > + gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > > +
> > > + /* If the test comes before the iv modification, then these will actually be
> > > + iv_1 and a phi node. */
> > > + if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> > > + && gimple_bb (iv_2_stmt) == loop->header
> > > + && gimple_phi_num_args (iv_2_stmt) == 2
> > > + && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> > > + loop_latch_edge (loop)->dest_idx))
> > > + == SSA_NAME))
> > > + {
> > > + /* iv_2 is actually one of the inputs to the phi. */
> > > + iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> > > + iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > > + modify_before_test = false;
> > > + }
> > > +
> > > + /* Make sure iv_2_stmt is a logical shift by one stmt:
> > > + iv_2 = iv_1 {>>|<<} 1 */
> > > + if (!is_gimple_assign (iv_2_stmt)
> > > + || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> > > + && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> > > + || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> > > + || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> > > + return false;
> > > +
> > > + bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> > > +
> > > + tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> > > +
> > > + /* Check the recurrence. */
> > > + gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> > > + if (gimple_code (phi) != GIMPLE_PHI
> > > + || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> > > + || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> > > + return false;
> > > +
> > > + /* We found a match. */
> > > + tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> > > + int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> > > +
> > > + /* Get the corresponding c[lt]z builtin. */
> > > + tree expr = build_cltz_expr (src, !left_shift, true);
> >
> > So we always have defined_at_zero == true?
>
> In the patch: yes. The next patch uses defined_at_zero == false, and I
> don't think there's any point in submitting a simpler build_cltz_expr
> for just this patch.
>
> > > +
> > > + if (!expr)
> > > + return false;
> > > +
> > > + expr = fold_build2 (MINUS_EXPR, integer_type_node,
> > > + build_int_cst (integer_type_node, src_precision),
> > > + expr);
> > > +
> > > + max = src_precision;
> > > +
> > > + tree may_be_zero = boolean_false_node;
> > > +
> > > + if (modify_before_test)
> > > + {
> > > + expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> > > + integer_one_node);
> > > + max = max - 1;
> > > + may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> > > + build_zero_cst (TREE_TYPE (src)));
> > > + }
> > > +
> > > + expr = fold_convert (unsigned_type_node, expr);
> > > +
> > > + niter->assumptions = boolean_true_node;
> > > + niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> > > + niter->niter = simplify_using_initial_conditions (loop, expr);
> > > +
> > > + if (TREE_CODE (niter->niter) == INTEGER_CST)
> > > + niter->max = tree_to_uhwi (niter->niter);
> > > + else
> > > + niter->max = max;
> > > +
> > > + niter->bound = NULL_TREE;
> > > + niter->cmp = ERROR_MARK;
> > > + return true;
> > > +}
> > > +
> > > /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
> > > 1. A modification to the induction variabler;.
> > > 2. A test to determine whether or not to exit the loop.
> > > @@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
> > > enum tree_code code,
> > > class tree_niter_desc *niter)
> > > {
> > > - return number_of_iterations_popcount (loop, exit, code, niter);
> > > + return (number_of_iterations_popcount (loop, exit, code, niter)
> > > + || number_of_iterations_cltz_complement (loop, exit, code, niter));
> >
> > I'm kind-of missing the non-complement variant ;)
>
> See next patch :)
>
> > Otherwise looks OK to me.
> >
> > Thanks,
> > Richard.
> >
> > > }
> > >
> > > /* Substitute NEW_TREE for OLD in EXPR and fold the result.
new file mode 100644
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_complement_count1 (unsigned char b) {
+ int c = 0;
+
+ while (b) {
+ b >>= 1;
+ c++;
+ }
+ if (c <= PREC)
+ return 0;
+ else
+ return 34567;
+}
+
+int clz_complement_count2 (unsigned char b) {
+ int c = 0;
+
+ while (b) {
+ b >>= 1;
+ c++;
+ }
+ if (c <= PREC - 1)
+ return 0;
+ else
+ return 76543;
+}
+
+int ctz_complement_count1 (unsigned char b) {
+ int c = 0;
+
+ while (b) {
+ b <<= 1;
+ c++;
+ }
+ if (c <= PREC)
+ return 0;
+ else
+ return 23456;
+}
+
+int ctz_complement_count2 (unsigned char b) {
+ int c = 0;
+
+ while (b) {
+ b <<= 1;
+ c++;
+ }
+ if (c <= PREC - 1)
+ return 0;
+ else
+ return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+ int c = 0;
+
+ while (b) {
+ b >>= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(5) != 3)
+ __builtin_abort ();
+ if (foo(255) != 8)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+ int c = 0;
+
+ while (b) {
+ b >>= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(5) != 3)
+ __builtin_abort ();
+ if (foo(1 << (PREC - 1)) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzll } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+ int c = 0;
+
+ while (b) {
+ b >>= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(5) != 3)
+ __builtin_abort ();
+ if (foo(1LL << (PREC - 1)) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzl } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+ int c = 0;
+
+ while (b) {
+ b >>= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(5) != 3)
+ __builtin_abort ();
+ if (foo(1L << (PREC - 1)) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+ int c = 0;
+
+ while (b) {
+ b <<= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(96) != PREC - 5)
+ __builtin_abort ();
+ if (foo(35) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+ int c = 0;
+
+ while (b) {
+ b <<= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(96) != PREC - 5)
+ __builtin_abort ();
+ if (foo(35) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzll } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+ int c = 0;
+
+ while (b) {
+ b <<= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(96) != PREC - 5)
+ __builtin_abort ();
+ if (foo(35) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzl } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+ int c = 0;
+
+ while (b) {
+ b <<= 1;
+ c++;
+ }
+
+ return c;
+}
+
+int main()
+{
+ if (foo(0) != 0)
+ __builtin_abort ();
+ if (foo(96) != PREC - 5)
+ __builtin_abort ();
+ if (foo(35) != PREC)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
@@ -8687,6 +8687,72 @@ proc check_effective_target_popcount { } {
} "" ]
}
+# Return 1 if the target supports clz on int.
+
+proc check_effective_target_clz { } {
+ return [check_no_messages_and_pattern clz "!\\(call" rtl-expand {
+ int foo (int b)
+ {
+ return __builtin_clz (b);
+ }
+ } "" ]
+}
+
+# Return 1 if the target supports clz on long.
+
+proc check_effective_target_clzl { } {
+ return [check_no_messages_and_pattern clzl "!\\(call" rtl-expand {
+ int foo (long b)
+ {
+ return __builtin_clzl (b);
+ }
+ } "" ]
+}
+
+# Return 1 if the target supports clz on long long.
+
+proc check_effective_target_clzll { } {
+ return [check_no_messages_and_pattern clzll "!\\(call" rtl-expand {
+ int foo (long long b)
+ {
+ return __builtin_clzll (b);
+ }
+ } "" ]
+}
+
+# Return 1 if the target supports ctz on int.
+
+proc check_effective_target_ctz { } {
+ return [check_no_messages_and_pattern ctz "!\\(call" rtl-expand {
+ int foo (int b)
+ {
+ return __builtin_ctz (b);
+ }
+ } "" ]
+}
+
+# Return 1 if the target supports ctz on long.
+
+proc check_effective_target_ctzl { } {
+ return [check_no_messages_and_pattern ctzl "!\\(call" rtl-expand {
+ int foo (long b)
+ {
+ return __builtin_ctzl (b);
+ }
+ } "" ]
+}
+
+# Return 1 if the target supports ctz on long long.
+
+proc check_effective_target_ctzll { } {
+ return [check_no_messages_and_pattern ctzll "!\\(call" rtl-expand {
+ int foo (long long b)
+ {
+ return __builtin_ctzll (b);
+ }
+ } "" ]
+}
+
# Return 1 if the target supports atomic operations on "long long"
# and can execute them.
#
@@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
library call for popcount when backend does not have an instruction
to do so. We consider this to be expensive and generate
__builtin_popcount only when backend defines it. */
+ optab optab;
combined_fn cfn = get_call_combined_fn (expr);
switch (cfn)
{
CASE_CFN_POPCOUNT:
+ optab = popcount_optab;
+ goto bitcount_call;
+ CASE_CFN_CLZ:
+ optab = clz_optab;
+ goto bitcount_call;
+ CASE_CFN_CTZ:
+ optab = ctz_optab;
+bitcount_call:
/* Check if opcode for popcount is available in the mode required. */
- if (optab_handler (popcount_optab,
+ if (optab_handler (optab,
TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
== CODE_FOR_nothing)
{
@@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
instructions. */
if (is_a <scalar_int_mode> (mode, &int_mode)
&& GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
- && (optab_handler (popcount_optab, word_mode)
+ && (optab_handler (optab, word_mode)
!= CODE_FOR_nothing))
break;
return true;
@@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
return true;
}
+/* Return an expression that counts the leading/trailing zeroes of src. */
+
+static tree
+build_cltz_expr (tree src, bool leading, bool defined_at_zero)
+{
+ tree fn;
+ int prec = TYPE_PRECISION (TREE_TYPE (src));
+ int i_prec = TYPE_PRECISION (integer_type_node);
+ int li_prec = TYPE_PRECISION (long_integer_type_node);
+ int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
+ if (prec <= i_prec)
+ fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
+ : builtin_decl_implicit (BUILT_IN_CTZ);
+ else if (prec == li_prec)
+ fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
+ : builtin_decl_implicit (BUILT_IN_CTZL);
+ else if (prec == lli_prec || prec == 2 * lli_prec)
+ fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
+ : builtin_decl_implicit (BUILT_IN_CTZLL);
+ else
+ return NULL_TREE;
+
+ tree utype = unsigned_type_for (TREE_TYPE (src));
+ src = fold_convert (utype, src);
+ if (prec < i_prec)
+ src = fold_convert (unsigned_type_node, src);
+
+ tree call;
+ if (prec == 2 * lli_prec)
+ {
+ tree src1 = fold_convert (long_long_unsigned_type_node,
+ fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
+ unshare_expr (src),
+ build_int_cst (integer_type_node,
+ lli_prec)));
+ tree src2 = fold_convert (long_long_unsigned_type_node, src);
+ /* We count the zeroes in src1, and add the number in src2 when src1
+ is 0. */
+ if (!leading)
+ std::swap(src1, src2);
+ tree call1 = build_call_expr (fn, 1, src1);
+ tree call2 = build_call_expr (fn, 1, src2);
+ if (defined_at_zero)
+ {
+ tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
+ build_zero_cst (TREE_TYPE (src2)));
+ call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
+ build_int_cst (integer_type_node, lli_prec));
+ }
+ tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
+ build_zero_cst (TREE_TYPE (src1)));
+ call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
+ fold_build2 (PLUS_EXPR, integer_type_node, call2,
+ build_int_cst (integer_type_node,
+ lli_prec)));
+ }
+ else
+ {
+ call = build_call_expr (fn, 1, src);
+ if (defined_at_zero)
+ {
+ tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
+ build_zero_cst (TREE_TYPE (src)));
+ call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
+ build_int_cst (integer_type_node, prec));
+ }
+ }
+
+ if (leading && prec < i_prec)
+ call = fold_build2(MINUS_EXPR, integer_type_node, call,
+ build_int_cst (integer_type_node,
+ i_prec - prec));
+
+ return call;
+}
+
+/* See comment below for number_of_iterations_bitcount.
+ For c[lt]z complement, we have:
+
+ modify:
+ iv_2 = iv_1 >> 1 OR iv_1 << 1
+
+ test:
+ if (iv != 0)
+
+ modification count:
+ src precision - c[lt]z (src)
+
+ */
+
+static bool
+number_of_iterations_cltz_complement (loop_p loop, edge exit,
+ enum tree_code code,
+ class tree_niter_desc *niter)
+{
+ bool modify_before_test = true;
+ HOST_WIDE_INT max;
+
+ /* Check that condition for staying inside the loop is like
+ if (iv != 0). */
+ gimple *cond_stmt = last_stmt (exit->src);
+ if (!cond_stmt
+ || gimple_code (cond_stmt) != GIMPLE_COND
+ || code != NE_EXPR
+ || !integer_zerop (gimple_cond_rhs (cond_stmt))
+ || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
+ return false;
+
+ tree iv_2 = gimple_cond_lhs (cond_stmt);
+ gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+
+ /* If the test comes before the iv modification, then these will actually be
+ iv_1 and a phi node. */
+ if (gimple_code (iv_2_stmt) == GIMPLE_PHI
+ && gimple_bb (iv_2_stmt) == loop->header
+ && gimple_phi_num_args (iv_2_stmt) == 2
+ && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
+ loop_latch_edge (loop)->dest_idx))
+ == SSA_NAME))
+ {
+ /* iv_2 is actually one of the inputs to the phi. */
+ iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
+ iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+ modify_before_test = false;
+ }
+
+ /* Make sure iv_2_stmt is a logical shift by one stmt:
+ iv_2 = iv_1 {>>|<<} 1 */
+ if (!is_gimple_assign (iv_2_stmt)
+ || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
+ && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
+ || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
+ || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
+ return false;
+
+ bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
+
+ tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
+
+ /* Check the recurrence. */
+ gimple *phi = SSA_NAME_DEF_STMT (iv_1);
+ if (gimple_code (phi) != GIMPLE_PHI
+ || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
+ || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+ return false;
+
+ /* We found a match. */
+ tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
+ int src_precision = TYPE_PRECISION (TREE_TYPE (src));
+
+ /* Get the corresponding c[lt]z builtin. */
+ tree expr = build_cltz_expr (src, !left_shift, true);
+
+ if (!expr)
+ return false;
+
+ expr = fold_build2 (MINUS_EXPR, integer_type_node,
+ build_int_cst (integer_type_node, src_precision),
+ expr);
+
+ max = src_precision;
+
+ tree may_be_zero = boolean_false_node;
+
+ if (modify_before_test)
+ {
+ expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
+ integer_one_node);
+ max = max - 1;
+ may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
+ build_zero_cst (TREE_TYPE (src)));
+ }
+
+ expr = fold_convert (unsigned_type_node, expr);
+
+ niter->assumptions = boolean_true_node;
+ niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
+ niter->niter = simplify_using_initial_conditions (loop, expr);
+
+ if (TREE_CODE (niter->niter) == INTEGER_CST)
+ niter->max = tree_to_uhwi (niter->niter);
+ else
+ niter->max = max;
+
+ niter->bound = NULL_TREE;
+ niter->cmp = ERROR_MARK;
+ return true;
+}
+
/* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
1. A modification to the induction variabler;.
2. A test to determine whether or not to exit the loop.
@@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
enum tree_code code,
class tree_niter_desc *niter)
{
- return number_of_iterations_popcount (loop, exit, code, niter);
+ return (number_of_iterations_popcount (loop, exit, code, niter)
+ || number_of_iterations_cltz_complement (loop, exit, code, niter));
}
/* Substitute NEW_TREE for OLD in EXPR and fold the result.