match.pd: Optimize ctz/popcount/parity/ffs on extended argument [PR112566]

Message ID: ZVdu8BjuGofK3TeU@tucnak
State: Unresolved

Checks
  snail/gcc-patch-check: warning (Git am fail log)

Commit Message

Jakub Jelinek Nov. 17, 2023, 1:47 p.m. UTC
  Hi!

ctz(ext(X)) is the same as ctz(X) in the case where ctz(0) is undefined
behavior (it could also hold in the two-argument case on large BITINT_TYPE
by preserving the argument, but that is not implemented in this patch),
popcount(zext(X)) is the same as popcount(X),
parity(zext(X)) is the same as parity(X),
parity(sext(X)) is the same as parity(X) provided the bit difference
between the extended and unextended types is even, and
ffs(ext(X)) is the same as ffs(X).
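To make the identities concrete, here is a small stand-alone sanity check
(illustrative only, not part of the patch; it assumes the usual 32-bit int
and 64-bit long long, and relies on x being nonzero for the ctz case):

#include <assert.h>

int
main (void)
{
  unsigned int x = 0x90u;	/* any nonzero value works for the ctz check */

  /* Zero-extension cases.  */
  assert (__builtin_ctzll ((unsigned long long) x) == __builtin_ctz (x));
  assert (__builtin_popcountll ((unsigned long long) x) == __builtin_popcount (x));
  assert (__builtin_parityll ((unsigned long long) x) == __builtin_parity (x));

  /* Sign-extension cases: ffs always, parity because 64 - 32 is even.  */
  assert (__builtin_ffsll ((long long) (int) x) == __builtin_ffs ((int) x));
  assert (__builtin_parityll ((long long) -5) == __builtin_parity (-5));
  return 0;
}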

The following patch optimizes those in match.pd when it is beneficial:
always in the large BITINT_TYPE case; when the narrower type has an optab
and the wider one doesn't; or when the wider type is larger than a word
and the narrower one is one of the standard argument sizes (only int and
long long are tested, as long has on most targets the same bitsize as one
of those two).
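As a concrete illustration of the last case (this just mirrors the new
gcc.dg/pr112566-1.c test below), on a 32-bit x86 target with -mpopcnt the
calls below are expected to go through a double-word __builtin_*ll
expansion today; with the patch they can be narrowed to the 32-bit
popcount/ctz patterns, since the argument is a zero-extended 32-bit value:

/* Illustration only; the 64-bit count of a zero-extended 32-bit value
   equals the 32-bit count.  */
int use_popcount (unsigned int x) { return __builtin_popcountll (x); }
int use_ctz (unsigned int x) { return __builtin_ctzll (x); }  /* x must be nonzero.  */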

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Joseph mentioned in the PR that ctz(narrow(X)) is the same as ctz(X)
when ctz(0) is UB, but that can be handled incrementally (and would need
different decisions about when it is profitable).
And clz(zext(X)) is clz(X) + bit_difference, but I'm not sure we want to
handle that in match.pd at all; perhaps during insn selection?
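Both of those follow-up relations can be sanity-checked the same way
(illustrative only, assuming 32-bit int and 64-bit long long; the values
must keep the relevant bits nonzero, since ctz/clz of 0 are UB):

#include <assert.h>

int
main (void)
{
  unsigned long long y = 0x90ull;	/* low 32 bits nonzero */

  /* ctz(narrow(X)) == ctz(X) as long as the narrowed value is nonzero.  */
  assert (__builtin_ctz ((unsigned int) y) == __builtin_ctzll (y));
  /* clz(zext(X)) == clz(X) + bit_difference (here 64 - 32 = 32).  */
  assert (__builtin_clzll ((unsigned long long) (unsigned int) y)
	  == __builtin_clz ((unsigned int) y) + 32);
  return 0;
}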

2023-11-17  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/112566
	PR tree-optimization/83171
	* match.pd (ctz(ext(X)) -> ctz(X), popcount(zext(X)) -> popcount(X),
	parity(ext(X)) -> parity(X), ffs(ext(X)) -> ffs(X)): New
	simplifications.
	(__builtin_ffs (X) == 0 -> X == 0): Use FFS rather than
	BUILT_IN_FFS, BUILT_IN_FFSL, BUILT_IN_FFSLL and BUILT_IN_FFSIMAX.

	* gcc.dg/pr112566-1.c: New test.
	* gcc.dg/pr112566-2.c: New test.
	* gcc.target/i386/pr78057.c (foo): Pass another long long argument
	and use it in __builtin_ia32_*zcnt_u64 instead of the int one.


	Jakub
  

Comments

Richard Biener Nov. 17, 2023, 1:55 p.m. UTC | #1
On Fri, 17 Nov 2023, Jakub Jelinek wrote:

> Hi!
> 
> ctz(ext(X)) is the same as ctz(X) in the UB on zero case (or could be also
> in the 2 argument case on large BITINT_TYPE by preserving the argument, not
> implemented in this patch),
> popcount(zext(X)) is the same as popcount(X),
> parity(zext(X)) is the same as parity(X),
> parity(sext(X)) is the same as parity(X) provided the bit difference between
> the extended and unextended types is even,
> ffs(ext(X)) is the same as ffs(X).
> 
> The following patch optimizes those in match.pd if those are beneficial
> (always in the large BITINT_TYPE case, or if the narrower type has optab
> and the wider doesn't, or the wider is larger than word and narrower is
> one of the standard argument sizes (tested just int and long long, as
> long is on most targets same bitsize as one of those two).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

> Joseph in the PR mentioned that ctz(narrow(X)) is the same as ctz(X)
> if UB on 0, but that can be handled incrementally (and would need different
> decisions when it is profitable).
> And clz(zext(X)) is clz(X) + bit_difference, but not sure we want to change
> that in match.pd at all, perhaps during insn selection?

Yeah, it would be less canonical for GIMPLE I think.

Richard.

Patch

--- gcc/match.pd.jj	2023-11-14 10:52:16.190276041 +0100
+++ gcc/match.pd	2023-11-16 13:09:41.221275374 +0100
@@ -8672,6 +8672,50 @@  (define_operator_list SYNC_FETCH_AND_AND
 			       wi::shifted_mask (tree_to_uhwi (@1), 1,
 						 false, prec)); })))))))
 
+#if GIMPLE
+/* ctz(ext(X)) == ctz(X).  Valid just for the UB at zero cases though.  */
+(simplify
+  (CTZ (convert@1 @0))
+  (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && INTEGRAL_TYPE_P (TREE_TYPE (@0))
+       && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0)))
+   (with { combined_fn cfn = CFN_LAST;
+	   tree type0 = TREE_TYPE (@0);
+	   if (TREE_CODE (type0) == BITINT_TYPE)
+	     {
+	       if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE)
+		 cfn = CFN_CTZ;
+	       else
+		 type0
+		   = build_nonstandard_integer_type (TYPE_PRECISION (type0),
+						     1);
+	     }
+	   type0 = unsigned_type_for (type0);
+	   if (cfn == CFN_LAST
+	       && direct_internal_fn_supported_p (IFN_CTZ, type0,
+						  OPTIMIZE_FOR_BOTH))
+	     cfn = CFN_CTZ;
+	   if (cfn == CFN_LAST
+	       && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD
+	       && !direct_internal_fn_supported_p (IFN_CTZ,
+						   TREE_TYPE (@1),
+						   OPTIMIZE_FOR_BOTH))
+	     {
+	       if (TYPE_PRECISION (type0)
+		   == TYPE_PRECISION (unsigned_type_node))
+		 cfn = CFN_BUILT_IN_CTZ;
+	       else if (TYPE_PRECISION (type0)
+			== TYPE_PRECISION (long_long_unsigned_type_node))
+		 cfn = CFN_BUILT_IN_CTZLL;
+	     } }
+    (if (cfn == CFN_CTZ)
+     (IFN_CTZ (convert:type0 @0))
+     (if (cfn == CFN_BUILT_IN_CTZ)
+      (BUILT_IN_CTZ (convert:type0 @0))
+      (if (cfn == CFN_BUILT_IN_CTZLL)
+       (BUILT_IN_CTZLL (convert:type0 @0))))))))
+#endif
+
 /* POPCOUNT simplifications.  */
 /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero.  */
 (simplify
@@ -8742,6 +8786,50 @@  (define_operator_list SYNC_FETCH_AND_AND
 	      (popcount:s @1))
       (popcount (log2 @0 @1)))))
 
+#if GIMPLE
+/* popcount(zext(X)) == popcount(X).  */
+(simplify
+  (POPCOUNT (convert@1 @0))
+  (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && INTEGRAL_TYPE_P (TREE_TYPE (@0))
+       && TYPE_UNSIGNED (TREE_TYPE (@0))
+       && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0)))
+   (with { combined_fn cfn = CFN_LAST;
+	   tree type0 = TREE_TYPE (@0);
+	   if (TREE_CODE (type0) == BITINT_TYPE)
+	     {
+	       if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE)
+		 cfn = CFN_POPCOUNT;
+	       else
+		 type0
+		   = build_nonstandard_integer_type (TYPE_PRECISION (type0),
+						     1);
+	     }
+	   if (cfn == CFN_LAST
+	       && direct_internal_fn_supported_p (IFN_POPCOUNT, type0,
+						  OPTIMIZE_FOR_BOTH))
+	     cfn = CFN_POPCOUNT;
+	   if (cfn == CFN_LAST
+	       && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD
+	       && !direct_internal_fn_supported_p (IFN_POPCOUNT,
+						   TREE_TYPE (@1),
+						   OPTIMIZE_FOR_BOTH))
+	     {
+	       if (TYPE_PRECISION (type0)
+		   == TYPE_PRECISION (unsigned_type_node))
+		 cfn = CFN_BUILT_IN_POPCOUNT;
+	       else if (TYPE_PRECISION (type0)
+			== TYPE_PRECISION (long_long_unsigned_type_node))
+		 cfn = CFN_BUILT_IN_POPCOUNTLL;
+	     } }
+    (if (cfn == CFN_POPCOUNT)
+     (IFN_POPCOUNT (convert:type0 @0))
+     (if (cfn == CFN_BUILT_IN_POPCOUNT)
+      (BUILT_IN_POPCOUNT (convert:type0 @0))
+      (if (cfn == CFN_BUILT_IN_POPCOUNTLL)
+       (BUILT_IN_POPCOUNTLL (convert:type0 @0))))))))
+#endif
+
 /* PARITY simplifications.  */
 /* parity(~X) is parity(X).  */
 (simplify
@@ -8780,6 +8868,54 @@  (define_operator_list SYNC_FETCH_AND_AND
   (bit_xor (PARITY:s @0) (PARITY:s @1))
   (PARITY (bit_xor @0 @1)))
 
+#if GIMPLE
+/* parity(zext(X)) == parity(X).  */
+/* parity(sext(X)) == parity(X) if the difference in precision is even.  */
+(simplify
+  (PARITY (convert@1 @0))
+  (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && INTEGRAL_TYPE_P (TREE_TYPE (@0))
+       && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0))
+       && (TYPE_UNSIGNED (TREE_TYPE (@0))
+	   || ((TYPE_PRECISION (TREE_TYPE (@1))
+		- TYPE_PRECISION (TREE_TYPE (@0))) & 1) == 0))
+   (with { combined_fn cfn = CFN_LAST;
+	   tree type0 = TREE_TYPE (@0);
+	   if (TREE_CODE (type0) == BITINT_TYPE)
+	     {
+	       if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE)
+		 cfn = CFN_PARITY;
+	       else
+		 type0
+		   = build_nonstandard_integer_type (TYPE_PRECISION (type0),
+						     1);
+	     }
+	   type0 = unsigned_type_for (type0);
+	   if (cfn == CFN_LAST
+	       && direct_internal_fn_supported_p (IFN_PARITY, type0,
+						  OPTIMIZE_FOR_BOTH))
+	     cfn = CFN_PARITY;
+	   if (cfn == CFN_LAST
+	       && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD
+	       && !direct_internal_fn_supported_p (IFN_PARITY,
+						   TREE_TYPE (@1),
+						   OPTIMIZE_FOR_BOTH))
+	     {
+	       if (TYPE_PRECISION (type0)
+		   == TYPE_PRECISION (unsigned_type_node))
+		 cfn = CFN_BUILT_IN_PARITY;
+	       else if (TYPE_PRECISION (type0)
+			== TYPE_PRECISION (long_long_unsigned_type_node))
+		 cfn = CFN_BUILT_IN_PARITYLL;
+	     } }
+    (if (cfn == CFN_PARITY)
+     (IFN_PARITY (convert:type0 @0))
+     (if (cfn == CFN_BUILT_IN_PARITY)
+      (BUILT_IN_PARITY (convert:type0 @0))
+      (if (cfn == CFN_BUILT_IN_PARITYLL)
+       (BUILT_IN_PARITYLL (convert:type0 @0))))))))
+#endif
+
 /* a != 0 ? FUN(a) : 0 -> Fun(a) for some builtin functions. */
 (for func (POPCOUNT BSWAP FFS PARITY)
  (simplify
@@ -8987,8 +9123,7 @@  (define_operator_list SYNC_FETCH_AND_AND
    (plus (CTZ:type (convert:utype @0)) { build_one_cst (type); }))))
 #endif
 
-(for ffs (BUILT_IN_FFS BUILT_IN_FFSL BUILT_IN_FFSLL
-	  BUILT_IN_FFSIMAX)
+(for ffs (FFS)
  /* __builtin_ffs (X) == 0 -> X == 0.
     __builtin_ffs (X) == 6 -> (X & 63) == 32.  */
  (for cmp (eq ne)
@@ -9036,6 +9171,50 @@  (define_operator_list SYNC_FETCH_AND_AND
 		     { build_zero_cst (TREE_TYPE (@0)); }))))))))
 
 #if GIMPLE
+/* ffs(ext(X)) == ffs(X).  */
+(simplify
+  (FFS (convert@1 @0))
+  (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && INTEGRAL_TYPE_P (TREE_TYPE (@0))
+       && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0)))
+   (with { combined_fn cfn = CFN_LAST;
+	   tree type0 = TREE_TYPE (@0);
+	   if (TREE_CODE (type0) == BITINT_TYPE)
+	     {
+	       if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE)
+		 cfn = CFN_FFS;
+	       else
+		 type0
+		   = build_nonstandard_integer_type (TYPE_PRECISION (type0),
+						     0);
+	     }
+	   type0 = signed_type_for (type0);
+	   if (cfn == CFN_LAST
+	       && direct_internal_fn_supported_p (IFN_FFS, type0,
+						  OPTIMIZE_FOR_BOTH))
+	     cfn = CFN_FFS;
+	   if (cfn == CFN_LAST
+	       && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD
+	       && !direct_internal_fn_supported_p (IFN_FFS,
+						   TREE_TYPE (@1),
+						   OPTIMIZE_FOR_BOTH))
+	     {
+	       if (TYPE_PRECISION (type0)
+		   == TYPE_PRECISION (integer_type_node))
+		 cfn = CFN_BUILT_IN_FFS;
+	       else if (TYPE_PRECISION (type0)
+			== TYPE_PRECISION (long_long_integer_type_node))
+		 cfn = CFN_BUILT_IN_FFSLL;
+	     } }
+    (if (cfn == CFN_FFS)
+     (IFN_FFS (convert:type0 @0))
+     (if (cfn == CFN_BUILT_IN_FFS)
+      (BUILT_IN_FFS (convert:type0 @0))
+      (if (cfn == CFN_BUILT_IN_FFSLL)
+       (BUILT_IN_FFSLL (convert:type0 @0))))))))
+#endif
+
+#if GIMPLE
 
 /* Simplify:
      a = op a1
--- gcc/testsuite/gcc.dg/pr112566-1.c.jj	2023-11-16 13:29:23.063761320 +0100
+++ gcc/testsuite/gcc.dg/pr112566-1.c	2023-11-16 13:35:14.459852714 +0100
@@ -0,0 +1,14 @@ 
+/* PR tree-optimization/112566 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbmi2 -mlzcnt -mpopcnt" { target i?86-*-* x86_64-*-* } } */
+/* { dg-final { scan-tree-dump-not "ll \\\(" "optimized" { target ia32 } } } */
+/* { dg-final { scan-tree-dump-not "\\\(long long (unsigned )?int\\\)" "optimized" { target ia32 } } } */
+
+int foo (unsigned int x) { return __builtin_ctzll (x); }
+int bar (unsigned int x) { return __builtin_popcountll (x); }
+int baz (unsigned int x) { return __builtin_parityll (x); }
+int qux (int x) { return __builtin_ffsll (x); }
+int corge (int x) { return __builtin_ctzll (x); }
+int garply (int x) { return __builtin_parityll (x); }
+int fred (unsigned int x) { return __builtin_ffsll (x); }
--- gcc/testsuite/gcc.dg/pr112566-2.c.jj	2023-11-16 13:32:30.854138101 +0100
+++ gcc/testsuite/gcc.dg/pr112566-2.c	2023-11-16 13:34:14.727687104 +0100
@@ -0,0 +1,12 @@ 
+/* PR tree-optimization/112566 */
+/* { dg-do compile { target bitint575 } } */
+/* { dg-options "-O2 -fdump-tree-ccp2" } */
+/* { dg-final { scan-tree-dump-not "\\\((unsigned )?_BitInt\\\(512\\\)\\\)" "ccp2" } } */
+
+int foo (unsigned _BitInt(256) x) { return __builtin_ctzg ((unsigned _BitInt(512)) x); }
+int bar (unsigned _BitInt(256) x) { return __builtin_popcountg ((unsigned _BitInt(512)) x); }
+int baz (unsigned _BitInt(256) x) { return __builtin_parityg ((unsigned _BitInt(512)) x); }
+int qux (_BitInt(256) x) { return __builtin_ffsg ((_BitInt(512)) x); }
+int corge (_BitInt(256) x) { return __builtin_ctzg ((unsigned _BitInt(512)) x); }
+int garply (_BitInt(256) x) { return __builtin_parityg ((unsigned _BitInt(512)) x); }
+int fred (unsigned _BitInt(256) x) { return __builtin_ffsg ((_BitInt(512)) x); }
--- gcc/testsuite/gcc.target/i386/pr78057.c.jj	2020-01-14 20:02:47.876593462 +0100
+++ gcc/testsuite/gcc.target/i386/pr78057.c	2023-11-17 14:27:39.120666182 +0100
@@ -5,7 +5,7 @@ 
 extern void link_error (void);
 
 int
-foo (int x)
+foo (int x, long long y)
 {
   if (__builtin_ia32_tzcnt_u16 (16) != 4
       || __builtin_ia32_tzcnt_u16 (0) != 16
@@ -24,13 +24,14 @@  foo (int x)
      )
     link_error ();
   x += 2;
-  if (x == 0)
+  y += 2;
+  if (x == 0 || y == 0)
     return 5;
   return __builtin_ia32_tzcnt_u32 (x)
          + __builtin_ia32_lzcnt_u32 (x)
 #ifdef __x86_64__
-	 + __builtin_ia32_tzcnt_u64 (x)
-	 + __builtin_ia32_lzcnt_u64 (x)
+	 + __builtin_ia32_tzcnt_u64 (y)
+	 + __builtin_ia32_lzcnt_u64 (y)
 #endif
 	 ;
 }