[1/4] middle-end Support not decomposing specific divisions during vectorization.

Message ID patch-15779-tamar@arm.com
State New, archived
Headers
Series [1/4] middle-end Support not decomposing specific divisions during vectorization. |

Commit Message

Tamar Christina Sept. 23, 2022, 9:33 a.m. UTC
  Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

e.g.:

   x = y / (2 ^ (bitsize (y) / 2) - 1)

This patch adds a new target hook can_special_div_by_const, similar to
can_vec_perm which can be called to check if a target will handle a particular
division in a special way in the back-end.

The vectorizer will then vectorize the division using the standard tree code
and at expansion time the hook is called again to generate the code for the
division.

A lot of the changes in the patch are to pass down the tree operands in all
paths that can lead to the divmod expansion, so that the target hook always has
the type of the expression being expanded, since the types can change the
expansion.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
	* expmed.cc (expand_divmod): Likewise.
	* explow.cc (round_push, align_dynamic_address): Likewise.
	* expr.cc (force_operand, expand_expr_divmod): Likewise.
	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
	Likewise.
	* target.h: Include tree-core.
	* target.def (can_special_div_by_const): New.
	* targhooks.cc (default_can_special_div_by_const): New.
	* targhooks.h (default_can_special_div_by_const): New.
	* tree-vect-generic.cc (expand_vector_operation): Use it.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
	* tree-vect-stmts.cc (vectorizable_operation): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
	* gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch -- 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644




--
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the two operands @code{treeop0}
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the vectorizer and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @code{treeop0}\n\
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the vectorizer and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default implementation: no division by constant is treated as special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+							  rhs2, NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
  

Comments

Richard Biener Sept. 26, 2022, 10:39 a.m. UTC | #1
On Fri, 23 Sep 2022, Tamar Christina wrote:

> Hi All,
> 
> In plenty of image and video processing code it's common to modify pixel values
> by a widening operation and then scale them back into range by dividing by 255.
> 
> e.g.:
> 
>    x = y / (2 ^ (bitsize (y)/2)-1
> 
> This patch adds a new target hook can_special_div_by_const, similar to
> can_vec_perm which can be called to check if a target will handle a particular
> division in a special way in the back-end.
> 
> The vectorizer will then vectorize the division using the standard tree code
> and at expansion time the hook is called again to generate the code for the
> division.
> 
> Alot of the changes in the patch are to pass down the tree operands in all paths
> that can lead to the divmod expansion so that the target hook always has the
> type of the expression you're expanding since the types can change the
> expansion.

The type of the expression should be available via the mode and the
signedness, no?  So maybe, to avoid having both RTX and TREE on the
target hook, pass it a wide_int for the divisor instead?

> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
>  implementation approaches itself.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
> +This hook is used to test whether the target has a special method of
> +division of vectors of type @var{vectype} using the two operands @code{treeop0},
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The division
> +will then not be decomposed by the and kept as a div.
> +
> +When the hook is being used to test whether the target supports a special
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
> +is being used to emit a division, @var{in0} and @var{in1} are the source
> +vectors of type @var{vecttype} and @var{output} is the destination vector of
> +type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
>  This hook should return the decl of a function that implements the
>  vectorized variant of the function with the @code{combined_fn} code
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
>  
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
>  
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
>  
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc
> index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
>  
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h
> index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
>  extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
>  
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
>  
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
>  
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
>  
> +  /* Check if the target has specific expansions for the division.  */
> +  if (treeop0
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
> +						     treeop0, treeop1,
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p);
> @@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
>  
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
> -				     gen_int_mode (INTVAL (op1), word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
>  
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
> +				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
>  
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
>  
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
> +				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
>  
> diff --git a/gcc/target.def b/gcc/target.def
> index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
>  
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method of\n\
> +division of vectors of type @var{vectype} using the two operands @code{treeop0},\n\
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
> +will then not be decomposed by the vectorizer and kept as a div.\n\
> +\n\
> +When the hook is being used to test whether the target supports a special\n\
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
> +is being used to emit a division, @var{in0} and @var{in1} are the source\n\
> +vectors of type @var{vectype} and @var{output} is the destination vector of\n\
> +type @var{vectype}.\n\
> +\n\
> +Return true if the operation is possible, emitting instructions for it\n\
> +if rtxes are provided and updating @var{output}.",
> + bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */
> diff --git a/gcc/target.h b/gcc/target.h
> index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
>  
>  #if CHECKING_P
>  
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
>  extern rtx default_addr_space_convert (rtx, tree, tree);
>  extern unsigned int default_case_values_threshold (void);
>  extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
> +					      rtx *, rtx, rtx);
>  
>  extern bool default_libc_has_function (enum function_class, tree);
>  extern bool default_libc_has_fast_function (int fcode);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
>  
> +/* Default that no division by constant operations are special.  */
> +bool
> +default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
>  
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> +							  rhs2, NULL,
> +							  NULL_RTX, NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>  
>        return pattern_stmt;
>      }
> +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> +						       oprnd0, oprnd1, NULL,
> +						       NULL_RTX, NULL_RTX))
> +    {
> +      return NULL;
> +    }
>  
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      if (!target_support_p)
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							op0, op1, NULL,
> +							NULL_RTX, NULL_RTX);
>      }
>  
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> 
> 
> 
> 
>
  
Tamar Christina Oct. 31, 2022, 11:34 a.m. UTC | #2
> 
> The type of the expression should be available via the mode and the
> signedness, no?  So maybe to avoid having both RTX and TREE on the target
> hook pass it a wide_int instead for the divisor?
> 

Done.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
	* expmed.cc (expand_divmod): Likewise.
	* explow.cc (round_push, align_dynamic_address): Likewise.
	* expr.cc (force_operand, expand_expr_divmod): Likewise.
	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
	Likewise.
	* target.h: Include tree-core.h.
	* target.def (can_special_div_by_const): New.
	* targhooks.cc (default_can_special_div_by_const): New.
	* targhooks.h (default_can_special_div_by_const): New.
	* tree-vect-generic.cc (expand_vector_operation): Use it.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
	* tree-vect-stmts.cc (vectorizable_operation): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
	* gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch ---

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094c7a712c0604fb77 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the value @var{constant},
+and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the vectorizer and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7333f271b1965b78 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  tree cst;
+  if (treeop0
+      && treeop1
+      && (cst = uniform_integer_cst_p (treeop1))
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     wi::to_wide (cst),
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dacf18d0e0b133e4c 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the value @var{constant},\n\
+and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the vectorizer and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, wide_int constant, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887dd980b1a7c5cf1c 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b69fe04593ae3419e 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, wide_int, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e79d66aa24e2f3b 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  tree cst = uniform_integer_cst_p (rhs2);
+	  if (cst &&
+	      targetm.vectorize.can_special_div_by_const (code, type,
+							  wi::to_wide (cst),
+							  NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0affd56e7314370 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
   gimple *pattern_stmt, *def_stmt;
   enum tree_code rhs_code;
   optab optab;
-  tree q;
+  tree q, cst;
   int dummy_int, prec;
 
   if (!is_gimple_assign (last_stmt))
@@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+							  wi::to_wide (cst),
+							  NULL, NULL_RTX,
+							  NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e83b67e3ea53d25f4 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      tree cst;
+      if (!target_support_p
+	  && (cst = uniform_integer_cst_p (op1)))
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							wi::to_wide (cst),
+							NULL, NULL_RTX,
+							NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
  
Jeff Law Oct. 31, 2022, 5:12 p.m. UTC | #3
On 10/31/22 05:34, Tamar Christina wrote:
>> The type of the expression should be available via the mode and the
>> signedness, no?  So maybe to avoid having both RTX and TREE on the target
>> hook pass it a wide_int instead for the divisor?
>>
> Done.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
>
> --- inline copy of patch ---
>
OK for the trunk.


Jeff
  
Tamar Christina Nov. 8, 2022, 5:36 p.m. UTC | #4
Ping.

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: Richard Biener <rguenther@suse.de>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jeffreyalaw@gmail.com
> Subject: RE: [PATCH 1/4]middle-end Support not decomposing specific
> divisions during vectorization.
> 
> >
> > The type of the expression should be available via the mode and the
> > signedness, no?  So maybe to avoid having both RTX and TREE on the
> > target hook pass it a wide_int instead for the divisor?
> >
> 
> Done.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* expmed.h (expand_divmod): Pass tree operands down in addition
> to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod,
> expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094
> c7a712c0604fb77 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook
> to handle these two  implementation approaches itself.
>  @end deftypefn
> 
> +@deftypefn {Target Hook} bool
> TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +(enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx
> +*@var{output}, rtx @var{in0}, rtx @var{in1}) This hook is used to test
> +whether the target has a special method of division of vectors of type
> +@var{vectype} using the value @var{constant}, and producing a vector of
> type @var{vectype}.  The division will then not be decomposed by the
> vectorizer and kept as a div.
> +
> +When the hook is being used to test whether the target supports a
> +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> +When the hook is being used to emit a division, @var{in0} and @var{in1}
> +are the source vectors of type @var{vectype} and @var{output} is the
> +destination vector of type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree
> TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> should return the decl of a function that implements the  vectorized variant
> of the function with the @code{combined_fn} code diff --git
> a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> 076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> can generate better code.
> 
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> 
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> 
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc index
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> 5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> + align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> 
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h index
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> machine_mode,  extern rtx expand_shift (enum tree_code, machine_mode,
> rtx, poly_int64, rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods =
> OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
> 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7
> 333f271b1965b78 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> op0, HOST_WIDE_INT d)
> 
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
> 
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> 
> +  /* Check if the target has specific expansions for the division.  */
> +  tree cst;
> +  if (treeop0
> +      && treeop1
> +      && (cst = uniform_integer_cst_p (treeop1))
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> (treeop0),
> +						     wi::to_wide (cst),
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8
> @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
> 
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> 872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> sum,
> -				     gen_int_mode (INTVAL (op1),
> word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
> 
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> quot1,
> +				op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> quot1,
> +				 op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
> 
> diff --git a/gcc/target.def b/gcc/target.def index
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dac
> f18d0e0b133e4c 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
> 
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method
> +of\n\ division of vectors of type @var{vectype} using the value
> +@var{constant},\n\ and producing a vector of type @var{vectype}.  The
> +division\n\ will then not be decomposed by the vectorizer and kept as a div.\n\
> +\n\ When the hook is being used to test whether the target supports a
> +special\n\ divide, @var{in0}, @var{in1}, and @var{output} are all null.
> +When the hook\n\ is being used to emit a division, @var{in0} and
> +@var{in1} are the source\n\ vectors of type @var{vectype} and
> +@var{output} is the destination vector of\n\ type @var{vectype}.\n\ \n\
> +Return true if the operation is possible, emitting instructions for
> +it\n\ if rtxes are provided and updating @var{output}.",  bool, (enum
> +tree_code, tree vectype, wide_int constant, rtx *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */ diff --git a/gcc/target.h
> b/gcc/target.h index
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
> 
>  #if CHECKING_P
> 
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887
> dd980b1a7c5cf1c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> tree, tree);  extern unsigned int default_case_values_threshold (void);
> extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree,
> wide_int,
> +					      rtx *, rtx, rtx);
> 
>  extern bool default_libc_has_function (enum function_class, tree);  extern
> bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> b/gcc/targhooks.cc index
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b6
> 9fe04593ae3419e 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
> 
> +/* Default that no division by constant operations are special.  */
> +bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
> +rtx *, rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> 157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> +} } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> 832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e7
> 9d66aa24e2f3b 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator
> *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
> 
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  tree cst = uniform_integer_cst_p (rhs2);
> +	  if (cst &&
> +	      targetm.vectorize.can_special_div_by_const (code, type,
> +							  wi::to_wide (cst),
> +							  NULL,
> +							  NULL_RTX,
> NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> patterns.cc b/gcc/tree-vect-patterns.cc index
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0
> affd56e7314370 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>    gimple *pattern_stmt, *def_stmt;
>    enum tree_code rhs_code;
>    optab optab;
> -  tree q;
> +  tree q, cst;
>    int dummy_int, prec;
> 
>    if (!is_gimple_assign (last_stmt))
> @@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> 
>        return pattern_stmt;
>      }
> +  else if ((cst = uniform_integer_cst_p (oprnd1))
> +	   && targetm.vectorize.can_special_div_by_const (rhs_code,
> vectype,
> +							  wi::to_wide (cst),
> +							  NULL, NULL_RTX,
> +							  NULL_RTX))
> +    {
> +      return NULL;
> +    }
> 
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e8
> 3b67e3ea53d25f4 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      tree cst;
> +      if (!target_support_p
> +	  && (cst = uniform_integer_cst_p (op1)))
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							wi::to_wide (cst),
> +							NULL, NULL_RTX,
> +							NULL_RTX);
>      }
> 
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
  
Richard Biener Nov. 9, 2022, 8:01 a.m. UTC | #5
On Tue, 8 Nov 2022, Tamar Christina wrote:

> Ping.

Jeff approved this already.  I think it's OK if the rest of the series
is approved.

Richard.

> > -----Original Message-----
> > From: Tamar Christina
> > Sent: Monday, October 31, 2022 11:35 AM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH 1/4]middle-end Support not decomposing specific
> > divisions during vectorization.
> > 
> > >
> > > The type of the expression should be available via the mode and the
> > > signedness, no?  So maybe to avoid having both RTX and TREE on the
> > > target hook pass it a wide_int instead for the divisor?
> > >
> > 
> > Done.
> > 
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> > 
> > Ok for master?
> > 
> > Thanks,
> > Tamar
> > 
> > gcc/ChangeLog:
> > 
> > 	* expmed.h (expand_divmod): Pass tree operands down in addition
> > to RTX.
> > 	* expmed.cc (expand_divmod): Likewise.
> > 	* explow.cc (round_push, align_dynamic_address): Likewise.
> > 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> > 	* optabs.cc (expand_doubleword_mod,
> > expand_doubleword_divmod):
> > 	Likewise.
> > 	* target.h: Include tree-core.
> > 	* target.def (can_special_div_by_const): New.
> > 	* targhooks.cc (default_can_special_div_by_const): New.
> > 	* targhooks.h (default_can_special_div_by_const): New.
> > 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> > 	* doc/tm.texi.in: Document it.
> > 	* doc/tm.texi: Regenerate.
> > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> > support.
> > 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > 
> > --- inline copy of patch ---
> > 
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> > 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094
> > c7a712c0604fb77 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook
> > to handle these two  implementation approaches itself.
> >  @end deftypefn
> > 
> > +@deftypefn {Target Hook} bool
> > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +(enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx
> > +*@var{output}, rtx @var{in0}, rtx @var{in1}) This hook is used to test
> > +whether the target has a special method of division of vectors of type
> > +@var{vectype} using the value @var{constant}, and producing a vector of
> > type @var{vectype}.  The division will then not be decomposed by the
> > vectorizer and kept as a div.
> > +
> > +When the hook is being used to test whether the target supports a
> > +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook is being used to emit a division, @var{in0} and @var{in1}
> > +are the source vectors of type @var{vectype} and @var{output} is the
> > +destination vector of type @var{vectype}.
> > +
> > +Return true if the operation is possible, emitting instructions for it
> > +if rtxes are provided and updating @var{output}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} tree
> > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> > @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> > should return the decl of a function that implements the  vectorized variant
> > of the function with the @code{combined_fn} code diff --git
> > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> > 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> > 076d058c24ce093 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> > can generate better code.
> > 
> >  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> > 
> > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +
> >  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> > 
> >  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> > diff --git a/gcc/explow.cc b/gcc/explow.cc index
> > ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> > 5e346bf34ba0036 100644
> > --- a/gcc/explow.cc
> > +++ b/gcc/explow.cc
> > @@ -1037,7 +1037,7 @@ round_push (rtx size)
> >       TRUNC_DIV_EXPR.  */
> >    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
> >  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> > +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> > + align_rtx,
> >  			NULL_RTX, 1);
> >    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> > 
> > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> > required_align)
> >  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
> >  				       Pmode),
> >  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> > +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> > target,
> >  			  gen_int_mode (required_align / BITS_PER_UNIT,
> >  					Pmode),
> >  			  NULL_RTX, 1);
> > diff --git a/gcc/expmed.h b/gcc/expmed.h index
> > 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> > f33cb3595659b5 100644
> > --- a/gcc/expmed.h
> > +++ b/gcc/expmed.h
> > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> > machine_mode,  extern rtx expand_shift (enum tree_code, machine_mode,
> > rtx, poly_int64, rtx,
> >  			 int);
> >  #ifdef GCC_OPTABS_H
> > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> > -			  rtx, int, enum optab_methods =
> > OPTAB_LIB_WIDEN);
> > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> > tree,
> > +			  rtx, rtx, rtx, int,
> > +			  enum optab_methods = OPTAB_LIB_WIDEN);
> >  #endif
> >  #endif
> > 
> > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> > 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7
> > 333f271b1965b78 100644
> > --- a/gcc/expmed.cc
> > +++ b/gcc/expmed.cc
> > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> > op0, HOST_WIDE_INT d)
> > 
> >  rtx
> >  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> > mode,
> > -	       rtx op0, rtx op1, rtx target, int unsignedp,
> > -	       enum optab_methods methods)
> > +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> > +	       int unsignedp, enum optab_methods methods)
> >  {
> >    machine_mode compute_mode;
> >    rtx tquotient;
> > @@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> > 
> >    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> > 
> > +  /* Check if the target has specific expansions for the division.  */
> > +  tree cst;
> > +  if (treeop0
> > +      && treeop1
> > +      && (cst = uniform_integer_cst_p (treeop1))
> > +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> > (treeop0),
> > +						     wi::to_wide (cst),
> > +						     &target, op0, op1))
> > +    return target;
> > +
> > +
> >    /* Now convert to the best mode to use.  */
> >    if (compute_mode != mode)
> >      {
> > @@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  			    || (optab_handler (sdivmod_optab, int_mode)
> >  				!= CODE_FOR_nothing)))
> >  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> > -						int_mode, op0,
> > -						gen_int_mode (abs_d,
> > +						int_mode, treeop0, treeop1,
> > +						op0, gen_int_mode (abs_d,
> >  							      int_mode),
> >  						NULL_RTX, 0);
> >  		    else
> > @@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  				      size - 1, NULL_RTX, 0);
> >  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
> >  				    NULL_RTX);
> > -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> > op1,
> > -				    NULL_RTX, 0);
> > +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> > treeop0,
> > +				    treeop1, t3, op1, NULL_RTX, 0);
> >  		if (t4)
> >  		  {
> >  		    rtx t5;
> > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > index
> > 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> > a8abc055fa34d9 100644
> > --- a/gcc/expr.cc
> > +++ b/gcc/expr.cc
> > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
> >  	    return expand_divmod (0,
> >  				  FLOAT_MODE_P (GET_MODE (value))
> >  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> > -				  GET_MODE (value), op1, op2, target, 0);
> > +				  GET_MODE (value), NULL, NULL, op1, op2,
> > +				  target, 0);
> >  	case MOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 0);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 0);
> >  	case UDIV:
> > -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case UMOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case ASHIFTRT:
> >  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
> >  				      target, 0, OPTAB_LIB_WIDEN);
> > @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> > machine_mode mode, tree treeop0,
> >        bool speed_p = optimize_insn_for_speed_p ();
> >        do_pending_stack_adjust ();
> >        start_sequence ();
> > -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> > +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 1);
> >        rtx_insn *uns_insns = get_insns ();
> >        end_sequence ();
> >        start_sequence ();
> > -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> > +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 0);
> >        rtx_insn *sgn_insns = get_insns ();
> >        end_sequence ();
> >        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8
> > @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> > treeop0,
> >        emit_insn (sgn_insns);
> >        return sgn_ret;
> >      }
> > -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> > +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +			op0, op1, target, unsignedp);
> >  }
> > 
> >  rtx
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> > 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> > 872f340855dc96 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> > rtx op0, rtx op1, bool unsignedp)
> >  		return NULL_RTX;
> >  	    }
> >  	}
> > -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > sum,
> > -				     gen_int_mode (INTVAL (op1),
> > word_mode),
> > +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > NULL, NULL,
> > +				     sum, gen_int_mode (INTVAL (op1),
> > +							word_mode),
> >  				     NULL_RTX, 1, OPTAB_DIRECT);
> >        if (remainder == NULL_RTX)
> >  	return NULL_RTX;
> > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> > 
> >    if (op11 != const1_rtx)
> >      {
> > -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> > -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> > quot1,
> > +				op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> > 
> > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> > 
> > -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> > -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> > quot1,
> > +				 op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (quot2 == NULL_RTX)
> >  	return NULL_RTX;
> > 
> > diff --git a/gcc/target.def b/gcc/target.def index
> > 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dac
> > f18d0e0b133e4c 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -1902,6 +1902,25 @@ implementation approaches itself.",
> >  	const vec_perm_indices &sel),
> >   NULL)
> > 
> > +DEFHOOK
> > +(can_special_div_by_const,
> > + "This hook is used to test whether the target has a special method
> > +of\n\ division of vectors of type @var{vectype} using the value
> > +@var{constant},\n\ and producing a vector of type @var{vectype}.  The
> > +division\n\ will then not be decomposed by the and kept as a div.\n\
> > +\n\ When the hook is being used to test whether the target supports a
> > +special\n\ divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook\n\ is being used to emit a division, @var{in0} and
> > +@var{in1} are the source\n\ vectors of type @var{vecttype} and
> > +@var{output} is the destination vector of\n\ type @var{vectype}.\n\ \n\
> > +Return true if the operation is possible, emitting instructions for
> > +it\n\ if rtxes are provided and updating @var{output}.",  bool, (enum
> > +tree_code, tree vectype, wide_int constant, rtx *output,
> > +	rtx in0, rtx in1),
> > + default_can_special_div_by_const)
> > +
> >  /* Return true if the target supports misaligned store/load of a
> >     specific factor denoted in the third parameter.  The last parameter
> >     is true if the access is defined in a packed struct.  */ diff --git a/gcc/target.h
> > b/gcc/target.h index
> > d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> > f39c061f68b665 100644
> > --- a/gcc/target.h
> > +++ b/gcc/target.h
> > @@ -51,6 +51,7 @@
> >  #include "insn-codes.h"
> >  #include "tm.h"
> >  #include "hard-reg-set.h"
> > +#include "tree-core.h"
> > 
> >  #if CHECKING_P
> > 
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> > ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887
> > dd980b1a7c5cf1c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> > (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> > tree, tree);  extern unsigned int default_case_values_threshold (void);
> > extern bool default_have_conditional_execution (void);
> > +extern bool default_can_special_div_by_const (enum tree_code, tree,
> > wide_int,
> > +					      rtx *, rtx, rtx);
> > 
> >  extern bool default_libc_has_function (enum function_class, tree);  extern
> > bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> > b/gcc/targhooks.cc index
> > b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b6
> > 9fe04593ae3419e 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
> >    return HAVE_conditional_execution;
> >  }
> > 
> > +/* Default that no division by constant operations are special.  */
> > +bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
> > +rtx *, rtx,
> > +				  rtx)
> > +{
> > +  return false;
> > +}
> > +
> >  /* By default we assume that c99 functions are present at the runtime,
> >     but sincos is not.  */
> >  bool
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> > d7b4d5b64a19b9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> > db75b3e4112e2cc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> > 157701d9d1cf852
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> > +} } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > 832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e7
> > 9d66aa24e2f3b 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator
> > *gsi, tree type, tree compute_type
> >  	  tree rhs2 = gimple_assign_rhs2 (assign);
> >  	  tree ret;
> > 
> > +	  /* Check if the target was going to handle it through the special
> > +	     division callback hook.  */
> > +	  tree cst = uniform_integer_cst_p (rhs2);
> > +	  if (cst &&
> > +	      targetm.vectorize.can_special_div_by_const (code, type,
> > +							  wi::to_wide (cst),
> > +							  NULL,
> > +							  NULL_RTX,
> > NULL_RTX))
> > +	    return NULL_TREE;
> > +
> > +
> >  	  if (!optimize
> >  	      || !VECTOR_INTEGER_TYPE_P (type)
> >  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> > patterns.cc b/gcc/tree-vect-patterns.cc index
> > 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0
> > affd56e7314370 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >    gimple *pattern_stmt, *def_stmt;
> >    enum tree_code rhs_code;
> >    optab optab;
> > -  tree q;
> > +  tree q, cst;
> >    int dummy_int, prec;
> > 
> >    if (!is_gimple_assign (last_stmt))
> > @@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> > 
> >        return pattern_stmt;
> >      }
> > +  else if ((cst = uniform_integer_cst_p (oprnd1))
> > +	   && targetm.vectorize.can_special_div_by_const (rhs_code,
> > vectype,
> > +							  wi::to_wide (cst),
> > +							  NULL, NULL_RTX,
> > +							  NULL_RTX))
> > +    {
> > +      return NULL;
> > +    }
> > 
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> > c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e8
> > 3b67e3ea53d25f4 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
> >  	}
> >        target_support_p = (optab_handler (optab, vec_mode)
> >  			  != CODE_FOR_nothing);
> > +      tree cst;
> > +      if (!target_support_p
> > +	  && (cst = uniform_integer_cst_p (op1)))
> > +	target_support_p
> > +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> > +							wi::to_wide (cst),
> > +							NULL, NULL_RTX,
> > +							NULL_RTX);
> >      }
> > 
> >    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
>
  
Tamar Christina Nov. 9, 2022, 8:26 a.m. UTC | #6
Ah sorry, I missed that one.

Thanks,
Tamar
  
Kyrylo Tkachov Nov. 9, 2022, 10:37 a.m. UTC | #7
Hi Tamar,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+kyrylo.tkachov=arm.com@gcc.gnu.org> On Behalf Of Tamar
> Christina via Gcc-patches
> Sent: Friday, September 23, 2022 10:33 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; rguenther@suse.de
> Subject: [PATCH 1/4]middle-end Support not decomposing specific divisions
> during vectorization.
> 
> Hi All,
> 
> In plenty of image and video processing code it's common to modify pixel
> values
> by a widening operation and then scale them back into range by dividing by
> 255.
> 
> e.g.:
> 
>    x = y / (2 ^ (bitsize (y)/2)-1
> 
> This patch adds a new target hook can_special_div_by_const, similar to
> can_vec_perm which can be called to check if a target will handle a particular
> division in a special way in the back-end.
> 
> The vectorizer will then vectorize the division using the standard tree code
> and at expansion time the hook is called again to generate the code for the
> division.
> 
> Alot of the changes in the patch are to pass down the tree operands in all
> paths
> that can lead to the divmod expansion so that the target hook always has the
> type of the expression you're expanding since the types can change the
> expansion.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* expmed.h (expand_divmod): Pass tree operands down in addition
> to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod,
> expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch --
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d
> 244a2a23e76cac097 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the
> hook to handle these two
>  implementation approaches itself.
>  @end deftypefn
> 
> +@deftypefn {Target Hook} bool
> TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code},
> tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx
> *@var{output}, rtx @var{in0}, rtx @var{in1})
> +This hook is used to test whether the target has a special method of
> +division of vectors of type @var{vectype} using the two operands
> @code{treeop0},
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The
> division
> +will then not be decomposed by the and kept as a div.

I think the grammar here is wonky, can you reword this sentence please?
(I was just reading this patch to understand the optab semantics further in the series)
Thanks,
Kyrill

> +
> +When the hook is being used to test whether the target supports a special
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
> +is being used to emit a division, @var{in0} and @var{in1} are the source
> +vectors of type @var{vecttype} and @var{output} is the destination vector
> of
> +type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree
> TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
>  This hook should return the decl of a function that implements the
>  vectorized variant of the function with the @code{combined_fn} code
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b
> 04076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> can generate better code.
> 
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> 
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> 
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc
> index
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae67
> 8f5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> 
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h
> index
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501
> c6f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> machine_mode,
>  extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64,
> rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods =
> OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
> 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb
> 0990db8b97d3af414 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> op0, HOST_WIDE_INT d)
> 
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
> 
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> 
> +  /* Check if the target has specific expansions for the division.  */
> +  if (treeop0
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> (treeop0),
> +						     treeop0, treeop1,
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd
> 96a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p);
> @@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
> 
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abf
> d872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> sum,
> -				     gen_int_mode (INTVAL (op1),
> word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
> 
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> quot1,
> +				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> quot1,
> +				 op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
> 
> diff --git a/gcc/target.def b/gcc/target.def
> index
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b0
> 7081cdd70113db9b1 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
> 
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method of\n\
> +division of vectors of type @var{vectype} using the two operands
> @code{treeop0},\n\
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The
> division\n\
> +will then not be decomposed by the and kept as a div.\n\
> +\n\
> +When the hook is being used to test whether the target supports a
> special\n\
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the
> hook\n\
> +is being used to emit a division, @var{in0} and @var{in1} are the source\n\
> +vectors of type @var{vecttype} and @var{output} is the destination vector
> of\n\
> +type @var{vectype}.\n\
> +\n\
> +Return true if the operation is possible, emitting instructions for it\n\
> +if rtxes are provided and updating @var{output}.",
> + bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx
> *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */
> diff --git a/gcc/target.h b/gcc/target.h
> index
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da
> 56f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
> 
>  #if CHECKING_P
> 
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index
> ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e
> 2640d63f936b336d 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> (addr_space_t, location_t);
>  extern rtx default_addr_space_convert (rtx, tree, tree);
>  extern unsigned int default_case_values_threshold (void);
>  extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree, tree,
> tree,
> +					      rtx *, rtx, rtx);
> 
>  extern bool default_libc_has_function (enum function_class, tree);
>  extern bool default_libc_has_fast_function (int fcode);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba2
> 41279936ced41ee95 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
> 
> +/* Default that no division by constant operations are special.  */
> +bool
> +default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *,
> rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b491
> 6f3d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected"
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2c
> d3db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected"
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e7
> 20157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected"
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bf
> d1832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index
> 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817
> c9a12046b6ec94f3 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator
> *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
> 
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> +							  rhs2, NULL,
> +							  NULL_RTX,
> NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85
> af0b1bfea10fe443 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> 
>        return pattern_stmt;
>      }
> +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> +						       oprnd0, oprnd1, NULL,
> +						       NULL_RTX, NULL_RTX))
> +    {
> +      return NULL;
> +    }
> 
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index
> c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288b
> d68e0e1c1e93faafe 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      if (!target_support_p)
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							op0, op1, NULL,
> +							NULL_RTX,
> NULL_RTX);
>      }
> 
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> 
> 
> 
> 
> --
  

Patch

--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@  instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the two operands @var{treeop0}
+and @var{treeop1} and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the vectorizer and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@  address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@  round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@  align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@  extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@  expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4816,8 @@  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@  force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@  expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@  expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@  expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@  expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@  expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@  implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @var{treeop0}\n\
+and @var{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the vectorizer and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@ 
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@  extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@  default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@ 
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@ 
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@  expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+							  rhs2, NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@  vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@  vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);