[2/2] AArch64 Perform more late folding of reg moves and shifts which arrive after expand

Message ID Yy2b1o/foRR6xvBZ@arm.com
State New, archived
Series [1/2] middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone

Commit Message

Tamar Christina Sept. 23, 2022, 11:43 a.m. UTC
  Hi All,

Similar to the 1/2 patch, but this one adds additional back-end-specific folding
for cases where the register sequence was created as a result of RTL optimizations.

Concretely:

#include <arm_neon.h>

unsigned int foor (uint32x4_t x)
{
    return x[1] >> 16;
}

generates:

foor:
        umov    w0, v0.h[3]
        ret

instead of

foor:
        umov    w0, v0.s[1]
        lsr     w0, w0, 16
        ret
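
To see why the single lane move is equivalent: on little-endian (which is what
the new tests target), the top 16 bits of 32-bit lane 1 occupy the same bits of
the vector register as 16-bit lane 3.  A small self-contained sketch of that
equivalence (illustration only, not part of the patch):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  uint32x4_t x = { 0x11112222u, 0x33334444u, 0u, 0u };
  /* Reinterpret the same 128-bit register as eight 16-bit lanes.  */
  uint16x8_t h = vreinterpretq_u16_u32 (x);
  /* Upper half of 32-bit lane 1 == 16-bit lane 3 (0x3333 in both cases).  */
  assert ((x[1] >> 16) == h[3]);
  return 0;
}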

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
	left and right ones.
	* config/aarch64/constraints.md (Usl): New.
	* config/aarch64/iterators.md (SHIFT_arith, LSHIFTRT): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/shift-read.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_uxtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
+	(zero_extend:DI (LSHIFTRT:SI
+	 (match_operand:SI 1 "register_operand" "w,r,r")
+	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  ""
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  machine_mode dest, vec_mode;
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+	  if (size == 16)
+	    dest = HImode;
+	  else if (size == 8)
+	    dest = QImode;
+	  else
+	    gcc_unreachable ();
+
+	  /* Get nearest 64-bit vector mode.  */
+	  int nunits = 64 / size;
+	  auto vector_mode
+	    = mode_for_vector (as_a <scalar_mode> (dest), nunits);
+	  if (!vector_mode.exists (&vec_mode))
+	    gcc_unreachable ();
+	  operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
+	  operands[2] = gen_int_mode (val / size, SImode);
+
+	  /* Ideally we would just call aarch64_get_lane_zero_extend, but reload
+	     gets into a weird loop due to a mov of w -> r being present most of
+	     the time this instruction applies.  */
+	  switch (dest)
+	  {
+	    case QImode:
+	      return "umov\\t%w0, %1.b[%2]";
+	    case HImode:
+	      return "umov\\t%w0, %1.h[%2]";
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -166,6 +166,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
+
+;; Singleton code iterator for only logical right shift.
+(define_code_iterator LSHIFTRT [lshiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
  

Comments

Richard Sandiford Sept. 23, 2022, 2:32 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Similar to the 1/2 patch but adds additional back-end specific folding for if
> the register sequence was created as a result of RTL optimizations.
>
> Concretely:
>
> #include <arm_neon.h>
>
> unsigned int foor (uint32x4_t x)
> {
>     return x[1] >> 16;
> }
>
> generates:
>
> foor:
>         umov    w0, v0.h[3]
>         ret
>
> instead of
>
> foor:
>         umov    w0, v0.s[1]
>         lsr     w0, w0, 16
>         ret

The same thing ought to work for smov, so it would be good to do both.
That would also make the split between the original and new patterns
more obvious: left shift for the old pattern, right shift for the new
pattern.
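
For concreteness, the signed analogue would be something like (a sketch that
mirrors the example above; the expected output assumes the analogous smov
pattern):

#include <arm_neon.h>

int foos (int32x4_t x)
{
    return x[1] >> 16;
}

which should then become "smov w0, v0.h[3]" rather than a 32-bit lane move
followed by an asr.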

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
> 	left and right ones.
> 	* config/aarch64/constraints.md (Usl): New.
> 	* config/aarch64/iterators.md (SHIFT_NL, LSHIFTRT): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/shift-read.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -	(zero_extend:DI (SHIFT_no_rotate:SI
> +	(zero_extend:DI (SHIFT_arith:SI
>  	 (match_operand:SI 1 "register_operand" "r,r")
>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>  
> +(define_insn "*<optab>si3_insn2_uxtw"
> +  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")

Is the "?" justified?  It seems odd to penalise a native,
single-instruction r->r operation in favour of a w->r operation.

> +	(zero_extend:DI (LSHIFTRT:SI
> +	 (match_operand:SI 1 "register_operand" "w,r,r")
> +	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  ""
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  machine_mode dest, vec_mode;
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +	  if (size == 16)
> +	    dest = HImode;
> +	  else if (size == 8)
> +	    dest = QImode;
> +	  else
> +	    gcc_unreachable ();
> +
> +	  /* Get nearest 64-bit vector mode.  */
> +	  int nunits = 64 / size;
> +	  auto vector_mode
> +	    = mode_for_vector (as_a <scalar_mode> (dest), nunits);
> +	  if (!vector_mode.exists (&vec_mode))
> +	    gcc_unreachable ();
> +	  operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
> +	  operands[2] = gen_int_mode (val / size, SImode);
> +
> +	  /* Ideally we just call aarch64_get_lane_zero_extend but reload gets
> +	     into a weird loop due to a mov of w -> r being present most time
> +	     this instruction applies.  */
> +	  switch (dest)
> +	  {
> +	    case QImode:
> +	      return "umov\\t%w0, %1.b[%2]";
> +	    case HImode:
> +	      return "umov\\t%w0, %1.h[%2]";
> +	    default:
> +	      gcc_unreachable ();
> +	  }

Doesn't this reduce to something like:

  if (size == 16)
    return "umov\\t%w0, %1.h[1]";
  if (size == 8)
    return "umov\\t%w0, %1.b[3]";
  gcc_unreachable ();

?  We should print %1 correctly as vN even with its original type.

Thanks,
Richard

> +	}
> +      case 1:
> +	return "<shift>\\t%w0, %w1, %2";
> +      case 2:
> +	return "<shift>\\t%w0, %w1, %w2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>  
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>   (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>  
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
> +
> +;; Singleton code iterator for only logical right shift.
> +(define_code_iterator LSHIFTRT [lshiftrt])
>  
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +** 	umov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +** 	umov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +** 	umov	w0, v[0-8]+.h\[1\]
> +** 	ret
> +*/
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
  
Tamar Christina Oct. 31, 2022, 11:48 a.m. UTC | #2
> 
> The same thing ought to work for smov, so it would be good to do both.
> That would also make the split between the original and new patterns more
> obvious: left shift for the old pattern, right shift for the new pattern.
> 

Done, though because umov can do multilevel extensions I couldn't combine them
into a single pattern.
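
Roughly, the distinction is the following (sketch, mirroring the new tests):
umov to a w register already zeroes the upper 32 bits of the x register, so the
same "umov w0, v0.h[3]" serves both the 32-bit and the 64-bit zero-extended
results, while smov has to encode the destination width explicitly.

#include <arm_neon.h>
#include <stdint.h>

/* Both expected to use "umov w0, v0.h[3]"; writing w0 clears bits 63:32.  */
uint32_t u32 (uint32x4_t x) { return x[1] >> 16; }
uint64_t u64 (uint32x4_t x) { return x[1] >> 16; }

/* The signed forms need the width in the instruction itself.  */
int32_t s32 (int32x4_t x) { return x[1] >> 16; }  /* smov w0, v0.h[3]  */
int64_t s64 (int32x4_t x) { return x[1] >> 16; }  /* smov x0, v0.h[3]  */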

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
	left and right ones.
	(*aarch64_ashr_sisd_or_int_<mode>3, *<optab>si3_insn2_sxtw): Support
	smov.
	* config/aarch64/constraints.md (Usl): New.
	* config/aarch64/iterators.md (LSHIFTRT_ONLY, ASHIFTRT_ONLY): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/shift-read_1.c: New test.
	* gcc.target/aarch64/shift-read_2.c: New test.
	* gcc.target/aarch64/shift-read_3.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c333fb1f72725992bb304c560f1245a242d5192d..2bc2684b82c35a44e0a2cea6e3aaf32d939f8cdf 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5370,20 +5370,42 @@ (define_split
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
 	(ashiftrt:GPI
-	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
+	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
-			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
+			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
   ""
-  "@
-   asr\t%<w>0, %<w>1, %2
-   asr\t%<w>0, %<w>1, %<w>2
-   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   #
-   #"
-  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
-   (set_attr "arch" "*,*,simd,simd,simd")]
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	return "asr\t%<w>0, %<w>1, %2";
+      case 1:
+	return "asr\t%<w>0, %<w>1, %<w>2";
+      case 2:
+	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
+      case 3:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%w0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%w0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 4:
+	return "#";
+      case 5:
+	return "#";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd,simd")]
 )
 
 (define_split
@@ -5493,7 +5515,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5528,6 +5550,68 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_sxtw"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
+	(sign_extend:GPI (ASHIFTRT_ONLY:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  "<MODE>mode != DImode || satisfies_constraint_Usl (operands[2])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%<w>0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%<w>0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%<w>0, %<w>1, %2";
+      case 2:
+	return "<shift>\\t%<w>0, %<w>1, %<w>2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
+(define_insn "*<optab>si3_insn2_uxtw"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
+	(zero_extend:GPI (LSHIFTRT_ONLY:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  ""
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "umov\\t%w0, %1.h[1]";
+	  if (size == 8)
+	    return "umov\\t%w0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -166,6 +166,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e904407b2169e589b7007ff966b2d9347a6d0fd2..b2682acb3bb12d584613d395200c3b39c0e94d8d 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,14 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
+
+;; Singleton code iterator for only logical right shift.
+(define_code_iterator LSHIFTRT_ONLY [lshiftrt])
+
+;; Singleton code iterator for only arithmetic right shift.
+(define_code_iterator ASHIFTRT_ONLY [ashiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..541dce9303382e047c3931ad58a1cbd8b3e182fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
@@ -0,0 +1,96 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor_1:
+** 	smov	w0, v0.h\[3\]
+** 	ret
+*/
+int32_t foor_1 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** foor_2:
+** 	smov	x0, v0.h\[3\]
+** 	ret
+*/
+int64_t foor_2 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+
+/*
+** fool:
+** 	[su]mov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool (int32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+short foor2 (int32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool2 (int32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+short foo (int x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_3.c b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ea81ff5b5af7794e062e471f46b433e1d7d87ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** ufoo:
+**	...
+** 	umov	w0, v0.h\[1\]
+** 	ret
+*/
+uint64_t ufoo (uint32x4_t x)
+{
+  return (x[0] + x[1]) >> 16;
+}
+
+/* 
+** sfoo:
+**	...
+** 	smov	x0, v0.h\[1\]
+** 	ret
+*/
+int64_t sfoo (int32x4_t x)
+{
+  return (x[0] + x[1]) >> 16;
+}
+
+/* 
+** sfoo2:
+**	...
+** 	smov	w0, v0.h\[1\]
+** 	ret
+*/
+int32_t sfoo2 (int32x4_t x)
+{
+  return (x[0] + x[1]) >> 16;
+}
+
+/* 
+** ubar:
+**	...
+** 	umov	w0, v0.b\[3\]
+** 	ret
+*/
+uint64_t ubar (uint32x4_t x)
+{
+  return (x[0] + x[1]) >> 24;
+}
+
+/* 
+** sbar:
+**	...
+** 	smov	x0, v0.b\[3\]
+** 	ret
+*/
+int64_t sbar (int32x4_t x)
+{
+  return (x[0] + x[1]) >> 24;
+}
  
Richard Sandiford Nov. 14, 2022, 9:54 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> 
>> The same thing ought to work for smov, so it would be good to do both.
>> That would also make the split between the original and new patterns more
>> obvious: left shift for the old pattern, right shift for the new pattern.
>> 
>
> Done, though because umov can do multilevel extensions I couldn't combine them
> Into a single pattern.

Hmm, but the pattern is:

(define_insn "*<optab>si3_insn2_uxtw"
  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
	(zero_extend:GPI (LSHIFTRT_ONLY:SI
	  (match_operand:SI 1 "register_operand" "w,r,r")
	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]

GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
which isn't a valid operation.  The original patch was just for extending
to DI, which seems correct.  The choice between printing %x for smov and
%w for umov can then depend on the code.

>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
> 	left and right ones.
> 	(*aarch64_ashr_sisd_or_int_<mode>3, *<optab>si3_insn2_sxtw): Support
> 	smov.
> 	* config/aarch64/constraints.md (Usl): New.
> 	* config/aarch64/iterators.md (LSHIFTRT_ONLY, ASHIFTRT_ONLY): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/shift-read_1.c: New test.
> 	* gcc.target/aarch64/shift-read_2.c: New test.
> 	* gcc.target/aarch64/shift-read_3.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c333fb1f72725992bb304c560f1245a242d5192d..2bc2684b82c35a44e0a2cea6e3aaf32d939f8cdf 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5370,20 +5370,42 @@ (define_split
>  
>  ;; Arithmetic right shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
>  	(ashiftrt:GPI
> -	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
> +	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
>  	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
> -			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
> +			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
>    ""
> -  "@
> -   asr\t%<w>0, %<w>1, %2
> -   asr\t%<w>0, %<w>1, %<w>2
> -   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
> -   #
> -   #"
> -  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
> -   (set_attr "arch" "*,*,simd,simd,simd")]
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	return "asr\t%<w>0, %<w>1, %2";
> +      case 1:
> +	return "asr\t%<w>0, %<w>1, %<w>2";
> +      case 2:
> +	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
> +      case 3:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "smov\\t%w0, %1.h[1]";
> +	  if (size == 8)
> +	    return "smov\\t%w0, %1.b[3]";

This only looks right for SI, not DI.  (But we can do something
similar for DI.)

Thanks,
Richard

> +	  gcc_unreachable ();
> +	}
> +      case 4:
> +	return "#";
> +      case 5:
> +	return "#";
> +      default:
> +	gcc_unreachable ();
> +    }
> +  }
> +  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
> +   (set_attr "arch" "*,*,simd,simd,simd,simd")]
>  )
>  
>  (define_split
> @@ -5493,7 +5515,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -	(zero_extend:DI (SHIFT_no_rotate:SI
> +	(zero_extend:DI (SHIFT_arith:SI
>  	 (match_operand:SI 1 "register_operand" "r,r")
>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5528,6 +5550,68 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>  
> +(define_insn "*<optab>si3_insn2_sxtw"
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> +	(sign_extend:GPI (ASHIFTRT_ONLY:SI
> +	  (match_operand:SI 1 "register_operand" "w,r,r")
> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  "<MODE>mode != DImode || satisfies_constraint_Usl (operands[2])"
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "smov\\t%<w>0, %1.h[1]";
> +	  if (size == 8)
> +	    return "smov\\t%<w>0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 1:
> +	return "<shift>\\t%<w>0, %<w>1, %2";
> +      case 2:
> +	return "<shift>\\t%<w>0, %<w>1, %<w>2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
> +(define_insn "*<optab>si3_insn2_uxtw"
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> +	(zero_extend:GPI (LSHIFTRT_ONLY:SI
> +	  (match_operand:SI 1 "register_operand" "w,r,r")
> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  ""
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "umov\\t%w0, %1.h[1]";
> +	  if (size == 8)
> +	    return "umov\\t%w0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 1:
> +	return "<shift>\\t%w0, %w1, %2";
> +      case 2:
> +	return "<shift>\\t%w0, %w1, %w2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>  
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>   (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index e904407b2169e589b7007ff966b2d9347a6d0fd2..b2682acb3bb12d584613d395200c3b39c0e94d8d 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,14 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>  
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
> +
> +;; Singleton code iterator for only logical right shift.
> +(define_code_iterator LSHIFTRT_ONLY [lshiftrt])
> +
> +;; Singleton code iterator for only arithmetic right shift.
> +(define_code_iterator ASHIFTRT_ONLY [ashiftrt])
>  
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +** 	umov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +** 	umov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +** 	umov	w0, v[0-8]+.h\[1\]
> +** 	ret
> +*/
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..541dce9303382e047c3931ad58a1cbd8b3e182fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> @@ -0,0 +1,96 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor_1:
> +** 	smov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +int32_t foor_1 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** foor_2:
> +** 	smov	x0, v0.h\[3\]
> +** 	ret
> +*/
> +int64_t foor_2 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +
> +/*
> +** fool:
> +** 	[su]mov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool (int32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +short foor2 (int32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool2 (int32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +short foo (int x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +** 	umov	w0, v[0-8]+.h\[1\]
> +** 	ret
> +*/
> +short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_3.c b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2ea81ff5b5af7794e062e471f46b433e1d7d87ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
> @@ -0,0 +1,60 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** ufoo:
> +**	...
> +** 	umov	w0, v0.h\[1\]
> +** 	ret
> +*/
> +uint64_t ufoo (uint32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 16;
> +}
> +
> +/* 
> +** sfoo:
> +**	...
> +** 	smov	x0, v0.h\[1\]
> +** 	ret
> +*/
> +int64_t sfoo (int32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 16;
> +}
> +
> +/* 
> +** sfoo2:
> +**	...
> +** 	smov	w0, v0.h\[1\]
> +** 	ret
> +*/
> +int32_t sfoo2 (int32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 16;
> +}
> +
> +/* 
> +** ubar:
> +**	...
> +** 	umov	w0, v0.b\[3\]
> +** 	ret
> +*/
> +uint64_t ubar (uint32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 24;
> +}
> +
> +/* 
> +** sbar:
> +**	...
> +** 	smov	x0, v0.b\[3\]
> +** 	ret
> +*/
> +int64_t sbar (int32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 24;
> +}
  
Richard Sandiford Nov. 14, 2022, 9:59 p.m. UTC | #4
(Sorry, immediately following up to myself for a second time recently.)

Richard Sandiford <richard.sandiford@arm.com> writes:
> Tamar Christina <Tamar.Christina@arm.com> writes:
>>> 
>>> The same thing ought to work for smov, so it would be good to do both.
>>> That would also make the split between the original and new patterns more
>>> obvious: left shift for the old pattern, right shift for the new pattern.
>>> 
>>
>> Done, though because umov can do multilevel extensions I couldn't combine them
>> Into a single pattern.
>
> Hmm, but the pattern is:
>
> (define_insn "*<optab>si3_insn2_uxtw"
>   [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> 	(zero_extend:GPI (LSHIFTRT_ONLY:SI
> 	  (match_operand:SI 1 "register_operand" "w,r,r")
> 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
>
> GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
> which isn't a valid operation.  The original patch was just for extending
> to DI, which seems correct.  The choice between printing %x for smov and
> %w for umov can then depend on the code.

My original comment quoted above was about using smov in the zero-extend
pattern.  I.e. the original:

(define_insn "*<optab>si3_insn2_uxtw"
  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
	(zero_extend:DI (LSHIFTRT:SI
	 (match_operand:SI 1 "register_operand" "w,r,r")
	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]

could instead be:

(define_insn "*<optab>si3_insn2_uxtw"
  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
	(zero_extend:DI (SHIFTRT:SI
	 (match_operand:SI 1 "register_operand" "w,r,r")
	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]

with the pattern using "smov %w0, ..." for the ashiftrt case.

Thanks,
Richard

>
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
>> 	left and right ones.
>> 	(*aarch64_ashr_sisd_or_int_<mode>3, *<optab>si3_insn2_sxtw): Support
>> 	smov.
>> 	* config/aarch64/constraints.md (Usl): New.
>> 	* config/aarch64/iterators.md (LSHIFTRT_ONLY, ASHIFTRT_ONLY): New.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/aarch64/shift-read_1.c: New test.
>> 	* gcc.target/aarch64/shift-read_2.c: New test.
>> 	* gcc.target/aarch64/shift-read_3.c: New test.
>>
>> --- inline copy of patch ---
>>
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index c333fb1f72725992bb304c560f1245a242d5192d..2bc2684b82c35a44e0a2cea6e3aaf32d939f8cdf 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -5370,20 +5370,42 @@ (define_split
>>  
>>  ;; Arithmetic right shift using SISD or Integer instruction
>>  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
>> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
>>  	(ashiftrt:GPI
>> -	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
>> +	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
>>  	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
>> -			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
>> +			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
>>    ""
>> -  "@
>> -   asr\t%<w>0, %<w>1, %2
>> -   asr\t%<w>0, %<w>1, %<w>2
>> -   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
>> -   #
>> -   #"
>> -  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
>> -   (set_attr "arch" "*,*,simd,simd,simd")]
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +	return "asr\t%<w>0, %<w>1, %2";
>> +      case 1:
>> +	return "asr\t%<w>0, %<w>1, %<w>2";
>> +      case 2:
>> +	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
>> +      case 3:
>> +	{
>> +	  int val = INTVAL (operands[2]);
>> +	  int size = 32 - val;
>> +
>> +	  if (size == 16)
>> +	    return "smov\\t%w0, %1.h[1]";
>> +	  if (size == 8)
>> +	    return "smov\\t%w0, %1.b[3]";
>
> This only looks right for SI, not DI.  (But we can do something
> similar for DI.)
>
> Thanks,
> Richard
>
>> +	  gcc_unreachable ();
>> +	}
>> +      case 4:
>> +	return "#";
>> +      case 5:
>> +	return "#";
>> +      default:
>> +	gcc_unreachable ();
>> +    }
>> +  }
>> +  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
>> +   (set_attr "arch" "*,*,simd,simd,simd,simd")]
>>  )
>>  
>>  (define_split
>> @@ -5493,7 +5515,7 @@ (define_insn "*rol<mode>3_insn"
>>  ;; zero_extend version of shifts
>>  (define_insn "*<optab>si3_insn_uxtw"
>>    [(set (match_operand:DI 0 "register_operand" "=r,r")
>> -	(zero_extend:DI (SHIFT_no_rotate:SI
>> +	(zero_extend:DI (SHIFT_arith:SI
>>  	 (match_operand:SI 1 "register_operand" "r,r")
>>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>>    ""
>> @@ -5528,6 +5550,68 @@ (define_insn "*rolsi3_insn_uxtw"
>>    [(set_attr "type" "rotate_imm")]
>>  )
>>  
>> +(define_insn "*<optab>si3_insn2_sxtw"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
>> +	(sign_extend:GPI (ASHIFTRT_ONLY:SI
>> +	  (match_operand:SI 1 "register_operand" "w,r,r")
>> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
>> +  "<MODE>mode != DImode || satisfies_constraint_Usl (operands[2])"
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +	{
>> +	  int val = INTVAL (operands[2]);
>> +	  int size = 32 - val;
>> +
>> +	  if (size == 16)
>> +	    return "smov\\t%<w>0, %1.h[1]";
>> +	  if (size == 8)
>> +	    return "smov\\t%<w>0, %1.b[3]";
>> +	  gcc_unreachable ();
>> +	}
>> +      case 1:
>> +	return "<shift>\\t%<w>0, %<w>1, %2";
>> +      case 2:
>> +	return "<shift>\\t%<w>0, %<w>1, %<w>2";
>> +      default:
>> +	gcc_unreachable ();
>> +      }
>> +  }
>> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
>> +)
>> +
>> +(define_insn "*<optab>si3_insn2_uxtw"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
>> +	(zero_extend:GPI (LSHIFTRT_ONLY:SI
>> +	  (match_operand:SI 1 "register_operand" "w,r,r")
>> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
>> +  ""
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +	{
>> +	  int val = INTVAL (operands[2]);
>> +	  int size = 32 - val;
>> +
>> +	  if (size == 16)
>> +	    return "umov\\t%w0, %1.h[1]";
>> +	  if (size == 8)
>> +	    return "umov\\t%w0, %1.b[3]";
>> +	  gcc_unreachable ();
>> +	}
>> +      case 1:
>> +	return "<shift>\\t%w0, %w1, %2";
>> +      case 2:
>> +	return "<shift>\\t%w0, %w1, %w2";
>> +      default:
>> +	gcc_unreachable ();
>> +      }
>> +  }
>> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
>> +)
>> +
>>  (define_insn "*<optab><mode>3_insn"
>>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
>> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
>> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
>> --- a/gcc/config/aarch64/constraints.md
>> +++ b/gcc/config/aarch64/constraints.md
>> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>>    (and (match_code "const_int")
>>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>>  
>> +(define_constraint "Usl"
>> +  "@internal
>> +  A constraint that matches an immediate shift constant in SImode that has an
>> +  exact mode available to use."
>> +  (and (match_code "const_int")
>> +       (and (match_test "satisfies_constraint_Uss (op)")
>> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
>> +
>>  (define_constraint "Usn"
>>   "A constant that can be used with a CCMN operation (once negated)."
>>   (and (match_code "const_int")
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index e904407b2169e589b7007ff966b2d9347a6d0fd2..b2682acb3bb12d584613d395200c3b39c0e94d8d 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -2149,8 +2149,14 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>>  ;; This code iterator allows the various shifts supported on the core
>>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>>  
>> -;; This code iterator allows all shifts except for rotates.
>> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
>> +;; This code iterator allows arithmetic shifts
>> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
>> +
>> +;; Singleton code iterator for only logical right shift.
>> +(define_code_iterator LSHIFTRT_ONLY [lshiftrt])
>> +
>> +;; Singleton code iterator for only arithmetic right shift.
>> +(define_code_iterator ASHIFTRT_ONLY [ashiftrt])
>>  
>>  ;; This code iterator allows the shifts supported in arithmetic instructions
>>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
>> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
>> @@ -0,0 +1,85 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foor:
>> +** 	umov	w0, v0.h\[3\]
>> +** 	ret
>> +*/
>> +unsigned int foor (uint32x4_t x)
>> +{
>> +    return x[1] >> 16;
>> +}
>> +
>> +/*
>> +** fool:
>> +** 	umov	w0, v0.s\[1\]
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +unsigned int fool (uint32x4_t x)
>> +{
>> +    return x[1] << 16;
>> +}
>> +
>> +/*
>> +** foor2:
>> +** 	umov	w0, v0.h\[7\]
>> +** 	ret
>> +*/
>> +unsigned short foor2 (uint32x4_t x)
>> +{
>> +    return x[3] >> 16;
>> +}
>> +
>> +/*
>> +** fool2:
>> +** 	fmov	w0, s0
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +unsigned int fool2 (uint32x4_t x)
>> +{
>> +    return x[0] << 16;
>> +}
>> +
>> +typedef int v4si __attribute__ ((vector_size (16)));
>> +
>> +/*
>> +** bar:
>> +**	addv	s0, v0.4s
>> +**	fmov	w0, s0
>> +**	lsr	w1, w0, 16
>> +**	add	w0, w1, w0, uxth
>> +**	ret
>> +*/
>> +int bar (v4si x)
>> +{
>> +  unsigned int sum = vaddvq_s32 (x);
>> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
>> +}
>> +
>> +/*
>> +** foo:
>> +** 	lsr	w0, w0, 16
>> +** 	ret
>> +*/
>> +unsigned short foo (unsigned x)
>> +{
>> +  return x >> 16;
>> +}
>> +
>> +/*
>> +** foo2:
>> +**	...
>> +** 	umov	w0, v[0-8]+.h\[1\]
>> +** 	ret
>> +*/
>> +unsigned short foo2 (v4si x)
>> +{
>> +  int y = x[0] + x[1];
>> +  return y >> 16;
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..541dce9303382e047c3931ad58a1cbd8b3e182fb
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
>> @@ -0,0 +1,96 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foor_1:
>> +** 	smov	w0, v0.h\[3\]
>> +** 	ret
>> +*/
>> +int32_t foor_1 (int32x4_t x)
>> +{
>> +    return x[1] >> 16;
>> +}
>> +
>> +/*
>> +** foor_2:
>> +** 	smov	x0, v0.h\[3\]
>> +** 	ret
>> +*/
>> +int64_t foor_2 (int32x4_t x)
>> +{
>> +    return x[1] >> 16;
>> +}
>> +
>> +
>> +/*
>> +** fool:
>> +** 	[su]mov	w0, v0.s\[1\]
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +int fool (int32x4_t x)
>> +{
>> +    return x[1] << 16;
>> +}
>> +
>> +/*
>> +** foor2:
>> +** 	umov	w0, v0.h\[7\]
>> +** 	ret
>> +*/
>> +short foor2 (int32x4_t x)
>> +{
>> +    return x[3] >> 16;
>> +}
>> +
>> +/*
>> +** fool2:
>> +** 	fmov	w0, s0
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +int fool2 (int32x4_t x)
>> +{
>> +    return x[0] << 16;
>> +}
>> +
>> +typedef int v4si __attribute__ ((vector_size (16)));
>> +
>> +/*
>> +** bar:
>> +**	addv	s0, v0.4s
>> +**	fmov	w0, s0
>> +**	lsr	w1, w0, 16
>> +**	add	w0, w1, w0, uxth
>> +**	ret
>> +*/
>> +int bar (v4si x)
>> +{
>> +  unsigned int sum = vaddvq_s32 (x);
>> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
>> +}
>> +
>> +/*
>> +** foo:
>> +** 	lsr	w0, w0, 16
>> +** 	ret
>> +*/
>> +short foo (int x)
>> +{
>> +  return x >> 16;
>> +}
>> +
>> +/*
>> +** foo2:
>> +**	...
>> +** 	umov	w0, v[0-8]+.h\[1\]
>> +** 	ret
>> +*/
>> +short foo2 (v4si x)
>> +{
>> +  int y = x[0] + x[1];
>> +  return y >> 16;
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_3.c b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..2ea81ff5b5af7794e062e471f46b433e1d7d87ee
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
>> @@ -0,0 +1,60 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** ufoo:
>> +**	...
>> +** 	umov	w0, v0.h\[1\]
>> +** 	ret
>> +*/
>> +uint64_t ufoo (uint32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 16;
>> +}
>> +
>> +/* 
>> +** sfoo:
>> +**	...
>> +** 	smov	x0, v0.h\[1\]
>> +** 	ret
>> +*/
>> +int64_t sfoo (int32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 16;
>> +}
>> +
>> +/* 
>> +** sfoo2:
>> +**	...
>> +** 	smov	w0, v0.h\[1\]
>> +** 	ret
>> +*/
>> +int32_t sfoo2 (int32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 16;
>> +}
>> +
>> +/* 
>> +** ubar:
>> +**	...
>> +** 	umov	w0, v0.b\[3\]
>> +** 	ret
>> +*/
>> +uint64_t ubar (uint32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 24;
>> +}
>> +
>> +/* 
>> +** sbar:
>> +**	...
>> +** 	smov	x0, v0.b\[3\]
>> +** 	ret
>> +*/
>> +int64_t sbar (int32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 24;
>> +}
  
Tamar Christina Dec. 1, 2022, 4:25 p.m. UTC | #5
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, November 14, 2022 9:59 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves
> and shifts which arrive after expand
> 
> (Sorry, immediately following up to myself for a second time recently.)
> 
> Richard Sandiford <richard.sandiford@arm.com> writes:
> > Tamar Christina <Tamar.Christina@arm.com> writes:
> >>>
> >>> The same thing ought to work for smov, so it would be good to do both.
> >>> That would also make the split between the original and new patterns
> >>> more
> >>> obvious: left shift for the old pattern, right shift for the new pattern.
> >>>
> >>
> >> Done, though because umov can do multilevel extensions I couldn't
> >> combine them Into a single pattern.
> >
> > Hmm, but the pattern is:
> >
> > (define_insn "*<optab>si3_insn2_uxtw"
> >   [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> > 	(zero_extend:GPI (LSHIFTRT_ONLY:SI
> > 	  (match_operand:SI 1 "register_operand" "w,r,r")
> > 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
> "Usl,Uss,r"))))]
> >
> > GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
> > which isn't a valid operation.  The original patch was just for
> > extending to DI, which seems correct.  The choice between printing %x
> > for smov and %w for umov can then depend on the code.

You're right, GPI made no sense here.  Fixed.

> 
> My original comment quoted above was about using smov in the zero-
> extend pattern.  I.e. the original:
> 
> (define_insn "*<optab>si3_insn2_uxtw"
>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
> 	(zero_extend:DI (LSHIFTRT:SI
> 	 (match_operand:SI 1 "register_operand" "w,r,r")
> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
> "Usl,Uss,r"))))]
> 
> could instead be:
> 
> (define_insn "*<optab>si3_insn2_uxtw"
>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
> 	(zero_extend:DI (SHIFTRT:SI
> 	 (match_operand:SI 1 "register_operand" "w,r,r")
> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
> "Usl,Uss,r"))))]
> 
> with the pattern using "smov %w0, ..." for ashiftft case.

Almost, except the non-immediate cases don't work with shifts,
i.e. a right shift can't be used to sign-extend from 32 to 64 bits.

I've merged the cases but added a guard for this.
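
A small sketch of the case the guard excludes (hypothetical example, not taken
from the testsuite): for a variable shift amount only the zero-extending form
has a single-instruction equivalent, because writing the w register already
clears the upper bits, whereas the sign-extending form still needs an explicit
extension after the shift.

#include <stdint.h>

uint64_t uvar (uint32_t x, int n)
{
  return (uint64_t) (x >> n);   /* lsr on the w register zero-extends for free.  */
}

int64_t svar (int32_t x, int n)
{
  return (int64_t) (x >> n);    /* asr on the w register still needs a sign extend.  */
}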

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
	left and right ones.
	(*aarch64_ashr_sisd_or_int_<mode>3): Support smov.
	(*<optab>si3_insn2_<sra_op>xtw): New.
	* config/aarch64/constraints.md (Usl): New.
	* config/aarch64/iterators.md (is_zeroE, extend_op): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/shift-read_1.c: New test.
	* gcc.target/aarch64/shift-read_2.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 39e65979528fb7f748ed456399ca38f929dba1d4..4c181a96e555c2a58c59fc991000b2a2fa9bd244 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5425,20 +5425,42 @@ (define_split
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
 	(ashiftrt:GPI
-	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
+	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
-			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
+			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
   ""
-  "@
-   asr\t%<w>0, %<w>1, %2
-   asr\t%<w>0, %<w>1, %<w>2
-   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   #
-   #"
-  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
-   (set_attr "arch" "*,*,simd,simd,simd")]
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	return "asr\t%<w>0, %<w>1, %2";
+      case 1:
+	return "asr\t%<w>0, %<w>1, %<w>2";
+      case 2:
+	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
+      case 3:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%<w>0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%<w>0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 4:
+	return "#";
+      case 5:
+	return "#";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp,neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd,simd")]
 )
 
 (define_split
@@ -5548,7 +5570,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5583,6 +5605,37 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_<sra_op>xtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+	(<extend_op>:DI (SHIFTRT:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  "<is_zeroE> || satisfies_constraint_Usl (operands[2])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "<sra_op>mov\\t%x0, %1.h[1]";
+	  if (size == 8)
+	    return "<sra_op>mov\\t%x0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 29efb6c0cff7574c9b239ef358acaca96dd75d03..c2a696cb77f49cae23239b0ed8a8aa5168f8898c 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -171,6 +171,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7c69b124f076b4fb2540241f287c6999c32123c1..df72c079f218db9727a96924cab496e91ce6df59 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,8 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
@@ -2378,9 +2378,18 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr")
 (define_code_attr is_rotl [(ashift "0") (ashiftrt "0")
 			   (lshiftrt "0") (rotatert "0") (rotate "1")])
 
+;; True if zero extending operation or not
+(define_code_attr is_zeroE [(ashift "false") (ashiftrt "false")
+			   (lshiftrt "true")])
+
+
 ;; Op prefix for shift right and accumulate.
 (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")])
 
+;; Extensions that can be performed with Op
+(define_code_attr extend_op [(ashiftrt "sign_extend")
+			     (lshiftrt "zero_extend")])
+
 ;; op prefix for shift right and narrow.
 (define_code_attr srn_op [(ashiftrt "r") (lshiftrt "")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..864cfcb1650ae6553a18e753c8d8d0e85cd0ba7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..bdc214d1941807ce5aa21c369fcfe23c1927e98b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor_1:
+** 	smov	w0, v0.h\[3\]
+** 	ret
+*/
+int32_t foor_1 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** foor_2:
+** 	smov	x0, v0.h\[3\]
+** 	ret
+*/
+int64_t foor_2 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+
+/*
+** fool:
+** 	[su]mov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool (int32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+short foor2 (int32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool2 (int32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+short foo (int x)
+{
+  return x >> 16;
+}
  
Richard Sandiford Dec. 1, 2022, 6:38 p.m. UTC | #6
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, November 14, 2022 9:59 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves
>> and shifts which arrive after expand
>> 
>> (Sorry, immediately following up to myself for a second time recently.)
>> 
>> Richard Sandiford <richard.sandiford@arm.com> writes:
>> > Tamar Christina <Tamar.Christina@arm.com> writes:
>> >>>
>> >>> The same thing ought to work for smov, so it would be good to do both.
>> >>> That would also make the split between the original and new patterns
>> >>> more
>> >>> obvious: left shift for the old pattern, right shift for the new pattern.
>> >>>
>> >>
>> >> Done, though because umov can do multilevel extensions I couldn't
>> >> combine them into a single pattern.
>> >
>> > Hmm, but the pattern is:
>> >
>> > (define_insn "*<optab>si3_insn2_uxtw"
>> >   [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
>> > 	(zero_extend:GPI (LSHIFTRT_ONLY:SI
>> > 	  (match_operand:SI 1 "register_operand" "w,r,r")
>> > 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
>> "Usl,Uss,r"))))]
>> >
>> > GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
>> > which isn't a valid operation.  The original patch was just for
>> > extending to DI, which seems correct.  The choice between printing %x
>> > for smov and %w for umov can then depend on the code.
>
> You're right, GPI made no sense here.  Fixed.
>
>> 
>> My original comment quoted above was about using smov in the zero-
>> extend pattern.  I.e. the original:
>> 
>> (define_insn "*<optab>si3_insn2_uxtw"
>>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
>> 	(zero_extend:DI (LSHIFTRT:SI
>> 	 (match_operand:SI 1 "register_operand" "w,r,r")
>> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
>> "Usl,Uss,r"))))]
>> 
>> could instead be:
>> 
>> (define_insn "*<optab>si3_insn2_uxtw"
>>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
>> 	(zero_extend:DI (SHIFTRT:SI
>> 	 (match_operand:SI 1 "register_operand" "w,r,r")
>> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
>> "Usl,Uss,r"))))]
>> 
>> with the pattern using "smov %w0, ..." for the ashiftrt case.
>
> Almost, except the non-immediate cases don't work with shifts,
> i.e. a right shift can't be used to sign-extend from 32 to 64 bits.

Right, but the pattern I quoted above is doing a zero-extend rather than
a sign-extend, even for the ashiftrt case.  That is, I was suggesting that
we keep the zero_extend fixed but allow zero extensions of both lshiftrts
and ashiftrts.  That works because ASR Wx and SMOV Wx zero-extend the Wn
result to Xn.
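
E.g. (rough sketch, function name arbitrary):

#include <arm_neon.h>

uint64_t f (int32x4_t x)
{
    return (uint32_t) (x[1] >> 16);   /* zero_extend:DI (ashiftrt:SI ...) */
}

Here smov w0, v0.h[3] sign-extends into w0, and writing the W register
already clears bits 63:32 of x0, so the zero-extended DI result falls out
for free.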

I wasn't suggesting that you add support for SI->DI sign extensions,
although obviously the more cases we optimise the better :-)

The original comment was only supposed to be a small tweak, sorry for
not explaining it properly.

Thanks,
Richard

>
> I've merged the cases but added a guard for this.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
> 	left and right ones.
> 	(*aarch64_ashr_sisd_or_int_<mode>3): Support smov.
> 	(*<optab>si3_insn2_<sra_op>xtw): New.
> 	* config/aarch64/constraints.md (Usl): New.
> 	* config/aarch64/iterators.md (is_zeroE, extend_op): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/shift-read_1.c: New test.
> 	* gcc.target/aarch64/shift-read_2.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 39e65979528fb7f748ed456399ca38f929dba1d4..4c181a96e555c2a58c59fc991000b2a2fa9bd244 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5425,20 +5425,42 @@ (define_split
>  
>  ;; Arithmetic right shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
>  	(ashiftrt:GPI
> -	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
> +	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
>  	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
> -			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
> +			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
>    ""
> -  "@
> -   asr\t%<w>0, %<w>1, %2
> -   asr\t%<w>0, %<w>1, %<w>2
> -   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
> -   #
> -   #"
> -  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
> -   (set_attr "arch" "*,*,simd,simd,simd")]
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	return "asr\t%<w>0, %<w>1, %2";
> +      case 1:
> +	return "asr\t%<w>0, %<w>1, %<w>2";
> +      case 2:
> +	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
> +      case 3:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "smov\\t%<w>0, %1.h[1]";
> +	  if (size == 8)
> +	    return "smov\\t%<w>0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 4:
> +	return "#";
> +      case 5:
> +	return "#";
> +      default:
> +	gcc_unreachable ();
> +    }
> +  }
> +  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp,neon_shift_reg<q>,neon_shift_reg<q>")
> +   (set_attr "arch" "*,*,simd,simd,simd,simd")]
>  )
>  
>  (define_split
> @@ -5548,7 +5570,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -	(zero_extend:DI (SHIFT_no_rotate:SI
> +	(zero_extend:DI (SHIFT_arith:SI
>  	 (match_operand:SI 1 "register_operand" "r,r")
>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5583,6 +5605,37 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>  
> +(define_insn "*<optab>si3_insn2_<sra_op>xtw"
> +  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
> +	(<extend_op>:DI (SHIFTRT:SI
> +	  (match_operand:SI 1 "register_operand" "w,r,r")
> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  "<is_zeroE> || satisfies_constraint_Usl (operands[2])"
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "<sra_op>mov\\t%x0, %1.h[1]";
> +	  if (size == 8)
> +	    return "<sra_op>mov\\t%x0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 1:
> +	return "<shift>\\t%w0, %w1, %2";
> +      case 2:
> +	return "<shift>\\t%w0, %w1, %w2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 29efb6c0cff7574c9b239ef358acaca96dd75d03..c2a696cb77f49cae23239b0ed8a8aa5168f8898c 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -171,6 +171,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>  
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>   (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 7c69b124f076b4fb2540241f287c6999c32123c1..df72c079f218db9727a96924cab496e91ce6df59 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,8 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>  
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
>  
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> @@ -2378,9 +2378,18 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr")
>  (define_code_attr is_rotl [(ashift "0") (ashiftrt "0")
>  			   (lshiftrt "0") (rotatert "0") (rotate "1")])
>  
> +;; True if zero extending operation or not
> +(define_code_attr is_zeroE [(ashift "false") (ashiftrt "false")
> +			   (lshiftrt "true")])
> +
> +
>  ;; Op prefix for shift right and accumulate.
>  (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")])
>  
> +;; Extensions that can be performed with Op
> +(define_code_attr extend_op [(ashiftrt "sign_extend")
> +			     (lshiftrt "zero_extend")])
> +
>  ;; op prefix for shift right and narrow.
>  (define_code_attr srn_op [(ashiftrt "r") (lshiftrt "")])
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..864cfcb1650ae6553a18e753c8d8d0e85cd0ba7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +** 	umov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +** 	umov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bdc214d1941807ce5aa21c369fcfe23c1927e98b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> @@ -0,0 +1,84 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor_1:
> +** 	smov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +int32_t foor_1 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** foor_2:
> +** 	smov	x0, v0.h\[3\]
> +** 	ret
> +*/
> +int64_t foor_2 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +
> +/*
> +** fool:
> +** 	[su]mov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool (int32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +short foor2 (int32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool2 (int32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +short foo (int x)
> +{
> +  return x >> 16;
> +}
  

Patch

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5493,7 +5493,7 @@  (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5528,6 +5528,60 @@  (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_uxtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
+	(zero_extend:DI (LSHIFTRT:SI
+	 (match_operand:SI 1 "register_operand" "w,r,r")
+	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  ""
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  machine_mode dest, vec_mode;
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+	  if (size == 16)
+	    dest = HImode;
+	  else if (size == 8)
+	    dest = QImode;
+	  else
+	    gcc_unreachable ();
+
+	  /* Get nearest 64-bit vector mode.  */
+	  int nunits = 64 / size;
+	  auto vector_mode
+	    = mode_for_vector (as_a <scalar_mode> (dest), nunits);
+	  if (!vector_mode.exists (&vec_mode))
+	    gcc_unreachable ();
+	  operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
+	  operands[2] = gen_int_mode (val / size, SImode);
+
+	  /* Ideally we just call aarch64_get_lane_zero_extend but reload gets
+	     into a weird loop due to a mov of w -> r being present most time
+	     this instruction applies.  */
+	  switch (dest)
+	  {
+	    case QImode:
+	      return "umov\\t%w0, %1.b[%2]";
+	    case HImode:
+	      return "umov\\t%w0, %1.h[%2]";
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -166,6 +166,14 @@  (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,11 @@  (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
+
+;; Singleton code iterator for only logical right shift.
+(define_code_iterator LSHIFTRT [lshiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
@@ -0,0 +1,85 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}