AArch64 Handle copysign (x, -1) expansion efficiently

Message ID   patch-17811-tamar@arm.com
State        Unresolved
Series       AArch64 Handle copysign (x, -1) expansion efficiently

Checks

Context                 Check     Description
snail/gcc-patch-check   warning   Git am fail log

Commit Message

Tamar Christina Oct. 5, 2023, 6:20 p.m. UTC
  Hi All,

copysign (x, -1) is effectively fneg (abs (x)), which on AArch64 is done
most efficiently with an OR of the sign bit.

The middle-end now optimizes fneg (abs (x)) to copysign as the canonical
form, so this patch optimizes the expansion accordingly.

If the target has an inclusive-OR that takes an immediate, then the transformed
instruction is both shorter and faster.  For those that don't, the immediate
has to be separately constructed, but this still ends up being faster as the
immediate construction is not on the critical path.
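
As a rough illustration (not part of the patch itself), the kind of source
this affects and what the expansion amounts to:

    /* copysign (x, -1.0) only takes the sign of -1.0, i.e. it computes
       -fabs (x).  With this change the expander emits an integer ORR of
       the sign-bit mask rather than a bit-select sequence; the exact
       assembly depends on the mode and target options.  */
    double
    neg_abs (double x)
    {
      return __builtin_copysign (x, -1.0);
    }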

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Note that this is part of another patch series; the additional testcases
are mutually dependent on the match.pd patch.  As such, the tests are added
there instead of here.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/109154
	* config/aarch64/aarch64.md (copysign<GPF:mode>3): Handle
	copysign (x, -1).
	* config/aarch64/aarch64-simd.md (copysign<mode>3): Likewise.
	* config/aarch64/aarch64-sve.md (copysign<mode>3): Likewise.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 25a1e4e8ecf767636c0ff3cdab6cad6e1482f73e..a78e77dcc3473445108b06c50f9c28a8369f3e3f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -754,15 +754,33 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
 (define_expand "copysign<mode>3"
   [(match_operand:VHSDF 0 "register_operand")
    (match_operand:VHSDF 1 "register_operand")
-   (match_operand:VHSDF 2 "register_operand")]
+   (match_operand:VHSDF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  rtx v_bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
+  machine_mode int_mode = <V_INT_EQUIV>mode;
+  rtx v_bitmask = gen_reg_rtx (int_mode);
   int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
 
   emit_move_insn (v_bitmask,
 		  aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
 						     HOST_WIDE_INT_M1U << bits));
+
+  /* copysign (x, -1) should instead be expanded as orr with the sign
+     bit.  */
+  if (!REG_P (operands[2]))
+    {
+      auto r0
+	= CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
+      if (-1 == real_to_integer (r0))
+	{
+	  emit_insn (gen_ior<v_int_equiv>3 (
+	    lowpart_subreg (int_mode, operands[0], <MODE>mode),
+	    lowpart_subreg (int_mode, operands[1], <MODE>mode), v_bitmask));
+	  DONE;
+	}
+    }
+
+  operands[2] = force_reg (<MODE>mode, operands[2]);
   emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], v_bitmask,
 					 operands[2], operands[1]));
   DONE;
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5a652d8536a0ef9461f40da7b22834e683e73ceb..071400c820a5b106ddf9dc9faebb117975d74ea0 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6387,7 +6387,7 @@ (define_insn "*<optab><mode>3"
 (define_expand "copysign<mode>3"
   [(match_operand:SVE_FULL_F 0 "register_operand")
    (match_operand:SVE_FULL_F 1 "register_operand")
-   (match_operand:SVE_FULL_F 2 "register_operand")]
+   (match_operand:SVE_FULL_F 2 "nonmemory_operand")]
   "TARGET_SVE"
   {
     rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
@@ -6398,11 +6398,26 @@ (define_expand "copysign<mode>3"
     rtx arg1 = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
     rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
 
-    emit_insn (gen_and<v_int_equiv>3
-	       (sign, arg2,
-		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
-						   HOST_WIDE_INT_M1U
-						   << bits)));
+    rtx v_sign_bitmask
+      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+					   HOST_WIDE_INT_M1U << bits);
+
+    /* copysign (x, -1) should instead be expanded as orr with the sign
+       bit.  */
+    if (!REG_P (operands[2]))
+      {
+	auto r0
+	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
+	if (-1 == real_to_integer (r0))
+	  {
+	    emit_insn (gen_ior<v_int_equiv>3 (int_res, arg1, v_sign_bitmask));
+	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
+	    DONE;
+	  }
+      }
+
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+    emit_insn (gen_and<v_int_equiv>3 (sign, arg2, v_sign_bitmask));
     emit_insn (gen_and<v_int_equiv>3
 	       (mant, arg1,
 		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 24349ecdbbab875f21975f116732a9e53762d4c1..d6c581ad81615b4feb095391cbcf4f5b78fa72f1 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6940,12 +6940,25 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
 (define_expand "copysign<GPF:mode>3"
   [(match_operand:GPF 0 "register_operand")
    (match_operand:GPF 1 "register_operand")
-   (match_operand:GPF 2 "register_operand")]
+   (match_operand:GPF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  rtx bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
+  machine_mode int_mode = <V_INT_EQUIV>mode;
+  rtx bitmask = gen_reg_rtx (int_mode);
   emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
 				    << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
+  /* copysign (x, -1) should instead be expanded as orr with the sign
+     bit.  */
+  auto r0 = CONST_DOUBLE_REAL_VALUE (operands[2]);
+  if (-1 == real_to_integer (r0))
+    {
+      emit_insn (gen_ior<v_int_equiv>3 (
+	lowpart_subreg (int_mode, operands[0], <MODE>mode),
+	lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
+      DONE;
+    }
+
+  operands[2] = force_reg (<MODE>mode, operands[2]);
   emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
 				       bitmask));
   DONE;
  

Comments

Andrew Pinski Oct. 5, 2023, 6:29 p.m. UTC | #1
On Thu, Oct 5, 2023 at 11:22 AM Tamar Christina <tamar.christina@arm.com> wrote:
>
> Hi All,
>
> copysign (x, -1) is effectively fneg (abs (x)) which on AArch64 can be
> most efficiently done by doing an OR of the signbit.
>
> The middle-end will optimize fneg (abs (x)) now to copysign as the
> canonical form and so this optimizes the expansion.
>
> If the target has an inclusive-OR that takes an immediate, then the transformed
> instruction is both shorter and faster.  For those that don't, the immediate
> has to be separately constructed, but this still ends up being faster as the
> immediate construction is not on the critical path.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Note that this is part of another patch series, the additional testcases
> are mutually dependent on the match.pd patch.  As such the tests are added
> there insteadof here.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         PR tree-optimization/109154
>         * config/aarch64/aarch64.md (copysign<GPF:mode>3): Handle
>         copysign (x, -1).
>         * config/aarch64/aarch64-simd.md (copysign<mode>3): Likewise.
>         * config/aarch64/aarch64-sve.md (copysign<mode>3): Likewise.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 25a1e4e8ecf767636c0ff3cdab6cad6e1482f73e..a78e77dcc3473445108b06c50f9c28a8369f3e3f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -754,15 +754,33 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
>  (define_expand "copysign<mode>3"
>    [(match_operand:VHSDF 0 "register_operand")
>     (match_operand:VHSDF 1 "register_operand")
> -   (match_operand:VHSDF 2 "register_operand")]
> +   (match_operand:VHSDF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx v_bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx v_bitmask = gen_reg_rtx (int_mode);
>    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
>
>    emit_move_insn (v_bitmask,
>                   aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
>                                                      HOST_WIDE_INT_M1U << bits));
> +
> +  /* copysign (x, -1) should instead be expanded as orr with the sign
> +     bit.  */
> +  if (!REG_P (operands[2]))
> +    {
> +      auto r0
> +       = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
> +      if (-1 == real_to_integer (r0))

I think this should be: REAL_VALUE_NEGATIVE (r0) instead. Just copying
the sign here is needed, right?
Also seems like double check that this is a vec_duplicate of a const
and that the constant is a CONST_DOUBLE?


> +       {
> +         emit_insn (gen_ior<v_int_equiv>3 (
> +           lowpart_subreg (int_mode, operands[0], <MODE>mode),
> +           lowpart_subreg (int_mode, operands[1], <MODE>mode), v_bitmask));
> +         DONE;
> +       }
> +    }
> +
> +  operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], v_bitmask,
>                                          operands[2], operands[1]));
>    DONE;
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 5a652d8536a0ef9461f40da7b22834e683e73ceb..071400c820a5b106ddf9dc9faebb117975d74ea0 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6387,7 +6387,7 @@ (define_insn "*<optab><mode>3"
>  (define_expand "copysign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> -   (match_operand:SVE_FULL_F 2 "register_operand")]
> +   (match_operand:SVE_FULL_F 2 "nonmemory_operand")]
>    "TARGET_SVE"
>    {
>      rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> @@ -6398,11 +6398,26 @@ (define_expand "copysign<mode>3"
>      rtx arg1 = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
>      rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
>
> -    emit_insn (gen_and<v_int_equiv>3
> -              (sign, arg2,
> -               aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> -                                                  HOST_WIDE_INT_M1U
> -                                                  << bits)));
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +                                          HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[2]))
> +      {
> +       auto r0
> +         = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
> +       if (-1 == real_to_integer (r0))

Likewise.

> +         {
> +           emit_insn (gen_ior<v_int_equiv>3 (int_res, arg1, v_sign_bitmask));
> +           emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +           DONE;
> +         }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[2]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg2, v_sign_bitmask));
>      emit_insn (gen_and<v_int_equiv>3
>                (mant, arg1,
>                 aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 24349ecdbbab875f21975f116732a9e53762d4c1..d6c581ad81615b4feb095391cbcf4f5b78fa72f1 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -6940,12 +6940,25 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>  (define_expand "copysign<GPF:mode>3"
>    [(match_operand:GPF 0 "register_operand")
>     (match_operand:GPF 1 "register_operand")
> -   (match_operand:GPF 2 "register_operand")]
> +   (match_operand:GPF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx bitmask = gen_reg_rtx (int_mode);
>    emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
>                                     << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
> +  /* copysign (x, -1) should instead be expanded as orr with the sign
> +     bit.  */
> +  auto r0 = CONST_DOUBLE_REAL_VALUE (operands[2]);
> +  if (-1 == real_to_integer (r0))

Likewise.

Thanks,
Andrew

> +    {
> +      emit_insn (gen_ior<v_int_equiv>3 (
> +       lowpart_subreg (int_mode, operands[0], <MODE>mode),
> +       lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
> +      DONE;
> +    }
> +
> +  operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
>                                        bitmask));
>    DONE;
>
>
>
>
> --
  
Richard Sandiford Oct. 19, 2023, 9:17 p.m. UTC | #2
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> copysign (x, -1) is effectively fneg (abs (x)) which on AArch64 can be
> most efficiently done by doing an OR of the signbit.
>
> The middle-end will optimize fneg (abs (x)) now to copysign as the
> canonical form and so this optimizes the expansion.
>
> If the target has an inclusive-OR that takes an immediate, then the transformed
> instruction is both shorter and faster.  For those that don't, the immediate
> has to be separately constructed, but this still ends up being faster as the
> immediate construction is not on the critical path.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Note that this is part of another patch series, the additional testcases
> are mutually dependent on the match.pd patch.  As such the tests are added
> there insteadof here.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* config/aarch64/aarch64.md (copysign<GPF:mode>3): Handle
> 	copysign (x, -1).
> 	* config/aarch64/aarch64-simd.md (copysign<mode>3): Likewise.
> 	* config/aarch64/aarch64-sve.md (copysign<mode>3): Likewise.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 25a1e4e8ecf767636c0ff3cdab6cad6e1482f73e..a78e77dcc3473445108b06c50f9c28a8369f3e3f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -754,15 +754,33 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
>  (define_expand "copysign<mode>3"
>    [(match_operand:VHSDF 0 "register_operand")
>     (match_operand:VHSDF 1 "register_operand")
> -   (match_operand:VHSDF 2 "register_operand")]
> +   (match_operand:VHSDF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx v_bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx v_bitmask = gen_reg_rtx (int_mode);
>    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
>  
>    emit_move_insn (v_bitmask,
>  		  aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
>  						     HOST_WIDE_INT_M1U << bits));
> +
> +  /* copysign (x, -1) should instead be expanded as orr with the sign
> +     bit.  */
> +  if (!REG_P (operands[2]))
> +    {
> +      auto r0
> +	= CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
> +      if (-1 == real_to_integer (r0))

Just restating Andrew's review really, but I think this is safer and
more general as:

  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
  if (GET_CODE (op2_elt) == CONST_DOUBLE
      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))

For vectors, the condition (in principle) is instead whether each
element is individually negative, but we can leave that for later.
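
For concreteness, a minimal sketch of the Advanced SIMD expander body with
that check folded in (based on the posted patch plus the suggestion above,
not necessarily the final committed form):

      if (!REG_P (operands[2]))
	{
	  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
	  if (GET_CODE (op2_elt) == CONST_DOUBLE
	      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
	    {
	      /* Any constant with the sign bit set only needs its sign
		 copied, so expand to an ORR with the sign-bit mask.  */
	      emit_insn (gen_ior<v_int_equiv>3 (
		lowpart_subreg (int_mode, operands[0], <MODE>mode),
		lowpart_subreg (int_mode, operands[1], <MODE>mode),
		v_bitmask));
	      DONE;
	    }
	}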

OK with that change if the patch to reverse the current
(copysign x -1) -> (fneg (fabs x)) canonicalisation is accepted.
(As said, I'm nervous that it'll regress powerpc.)

Thanks,
Richard

> +	{
> +	  emit_insn (gen_ior<v_int_equiv>3 (
> +	    lowpart_subreg (int_mode, operands[0], <MODE>mode),
> +	    lowpart_subreg (int_mode, operands[1], <MODE>mode), v_bitmask));
> +	  DONE;
> +	}
> +    }
> +
> +  operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], v_bitmask,
>  					 operands[2], operands[1]));
>    DONE;
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 5a652d8536a0ef9461f40da7b22834e683e73ceb..071400c820a5b106ddf9dc9faebb117975d74ea0 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6387,7 +6387,7 @@ (define_insn "*<optab><mode>3"
>  (define_expand "copysign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> -   (match_operand:SVE_FULL_F 2 "register_operand")]
> +   (match_operand:SVE_FULL_F 2 "nonmemory_operand")]
>    "TARGET_SVE"
>    {
>      rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> @@ -6398,11 +6398,26 @@ (define_expand "copysign<mode>3"
>      rtx arg1 = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
>      rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
>  
> -    emit_insn (gen_and<v_int_equiv>3
> -	       (sign, arg2,
> -		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> -						   HOST_WIDE_INT_M1U
> -						   << bits)));
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +					   HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[2]))
> +      {
> +	auto r0
> +	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
> +	if (-1 == real_to_integer (r0))
> +	  {
> +	    emit_insn (gen_ior<v_int_equiv>3 (int_res, arg1, v_sign_bitmask));
> +	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +	    DONE;
> +	  }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[2]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg2, v_sign_bitmask));
>      emit_insn (gen_and<v_int_equiv>3
>  	       (mant, arg1,
>  		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 24349ecdbbab875f21975f116732a9e53762d4c1..d6c581ad81615b4feb095391cbcf4f5b78fa72f1 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -6940,12 +6940,25 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>  (define_expand "copysign<GPF:mode>3"
>    [(match_operand:GPF 0 "register_operand")
>     (match_operand:GPF 1 "register_operand")
> -   (match_operand:GPF 2 "register_operand")]
> +   (match_operand:GPF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx bitmask = gen_reg_rtx (int_mode);
>    emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
>  				    << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
> +  /* copysign (x, -1) should instead be expanded as orr with the sign
> +     bit.  */
> +  auto r0 = CONST_DOUBLE_REAL_VALUE (operands[2]);
> +  if (-1 == real_to_integer (r0))
> +    {
> +      emit_insn (gen_ior<v_int_equiv>3 (
> +	lowpart_subreg (int_mode, operands[0], <MODE>mode),
> +	lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
> +      DONE;
> +    }
> +
> +  operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
>  				       bitmask));
>    DONE;
  
