[6/6] arm: [MVE intrinsics] rework vldq1 vst1q

Message ID 20231116152617.2193377-6-christophe.lyon@linaro.org
State Unresolved
Headers
Series [1/6] arm: Fix arm_simd_types and MVE scalar_types |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Christophe Lyon Nov. 16, 2023, 3:26 p.m. UTC
  Implement vld1q, vst1q using the new MVE builtins framework.

2023-11-16  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm-mve-builtins-base.cc (vld1_impl, vld1q)
	(vst1_impl, vst1q): New.
	* config/arm/arm-mve-builtins-base.def (vld1q, vst1q): New.
	* config/arm/arm-mve-builtins-base.h (vld1q, vst1q): New.
	* config/arm/arm_mve.h
	(vld1q): Delete.
	(vst1q): Delete.
	(vld1q_s8): Delete.
	(vld1q_s32): Delete.
	(vld1q_s16): Delete.
	(vld1q_u8): Delete.
	(vld1q_u32): Delete.
	(vld1q_u16): Delete.
	(vld1q_f32): Delete.
	(vld1q_f16): Delete.
	(vst1q_f32): Delete.
	(vst1q_f16): Delete.
	(vst1q_s8): Delete.
	(vst1q_s32): Delete.
	(vst1q_s16): Delete.
	(vst1q_u8): Delete.
	(vst1q_u32): Delete.
	(vst1q_u16): Delete.
	(__arm_vld1q_s8): Delete.
	(__arm_vld1q_s32): Delete.
	(__arm_vld1q_s16): Delete.
	(__arm_vld1q_u8): Delete.
	(__arm_vld1q_u32): Delete.
	(__arm_vld1q_u16): Delete.
	(__arm_vst1q_s8): Delete.
	(__arm_vst1q_s32): Delete.
	(__arm_vst1q_s16): Delete.
	(__arm_vst1q_u8): Delete.
	(__arm_vst1q_u32): Delete.
	(__arm_vst1q_u16): Delete.
	(__arm_vld1q_f32): Delete.
	(__arm_vld1q_f16): Delete.
	(__arm_vst1q_f32): Delete.
	(__arm_vst1q_f16): Delete.
	(__arm_vld1q): Delete.
	(__arm_vst1q): Delete.
	* config/arm/mve.md (mve_vld1q_f<mode>): Rename into ...
	(@mve_vld1q_f<mode>): ... this.
	(mve_vld1q_<supf><mode>): Rename into ...
	(@mve_vld1q_<supf><mode>) ... this.
	(mve_vst1q_f<mode>): Rename into ...
	(@mve_vst1q_f<mode>): ... this.
	(mve_vst1q_<supf><mode>): Rename into ...
	(@mve_vst1q_<supf><mode>) ... this.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |  58 +++++
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +-
 gcc/config/arm/arm_mve.h                 | 282 -----------------------
 gcc/config/arm/mve.md                    |   8 +-
 5 files changed, 69 insertions(+), 287 deletions(-)
  

Patch

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc
index 5478cac8aeb..cfe1b954a29 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -83,6 +83,62 @@  class vuninitializedq_impl : public quiet<function_base>
   }
 };
 
+class vld1_impl : public full_width_access
+{
+public:
+  unsigned int
+  call_properties (const function_instance &) const override
+  {
+    return CP_READ_MEMORY;
+  }
+
+  rtx
+  expand (function_expander &e) const override
+  {
+    insn_code icode;
+    if (e.type_suffix (0).float_p)
+      icode = code_for_mve_vld1q_f(e.vector_mode (0));
+    else
+      {
+	if (e.type_suffix (0).unsigned_p)
+	  icode = code_for_mve_vld1q(VLD1Q_U,
+				     e.vector_mode (0));
+	else
+	  icode = code_for_mve_vld1q(VLD1Q_S,
+				     e.vector_mode (0));
+      }
+    return e.use_contiguous_load_insn (icode);
+  }
+};
+
+class vst1_impl : public full_width_access
+{
+public:
+  unsigned int
+  call_properties (const function_instance &) const override
+  {
+    return CP_WRITE_MEMORY;
+  }
+
+  rtx
+  expand (function_expander &e) const override
+  {
+    insn_code icode;
+    if (e.type_suffix (0).float_p)
+      icode = code_for_mve_vst1q_f(e.vector_mode (0));
+    else
+      {
+	if (e.type_suffix (0).unsigned_p)
+	  icode = code_for_mve_vst1q(VST1Q_U,
+				     e.vector_mode (0));
+	else
+	  icode = code_for_mve_vst1q(VST1Q_S,
+				     e.vector_mode (0));
+      }
+    return e.use_contiguous_store_insn (icode);
+  }
+};
+
 } /* end anonymous namespace */
 
 namespace arm_mve {
@@ -290,6 +346,7 @@  FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, VFMASQ_N_
 FUNCTION (vfmsq, unspec_mve_function_exact_insn, (-1, -1, VFMSQ_F, -1, -1, -1, -1, -1, VFMSQ_M_F, -1, -1, -1))
 FUNCTION_WITH_M_N_NO_F (vhaddq, VHADDQ)
 FUNCTION_WITH_M_N_NO_F (vhsubq, VHSUBQ)
+FUNCTION (vld1q, vld1_impl,)
 FUNCTION_PRED_P_S (vmaxavq, VMAXAVQ)
 FUNCTION_WITHOUT_N_NO_U_F (vmaxaq, VMAXAQ)
 FUNCTION_ONLY_F (vmaxnmaq, VMAXNMAQ)
@@ -405,6 +462,7 @@  FUNCTION_ONLY_N_NO_F (vshrntq, VSHRNTQ)
 FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ)
 FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ)
 FUNCTION_ONLY_N_NO_F (vsriq, VSRIQ)
+FUNCTION (vst1q, vst1_impl,)
 FUNCTION_WITH_RTX_M_N (vsubq, MINUS, VSUBQ)
 FUNCTION (vuninitializedq, vuninitializedq_impl,)
 
diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def
index 01dfbdef8a3..16879246237 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -47,6 +47,7 @@  DEF_MVE_FUNCTION (vhaddq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vhcaddq_rot270, binary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vld1q, load, all_integer, none)
 DEF_MVE_FUNCTION (vmaxaq, binary_maxamina, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vmaxavq, binary_maxavminav, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmaxq, binary, all_integer, mx_or_none)
@@ -150,6 +151,7 @@  DEF_MVE_FUNCTION (vshrntq, binary_rshift_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vsriq, ternary_rshift, all_integer, m_or_none)
+DEF_MVE_FUNCTION (vst1q, store, all_integer, none)
 DEF_MVE_FUNCTION (vsubq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vuninitializedq, inherent, all_integer_with_64, none)
 #undef REQUIRES_FLOAT
@@ -182,6 +184,7 @@  DEF_MVE_FUNCTION (veorq, binary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vfmaq, ternary_opt_n, all_float, m_or_none)
 DEF_MVE_FUNCTION (vfmasq, ternary_n, all_float, m_or_none)
 DEF_MVE_FUNCTION (vfmsq, ternary, all_float, m_or_none)
+DEF_MVE_FUNCTION (vld1q, load, all_float, none)
 DEF_MVE_FUNCTION (vmaxnmaq, binary, all_float, m_or_none)
 DEF_MVE_FUNCTION (vmaxnmavq, binary_maxvminv, all_float, p_or_none)
 DEF_MVE_FUNCTION (vmaxnmq, binary, all_float, mx_or_none)
@@ -203,6 +206,7 @@  DEF_MVE_FUNCTION (vrndnq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vrndpq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vrndq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vrndxq, unary, all_float, mx_or_none)
+DEF_MVE_FUNCTION (vst1q, store, all_float, none)
 DEF_MVE_FUNCTION (vsubq, binary_opt_n, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vuninitializedq, inherent, all_float, none)
 #undef REQUIRES_FLOAT
diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h
index c574c32ac53..8c7e5fe5c3e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -63,6 +63,7 @@  extern const function_base *const vhaddq;
 extern const function_base *const vhcaddq_rot270;
 extern const function_base *const vhcaddq_rot90;
 extern const function_base *const vhsubq;
+extern const function_base *const vld1q;
 extern const function_base *const vmaxaq;
 extern const function_base *const vmaxavq;
 extern const function_base *const vmaxnmaq;
@@ -103,8 +104,8 @@  extern const function_base *const vmovnbq;
 extern const function_base *const vmovntq;
 extern const function_base *const vmulhq;
 extern const function_base *const vmullbq_int;
-extern const function_base *const vmulltq_int;
 extern const function_base *const vmullbq_poly;
+extern const function_base *const vmulltq_int;
 extern const function_base *const vmulltq_poly;
 extern const function_base *const vmulq;
 extern const function_base *const vmvnq;
@@ -178,6 +179,7 @@  extern const function_base *const vshrntq;
 extern const function_base *const vshrq;
 extern const function_base *const vsliq;
 extern const function_base *const vsriq;
+extern const function_base *const vst1q;
 extern const function_base *const vsubq;
 extern const function_base *const vuninitializedq;
 
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index b82d94e59bd..cc027f9cbb5 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -56,7 +56,6 @@ 
 #define vstrbq_scatter_offset_p(__base, __offset, __value, __p) __arm_vstrbq_scatter_offset_p(__base, __offset, __value, __p)
 #define vstrwq_scatter_base_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_p(__addr, __offset, __value, __p)
 #define vldrbq_gather_offset_z(__base, __offset, __p) __arm_vldrbq_gather_offset_z(__base, __offset, __p)
-#define vld1q(__base) __arm_vld1q(__base)
 #define vldrhq_gather_offset(__base, __offset) __arm_vldrhq_gather_offset(__base, __offset)
 #define vldrhq_gather_offset_z(__base, __offset, __p) __arm_vldrhq_gather_offset_z(__base, __offset, __p)
 #define vldrhq_gather_shifted_offset(__base, __offset) __arm_vldrhq_gather_shifted_offset(__base, __offset)
@@ -69,7 +68,6 @@ 
 #define vldrwq_gather_offset_z(__base, __offset, __p) __arm_vldrwq_gather_offset_z(__base, __offset, __p)
 #define vldrwq_gather_shifted_offset(__base, __offset) __arm_vldrwq_gather_shifted_offset(__base, __offset)
 #define vldrwq_gather_shifted_offset_z(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z(__base, __offset, __p)
-#define vst1q(__addr, __value) __arm_vst1q(__addr, __value)
 #define vstrhq_scatter_offset(__base, __offset, __value) __arm_vstrhq_scatter_offset(__base, __offset, __value)
 #define vstrhq_scatter_offset_p(__base, __offset, __value, __p) __arm_vstrhq_scatter_offset_p(__base, __offset, __value, __p)
 #define vstrhq_scatter_shifted_offset(__base, __offset, __value) __arm_vstrhq_scatter_shifted_offset(__base, __offset, __value)
@@ -346,12 +344,6 @@ 
 #define vldrbq_z_u32(__base, __p) __arm_vldrbq_z_u32(__base, __p)
 #define vldrwq_gather_base_z_u32(__addr,  __offset, __p) __arm_vldrwq_gather_base_z_u32(__addr,  __offset, __p)
 #define vldrwq_gather_base_z_s32(__addr,  __offset, __p) __arm_vldrwq_gather_base_z_s32(__addr,  __offset, __p)
-#define vld1q_s8(__base) __arm_vld1q_s8(__base)
-#define vld1q_s32(__base) __arm_vld1q_s32(__base)
-#define vld1q_s16(__base) __arm_vld1q_s16(__base)
-#define vld1q_u8(__base) __arm_vld1q_u8(__base)
-#define vld1q_u32(__base) __arm_vld1q_u32(__base)
-#define vld1q_u16(__base) __arm_vld1q_u16(__base)
 #define vldrhq_gather_offset_s32(__base, __offset) __arm_vldrhq_gather_offset_s32(__base, __offset)
 #define vldrhq_gather_offset_s16(__base, __offset) __arm_vldrhq_gather_offset_s16(__base, __offset)
 #define vldrhq_gather_offset_u32(__base, __offset) __arm_vldrhq_gather_offset_u32(__base, __offset)
@@ -380,8 +372,6 @@ 
 #define vldrwq_u32(__base) __arm_vldrwq_u32(__base)
 #define vldrwq_z_s32(__base, __p) __arm_vldrwq_z_s32(__base, __p)
 #define vldrwq_z_u32(__base, __p) __arm_vldrwq_z_u32(__base, __p)
-#define vld1q_f32(__base) __arm_vld1q_f32(__base)
-#define vld1q_f16(__base) __arm_vld1q_f16(__base)
 #define vldrhq_f16(__base) __arm_vldrhq_f16(__base)
 #define vldrhq_z_f16(__base, __p) __arm_vldrhq_z_f16(__base, __p)
 #define vldrwq_f32(__base) __arm_vldrwq_f32(__base)
@@ -416,14 +406,6 @@ 
 #define vldrwq_gather_shifted_offset_z_f32(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z_f32(__base, __offset, __p)
 #define vldrwq_gather_shifted_offset_z_s32(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z_s32(__base, __offset, __p)
 #define vldrwq_gather_shifted_offset_z_u32(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z_u32(__base, __offset, __p)
-#define vst1q_f32(__addr, __value) __arm_vst1q_f32(__addr, __value)
-#define vst1q_f16(__addr, __value) __arm_vst1q_f16(__addr, __value)
-#define vst1q_s8(__addr, __value) __arm_vst1q_s8(__addr, __value)
-#define vst1q_s32(__addr, __value) __arm_vst1q_s32(__addr, __value)
-#define vst1q_s16(__addr, __value) __arm_vst1q_s16(__addr, __value)
-#define vst1q_u8(__addr, __value) __arm_vst1q_u8(__addr, __value)
-#define vst1q_u32(__addr, __value) __arm_vst1q_u32(__addr, __value)
-#define vst1q_u16(__addr, __value) __arm_vst1q_u16(__addr, __value)
 #define vstrhq_f16(__addr, __value) __arm_vstrhq_f16(__addr, __value)
 #define vstrhq_scatter_offset_s32( __base, __offset, __value) __arm_vstrhq_scatter_offset_s32( __base, __offset, __value)
 #define vstrhq_scatter_offset_s16( __base, __offset, __value) __arm_vstrhq_scatter_offset_s16( __base, __offset, __value)
@@ -1537,48 +1519,6 @@  __arm_vldrwq_gather_base_z_u32 (uint32x4_t __addr, const int __offset, mve_pred1
   return __builtin_mve_vldrwq_gather_base_z_uv4si (__addr, __offset, __p);
 }
 
-__extension__ extern __inline int8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_s8 (int8_t const * __base)
-{
-  return __builtin_mve_vld1q_sv16qi ((__builtin_neon_qi *) __base);
-}
-
-__extension__ extern __inline int32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_s32 (int32_t const * __base)
-{
-  return __builtin_mve_vld1q_sv4si ((__builtin_neon_si *) __base);
-}
-
-__extension__ extern __inline int16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_s16 (int16_t const * __base)
-{
-  return __builtin_mve_vld1q_sv8hi ((__builtin_neon_hi *) __base);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_u8 (uint8_t const * __base)
-{
-  return __builtin_mve_vld1q_uv16qi ((__builtin_neon_qi *) __base);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_u32 (uint32_t const * __base)
-{
-  return __builtin_mve_vld1q_uv4si ((__builtin_neon_si *) __base);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_u16 (uint16_t const * __base)
-{
-  return __builtin_mve_vld1q_uv8hi ((__builtin_neon_hi *) __base);
-}
-
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrhq_gather_offset_s32 (int16_t const * __base, uint32x4_t __offset)
@@ -1917,48 +1857,6 @@  __arm_vldrwq_gather_shifted_offset_z_u32 (uint32_t const * __base, uint32x4_t __
   return __builtin_mve_vldrwq_gather_shifted_offset_z_uv4si ((__builtin_neon_si *) __base, __offset, __p);
 }
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_s8 (int8_t * __addr, int8x16_t __value)
-{
-  __builtin_mve_vst1q_sv16qi ((__builtin_neon_qi *) __addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_s32 (int32_t * __addr, int32x4_t __value)
-{
-  __builtin_mve_vst1q_sv4si ((__builtin_neon_si *) __addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_s16 (int16_t * __addr, int16x8_t __value)
-{
-  __builtin_mve_vst1q_sv8hi ((__builtin_neon_hi *) __addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_u8 (uint8_t * __addr, uint8x16_t __value)
-{
-  __builtin_mve_vst1q_uv16qi ((__builtin_neon_qi *) __addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_u32 (uint32_t * __addr, uint32x4_t __value)
-{
-  __builtin_mve_vst1q_uv4si ((__builtin_neon_si *) __addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_u16 (uint16_t * __addr, uint16x8_t __value)
-{
-  __builtin_mve_vst1q_uv8hi ((__builtin_neon_hi *) __addr, __value);
-}
-
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrhq_scatter_offset_s32 (int16_t * __base, uint32x4_t __offset, int32x4_t __value)
@@ -4421,20 +4319,6 @@  __arm_vornq_m_f16 (float16x8_t __inactive, float16x8_t __a, float16x8_t __b, mve
   return __builtin_mve_vornq_m_fv8hf (__inactive, __a, __b, __p);
 }
 
-__extension__ extern __inline float32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_f32 (float32_t const * __base)
-{
-  return __builtin_mve_vld1q_fv4sf((__builtin_neon_si *) __base);
-}
-
-__extension__ extern __inline float16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q_f16 (float16_t const * __base)
-{
-  return __builtin_mve_vld1q_fv8hf((__builtin_neon_hi *) __base);
-}
-
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrwq_f32 (float32_t const * __base)
@@ -4547,20 +4431,6 @@  __arm_vstrwq_f32 (float32_t * __addr, float32x4_t __value)
   __builtin_mve_vstrwq_fv4sf ((__builtin_neon_si *) __addr, __value);
 }
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_f32 (float32_t * __addr, float32x4_t __value)
-{
-  __builtin_mve_vst1q_fv4sf ((__builtin_neon_si *) __addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q_f16 (float16_t * __addr, float16x8_t __value)
-{
-  __builtin_mve_vst1q_fv8hf ((__builtin_neon_hi *) __addr, __value);
-}
-
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrhq_f16 (float16_t * __addr, float16x8_t __value)
@@ -5651,48 +5521,6 @@  __arm_vldrbq_gather_offset_z (uint8_t const * __base, uint16x8_t __offset, mve_p
  return __arm_vldrbq_gather_offset_z_u16 (__base, __offset, __p);
 }
 
-__extension__ extern __inline int8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (int8_t const * __base)
-{
- return __arm_vld1q_s8 (__base);
-}
-
-__extension__ extern __inline int32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (int32_t const * __base)
-{
- return __arm_vld1q_s32 (__base);
-}
-
-__extension__ extern __inline int16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (int16_t const * __base)
-{
- return __arm_vld1q_s16 (__base);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (uint8_t const * __base)
-{
- return __arm_vld1q_u8 (__base);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (uint32_t const * __base)
-{
- return __arm_vld1q_u32 (__base);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (uint16_t const * __base)
-{
- return __arm_vld1q_u16 (__base);
-}
-
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrhq_gather_offset (int16_t const * __base, uint32x4_t __offset)
@@ -5917,48 +5745,6 @@  __arm_vldrwq_gather_shifted_offset_z (uint32_t const * __base, uint32x4_t __offs
  return __arm_vldrwq_gather_shifted_offset_z_u32 (__base, __offset, __p);
 }
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (int8_t * __addr, int8x16_t __value)
-{
- __arm_vst1q_s8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (int32_t * __addr, int32x4_t __value)
-{
- __arm_vst1q_s32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (int16_t * __addr, int16x8_t __value)
-{
- __arm_vst1q_s16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (uint8_t * __addr, uint8x16_t __value)
-{
- __arm_vst1q_u8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (uint32_t * __addr, uint32x4_t __value)
-{
- __arm_vst1q_u32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (uint16_t * __addr, uint16x8_t __value)
-{
- __arm_vst1q_u16 (__addr, __value);
-}
-
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrhq_scatter_offset (int16_t * __base, uint32x4_t __offset, int32x4_t __value)
@@ -7809,20 +7595,6 @@  __arm_vornq_m (float16x8_t __inactive, float16x8_t __a, float16x8_t __b, mve_pre
  return __arm_vornq_m_f16 (__inactive, __a, __b, __p);
 }
 
-__extension__ extern __inline float32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (float32_t const * __base)
-{
- return __arm_vld1q_f32 (__base);
-}
-
-__extension__ extern __inline float16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld1q (float16_t const * __base)
-{
- return __arm_vld1q_f16 (__base);
-}
-
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrhq_gather_offset (float16_t const * __base, uint16x8_t __offset)
@@ -7893,20 +7665,6 @@  __arm_vstrwq (float32_t * __addr, float32x4_t __value)
  __arm_vstrwq_f32 (__addr, __value);
 }
 
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (float32_t * __addr, float32x4_t __value)
-{
- __arm_vst1q_f32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst1q (float16_t * __addr, float16x8_t __value)
-{
- __arm_vst1q_f16 (__addr, __value);
-}
-
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrhq (float16_t * __addr, float16x8_t __value)
@@ -8670,17 +8428,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vornq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \
   int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vornq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3));})
 
-#define __arm_vld1q(p0) (\
-  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *))))
-
 #define __arm_vld1q_z(p0,p1) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
   int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), p1), \
@@ -8792,17 +8539,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x2_t]: __arm_vst2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x2_t)), \
   int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x2_t]: __arm_vst2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x2_t)));})
 
-#define __arm_vst1q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vst1q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t)), \
-  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vst1q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t)), \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vst1q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vst1q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vst1q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vst1q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8_t]: __arm_vst1q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8_t)), \
-  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vst1q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4_t)));})
-
 #define __arm_vstrhq(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
   int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrhq_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t)), \
@@ -9149,15 +8885,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32 (p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
   int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_p_u32 (p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));})
 
-#define __arm_vld1q(p0) (\
-  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *))))
-
 #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
   int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrhq_gather_offset_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
@@ -9206,15 +8933,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vldrwq_gather_shifted_offset_z_s32 (__ARM_mve_coerce_s32_ptr(__p0, int32_t *), p1, p2), \
   int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vldrwq_gather_shifted_offset_z_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));})
 
-#define __arm_vst1q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vst1q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t)), \
-  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vst1q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t)), \
-  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vst1q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vst1q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vst1q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vst1q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
-
 #define __arm_vst1q_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
   int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vst1q_p_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t), p2), \
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 366cec0812a..b0d3443da9c 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -3690,7 +3690,7 @@  (define_insn "mve_vldrwq_z_<supf>v4si"
 }
   [(set_attr "length" "8")])
 
-(define_expand "mve_vld1q_f<mode>"
+(define_expand "@mve_vld1q_f<mode>"
   [(match_operand:MVE_0 0 "s_register_operand")
    (unspec:MVE_0 [(match_operand:<MVE_CNVT> 1 "mve_memory_operand")] VLD1Q_F)
   ]
@@ -3700,7 +3700,7 @@  (define_expand "mve_vld1q_f<mode>"
   DONE;
 })
 
-(define_expand "mve_vld1q_<supf><mode>"
+(define_expand "@mve_vld1q_<supf><mode>"
   [(match_operand:MVE_2 0 "s_register_operand")
    (unspec:MVE_2 [(match_operand:MVE_2 1 "mve_memory_operand")] VLD1Q)
   ]
@@ -4408,7 +4408,7 @@  (define_insn "mve_vstrwq_<supf>v4si"
 }
   [(set_attr "length" "4")])
 
-(define_expand "mve_vst1q_f<mode>"
+(define_expand "@mve_vst1q_f<mode>"
   [(match_operand:<MVE_CNVT> 0 "mve_memory_operand")
    (unspec:<MVE_CNVT> [(match_operand:MVE_0 1 "s_register_operand")] VST1Q_F)
   ]
@@ -4418,7 +4418,7 @@  (define_expand "mve_vst1q_f<mode>"
   DONE;
 })
 
-(define_expand "mve_vst1q_<supf><mode>"
+(define_expand "@mve_vst1q_<supf><mode>"
   [(match_operand:MVE_2 0 "mve_memory_operand")
    (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand")] VST1Q)
   ]