[16/35] arm: Add integer vector overloading of vsubq_x instrinsic
Checks
Commit Message
From: Stam Markianos-Wright <stam.markianos-wright@arm.com>
In the past we had only defined the vsubq_x generic overload of the
vsubq_x_* intrinsics for float vector types. This would cause them
to fall back to the `__ARM_undef` failure state if they was called
through the generic version.
This patch simply adds these overloads.
gcc/ChangeLog:
* config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
(__arm_vsubq_x Integer): New.
---
gcc/config/arm/arm_mve.h | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
Comments
On 11/17/22 17:37, Andrea Corallo via Gcc-patches wrote:
> From: Stam Markianos-Wright <stam.markianos-wright@arm.com>
>
> In the past we had only defined the vsubq_x generic overload of the
> vsubq_x_* intrinsics for float vector types. This would cause them
> to fall back to the `__ARM_undef` failure state if they was called
> through the generic version.
> This patch simply adds these overloads.
>
> gcc/ChangeLog:
>
> * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
> (__arm_vsubq_x Integer): New.
Hi Stam,
To hopefully help Kyrill in the review, I think this fix is tested by
patch #19, where we now have
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */
(this line explains why this bug was not noticed so far)
Thanks,
Christophe
> ---
> gcc/config/arm/arm_mve.h | 28 ++++++++++++++++++++++++++++
> 1 file changed, 28 insertions(+)
>
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index f6b42dc3fab..09167ec118e 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -38259,6 +38259,18 @@ extern void *__ARM_undef;
> #define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
> __typeof(p2) __p2 = (p2); \
> _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, int), p3), \
> int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \
> int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \
> int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(p2, double), p3), \
> @@ -40223,6 +40235,22 @@ extern void *__ARM_undef;
> int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
> int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
>
> +#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
> + __typeof(p2) __p2 = (p2); \
> + _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, int), p3));})
> +
> #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
> _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
> int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1), \
Christophe Lyon <christophe.lyon@arm.com> writes:
> On 11/17/22 17:37, Andrea Corallo via Gcc-patches wrote:
>> From: Stam Markianos-Wright <stam.markianos-wright@arm.com>
>> In the past we had only defined the vsubq_x generic overload of the
>> vsubq_x_* intrinsics for float vector types. This would cause them
>> to fall back to the `__ARM_undef` failure state if they was called
>> through the generic version.
>> This patch simply adds these overloads.
>> gcc/ChangeLog:
>> * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
>> (__arm_vsubq_x Integer): New.
>
> Hi Stam,
>
> To hopefully help Kyrill in the review, I think this fix is tested by
> patch #19, where we now have
> +/* { dg-final { scan-assembler-not "__ARM_undef" } } */
> (this line explains why this bug was not noticed so far)
>
> Thanks,
>
> Christophe
Exactly
PS also the fact that now tests are 'check-function-bodies' should catch
that.
Thanks
Andrea
> -----Original Message-----
> From: Andrea Corallo <andrea.corallo@arm.com>
> Sent: Thursday, November 17, 2022 4:38 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Stam Markianos-Wright <Stam.Markianos-
> Wright@arm.com>
> Subject: [PATCH 16/35] arm: Add integer vector overloading of vsubq_x
> instrinsic
>
> From: Stam Markianos-Wright <stam.markianos-wright@arm.com>
>
> In the past we had only defined the vsubq_x generic overload of the
> vsubq_x_* intrinsics for float vector types. This would cause them
> to fall back to the `__ARM_undef` failure state if they was called
> through the generic version.
> This patch simply adds these overloads.
Ok.
Thanks,
Kyrill
>
> gcc/ChangeLog:
>
> * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
> (__arm_vsubq_x Integer): New.
> ---
> gcc/config/arm/arm_mve.h | 28 ++++++++++++++++++++++++++++
> 1 file changed, 28 insertions(+)
>
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index f6b42dc3fab..09167ec118e 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -38259,6 +38259,18 @@ extern void *__ARM_undef;
> #define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
> __typeof(p2) __p2 = (p2); \
> _Generic( (int
> (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
> __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce(__p2, int8x16_t), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
> __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce(__p2, int16x8_t), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
> __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce(__p2, int32x4_t), p3), \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
> __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce(__p2, uint8x16_t), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
> __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce(__p2, uint16x8_t), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
> __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce(__p2, uint32x4_t), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
> int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]:
> __arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t),
> __ARM_mve_coerce(__p2, float16x8_t), p3), \
> int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]:
> __arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t),
> __ARM_mve_coerce(__p2, float32x4_t), p3), \
> int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]:
> __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t),
> __ARM_mve_coerce2(p2, double), p3), \
> @@ -40223,6 +40235,22 @@ extern void *__ARM_undef;
> int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16
> (__ARM_mve_coerce1(p0, uint16_t *)), \
> int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32
> (__ARM_mve_coerce1(p0, uint32_t *))))
>
> +#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
> + __typeof(p2) __p2 = (p2); \
> + _Generic( (int
> (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
> __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce(__p2, int8x16_t), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
> __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce(__p2, int16x8_t), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
> __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce(__p2, int32x4_t), p3), \
> + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
> __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce(__p2, uint8x16_t), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
> __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce(__p2, uint16x8_t), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
> __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce(__p2, uint32x4_t), p3), \
> + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce3(p2, int), p3));})
> +
> #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
> _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
> int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8
> (__ARM_mve_coerce(__p0, int8x16_t), p1), \
> --
> 2.25.1
@@ -38259,6 +38259,18 @@ extern void *__ARM_undef;
#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
__typeof(p2) __p2 = (p2); \
_Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
+ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
+ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
+ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
+ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
+ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \
+ int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, int), p3), \
int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \
int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \
int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(p2, double), p3), \
@@ -40223,6 +40235,22 @@ extern void *__ARM_undef;
int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
+#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
+ __typeof(p2) __p2 = (p2); \
+ _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
+ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
+ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \
+ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \
+ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \
+ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \
+ int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, int), p3));})
+
#define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
_Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1), \