RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization

Message ID 20230513114421.196081-1-juzhe.zhong@rivai.ai
State Accepted
Headers
Series RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai May 13, 2023, 11:44 a.m. UTC
  From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch support vector alignement for RVV auto-vectorization.

Consider this following case:
void __attribute__((noinline, noclone))
f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

Before this patch:
	ble	a3,zero,.L1
	srli	a4,a1,2
	negw	a4,a4
	andi	a5,a4,3
	sext.w	a3,a3
	beq	a5,zero,.L3
	lw	a7,0(a1)
	lw	a6,0(a2)
	andi	a4,a4,2
	addw	a6,a6,a7
	sw	a6,0(a0)
	beq	a4,zero,.L3
	lw	a7,4(a1)
	lw	a4,4(a2)
	li	a6,3
	addw	a4,a4,a7
	sw	a4,4(a0)
	bne	a5,a6,.L3
	lw	a6,8(a2)
	lw	a4,8(a1)
	addw	a4,a4,a6
	sw	a4,8(a0)
.L3:
	subw	a3,a3,a5
	slli	a4,a3,32
	csrr	a6,vlenb
	srli	a4,a4,32
	srli	a6,a6,2
	slli	a3,a5,2
	mv	a5,a4
	bgtu	a4,a6,.L17
.L5:
	csrr	a6,vlenb
	add	a1,a1,a3
	add	a2,a2,a3
	add	a0,a0,a3
	srli	a7,a6,2
	li	a3,0
.L8:
	vsetvli	zero,a5,e32,m1,ta,ma
	vle32.v	v1,0(a1)
	vle32.v	v2,0(a2)
	vsetvli	t1,zero,e32,m1,ta,ma
	add	a3,a3,a7
	vadd.vv	v1,v1,v2
	vsetvli	zero,a5,e32,m1,ta,ma
	vse32.v	v1,0(a0)
	mv	a5,a4
	bleu	a4,a3,.L6
	mv	a5,a3
.L6:
	sub	a5,a4,a5
	bleu	a5,a7,.L7
	mv	a5,a7
.L7:
	add	a1,a1,a6
	add	a2,a2,a6
	add	a0,a0,a6
	bne	a5,zero,.L8
.L1:
	ret
.L17:
	mv	a5,a6
	j	.L5

After this patch:
f:
        ble     a3,zero,.L1
        csrr    a4,vlenb
        srli    a4,a4,2
        mv      a5,a3
        bgtu    a3,a4,.L9
.L3:
        csrr    a6,vlenb
        li      a4,0
        srli    a7,a6,2
.L6:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli t1,zero,e32,m1,ta,ma
        add     a4,a4,a7
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a0)
        mv      a5,a3
        bleu    a3,a4,.L4
        mv      a5,a4
.L4:
        sub     a5,a3,a5
        bleu    a5,a7,.L5
        mv      a5,a7
.L5:
        add     a0,a0,a6
        add     a2,a2,a6
        add     a1,a1,a6
        bne     a5,zero,.L6
.L1:
        ret
.L9:
        mv      a5,a4
        j       .L3

gcc/ChangeLog:

        * config/riscv/riscv.cc (riscv_vectorize_preferred_vector_alignment): New function.
        (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
        * gcc.target/riscv/rvv/autovec/align-1.c: New test.

---
 gcc/config/riscv/riscv.cc                     | 22 +++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/align-1.c    | 12 ++++++++++
 .../riscv/rvv/autovec/binop/shift-rv32gcv.c   | 10 +++++----
 3 files changed, 40 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
  

Comments

juzhe.zhong@rivai.ai May 15, 2023, 12:35 a.m. UTC | #1
The implementation is copied directly from ARM SVE.
I applied in my downstream GCC for a year and there is no issue so far.
Ok for trunk ?


juzhe.zhong@rivai.ai
 
From: juzhe.zhong
Date: 2023-05-13 19:44
To: gcc-patches
CC: kito.cheng; palmer; rdapp.gcc; jeffreyalaw; Juzhe-Zhong
Subject: [PATCH] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
 
This patch support vector alignement for RVV auto-vectorization.
 
Consider this following case:
void __attribute__((noinline, noclone))
f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}
 
Before this patch:
ble a3,zero,.L1
srli a4,a1,2
negw a4,a4
andi a5,a4,3
sext.w a3,a3
beq a5,zero,.L3
lw a7,0(a1)
lw a6,0(a2)
andi a4,a4,2
addw a6,a6,a7
sw a6,0(a0)
beq a4,zero,.L3
lw a7,4(a1)
lw a4,4(a2)
li a6,3
addw a4,a4,a7
sw a4,4(a0)
bne a5,a6,.L3
lw a6,8(a2)
lw a4,8(a1)
addw a4,a4,a6
sw a4,8(a0)
.L3:
subw a3,a3,a5
slli a4,a3,32
csrr a6,vlenb
srli a4,a4,32
srli a6,a6,2
slli a3,a5,2
mv a5,a4
bgtu a4,a6,.L17
.L5:
csrr a6,vlenb
add a1,a1,a3
add a2,a2,a3
add a0,a0,a3
srli a7,a6,2
li a3,0
.L8:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a1)
vle32.v v2,0(a2)
vsetvli t1,zero,e32,m1,ta,ma
add a3,a3,a7
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
mv a5,a4
bleu a4,a3,.L6
mv a5,a3
.L6:
sub a5,a4,a5
bleu a5,a7,.L7
mv a5,a7
.L7:
add a1,a1,a6
add a2,a2,a6
add a0,a0,a6
bne a5,zero,.L8
.L1:
ret
.L17:
mv a5,a6
j .L5
 
After this patch:
f:
        ble     a3,zero,.L1
        csrr    a4,vlenb
        srli    a4,a4,2
        mv      a5,a3
        bgtu    a3,a4,.L9
.L3:
        csrr    a6,vlenb
        li      a4,0
        srli    a7,a6,2
.L6:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli t1,zero,e32,m1,ta,ma
        add     a4,a4,a7
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a0)
        mv      a5,a3
        bleu    a3,a4,.L4
        mv      a5,a4
.L4:
        sub     a5,a3,a5
        bleu    a5,a7,.L5
        mv      a5,a7
.L5:
        add     a0,a0,a6
        add     a2,a2,a6
        add     a1,a1,a6
        bne     a5,zero,.L6
.L1:
        ret
.L9:
        mv      a5,a4
        j       .L3
 
gcc/ChangeLog:
 
        * config/riscv/riscv.cc (riscv_vectorize_preferred_vector_alignment): New function.
        (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.
 
gcc/testsuite/ChangeLog:
 
        * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
        * gcc.target/riscv/rvv/autovec/align-1.c: New test.
 
---
gcc/config/riscv/riscv.cc                     | 22 +++++++++++++++++++
.../gcc.target/riscv/rvv/autovec/align-1.c    | 12 ++++++++++
.../riscv/rvv/autovec/binop/shift-rv32gcv.c   | 10 +++++----
3 files changed, 40 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..a5776a550b2 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7499,6 +7499,24 @@ riscv_preferred_simd_mode (scalar_mode mode)
   return word_mode;
}
+/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
+
+static poly_uint64
+riscv_vectorize_preferred_vector_alignment (const_tree type)
+{
+  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
+    {
+      /* If the length of the vector is a fixed power of 2, try to align
+ to that length, otherwise don't try to align at all.  */
+      HOST_WIDE_INT result;
+      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
+   || !pow2p_hwi (result))
+ result = TYPE_ALIGN (TREE_TYPE (type));
+      return result;
+    }
+  return TYPE_ALIGN (type);
+}
+
/* Initialize the GCC target structure.  */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7771,6 +7789,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
+#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
+#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
+  riscv_vectorize_preferred_vector_alignment
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
new file mode 100644
index 00000000000..14201e1f7e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
index da0f79a1cf0..d98100b3276 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
@@ -4,8 +4,10 @@
#include "shift-template.h"
/* TODO: For int16_t and uint16_t we need widening/promotion patterns.
-   Therefore, expect only 4 vsll.vv instead of 6 for now.  */
+   We don't check the assembler number since lacking patterns make
+   auto-vectorization inconsistent in LMUL = 1/2/4/8.  */
+
+/* { dg-final { scan-assembler {\tvsll\.vv} } } */
+/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
+/* { dg-final { scan-assembler {\tvsra\.vv} } } */
-/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
-/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
-/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */
-- 
2.36.1
  
Kito Cheng May 15, 2023, 2:38 a.m. UTC | #2
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index de578b5b899..a5776a550b2 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7499,6 +7499,24 @@ riscv_preferred_simd_mode (scalar_mode mode)
>    return word_mode;
> }
> +/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
> +
> +static poly_uint64
> +riscv_vectorize_preferred_vector_alignment (const_tree type)
> +{
> +  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
> +    {
> +      /* If the length of the vector is a fixed power of 2, try to align
> + to that length, otherwise don't try to align at all.  */
> +      HOST_WIDE_INT result;
> +      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
> +   || !pow2p_hwi (result))
> + result = TYPE_ALIGN (TREE_TYPE (type));
> +      return result;

Why we block VLS here and then relaxed by different way in another patch?
  
juzhe.zhong@rivai.ai May 15, 2023, 2:47 a.m. UTC | #3
Oh, I see. I try this way:
static poly_uint64
riscv_vectorize_preferred_vector_alignment (const_tree type)
{
  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
    return TYPE_ALIGN (TREE_TYPE (type));
  return TYPE_ALIGN (type);
}

And disable both TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
and TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST

Both VLA and VLS works well after the testing of align-1.c && align-2.c
Is the way you prefer ?

But I am not sure whether it's safe since both my downstream GCC and ARM SVE didn't use this approach.
I am Ok of both for now. So do you prefer the second approach ? Then I can send V2 patch.



juzhe.zhong@rivai.ai
 
From: Kito Cheng
Date: 2023-05-15 10:38
To: juzhe.zhong@rivai.ai
CC: gcc-patches; palmer; Robin Dapp; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index de578b5b899..a5776a550b2 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7499,6 +7499,24 @@ riscv_preferred_simd_mode (scalar_mode mode)
>    return word_mode;
> }
> +/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
> +
> +static poly_uint64
> +riscv_vectorize_preferred_vector_alignment (const_tree type)
> +{
> +  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
> +    {
> +      /* If the length of the vector is a fixed power of 2, try to align
> + to that length, otherwise don't try to align at all.  */
> +      HOST_WIDE_INT result;
> +      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
> +   || !pow2p_hwi (result))
> + result = TYPE_ALIGN (TREE_TYPE (type));
> +      return result;
 
Why we block VLS here and then relaxed by different way in another patch?
  
Kito Cheng May 15, 2023, 2:56 a.m. UTC | #4
On Mon, May 15, 2023 at 10:48 AM juzhe.zhong@rivai.ai
<juzhe.zhong@rivai.ai> wrote:
>
> Oh, I see. I try this way:
> static poly_uint64
> riscv_vectorize_preferred_vector_alignment (const_tree type)
> {
>   if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
>     return TYPE_ALIGN (TREE_TYPE (type));
>   return TYPE_ALIGN (type);
> }
>
> And disable both TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
> and TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
>
> Both VLA and VLS works well after the testing of align-1.c && align-2.c
> Is the way you prefer ?
>
> But I am not sure whether it's safe since both my downstream GCC and ARM SVE didn't use this approach.
> I am Ok of both for now. So do you prefer the second approach ? Then I can send V2 patch.

I am not sure it's safe either, but I am not convinced we need
TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE for improving the
VLS-VLMAX alignment, so I would prefer just using
TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT for now unless we found
it's not safe or we know why it's better way.
  
juzhe.zhong@rivai.ai May 15, 2023, 3:17 a.m. UTC | #5
Hi, address comment and send V2 patch here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618477.html 

Thanks.


juzhe.zhong@rivai.ai
 
From: Kito Cheng
Date: 2023-05-15 10:56
To: juzhe.zhong@rivai.ai
CC: gcc-patches; palmer; Robin Dapp; jeffreyalaw
Subject: Re: Re: [PATCH] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization
On Mon, May 15, 2023 at 10:48 AM juzhe.zhong@rivai.ai
<juzhe.zhong@rivai.ai> wrote:
>
> Oh, I see. I try this way:
> static poly_uint64
> riscv_vectorize_preferred_vector_alignment (const_tree type)
> {
>   if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
>     return TYPE_ALIGN (TREE_TYPE (type));
>   return TYPE_ALIGN (type);
> }
>
> And disable both TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
> and TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
>
> Both VLA and VLS works well after the testing of align-1.c && align-2.c
> Is the way you prefer ?
>
> But I am not sure whether it's safe since both my downstream GCC and ARM SVE didn't use this approach.
> I am Ok of both for now. So do you prefer the second approach ? Then I can send V2 patch.
 
I am not sure it's safe either, but I am not convinced we need
TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE for improving the
VLS-VLMAX alignment, so I would prefer just using
TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT for now unless we found
it's not safe or we know why it's better way.
  
Robin Dapp May 15, 2023, 7:19 a.m. UTC | #6
Hi,

we need to discern what we want to achieve here.  The goal might
be to prevent the vectorizer from performing peeling or versioning
for alignment.  I realize the peeling code looks ugly but it's
actually for a good cause when the target does not support
misaligned vector access or only with severe penalty.

So while I was still commenting on the other V1 patch I realized
that V2 is already in now :)

>     return TYPE_ALIGN (TREE_TYPE (type));

I'm not sure this is what we want or rather what the hardware wants.
The preferred alignment of a VNx4SI (or larger) is 32 bit/4 bytes
then - even 1 byte for a QI vector?
Of course, we don't peel for alignment then anymore but we
might actually want to, depending on the hardware.

I mean as a temporary workaround it might be acceptable but do we
actually need to get rid of the peeling code?  If the hardware
supports unaligned access without penalty we should specify in
the vectorization costs

e.g.
      case unaligned_load:
      case unaligned_store:
	return 1;

(Your "return 1" for riscv_builtin_vectorization_cost will do
that as well of course).

and the peeling cost check should do the rest (i.e. nothing :) ).

So I'd much rather prefer that over the current approach as it
is more localized and will need an mtune-related approach later
anyway.

Regards
 Robin
  
Kito Cheng May 15, 2023, 8:02 a.m. UTC | #7
> we need to discern what we want to achieve here.  The goal might
> be to prevent the vectorizer from performing peeling or versioning
> for alignment.  I realize the peeling code looks ugly but it's
> actually for a good cause when the target does not support
> misaligned vector access or only with severe penalty.

Vector spec says it should support element alignment, so my
understanding is default behavior should be just aligned to
vector-spec said :)

I guess Ju-Zhe might have different thoughts on that, we might need
some more comment from him.


> So I'd much rather prefer that over the current approach as it
> is more localized and will need an mtune-related approach later
> anyway.

I know there is some HW implementation that might be faster if the
address is aligned to 128 bit or 256 bit, and some HW implementation
might only get a few penalties from the first iteration if not aligned
to some alignment.

Anyway those are all mtune-related, so I guess eventually both
riscv_builtin_vectorization_cost and
riscv_vectorize_preferred_vector_alignment should get info from mtune.


>
> Regards
>  Robin
  
juzhe.zhong@rivai.ai May 15, 2023, 8:10 a.m. UTC | #8
After this patch, RVV GCC by default support alignment of RVV modes according to riscv-modes.def.
In riscv-modes.def, we define each RVV modes are element align which is aligned to RVV ISA spec.

If you want to support other alignment, you should add tunning info for this in the future.
And the default behavior in case of alignment which is already in this patch should not be changed in the future.

Thanks.


juzhe.zhong@rivai.ai
 
From: Kito Cheng
Date: 2023-05-15 16:02
To: Robin Dapp
CC: juzhe.zhong@rivai.ai; gcc-patches; palmer; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization
> we need to discern what we want to achieve here.  The goal might
> be to prevent the vectorizer from performing peeling or versioning
> for alignment.  I realize the peeling code looks ugly but it's
> actually for a good cause when the target does not support
> misaligned vector access or only with severe penalty.
 
Vector spec says it should support element alignment, so my
understanding is default behavior should be just aligned to
vector-spec said :)
 
I guess Ju-Zhe might have different thoughts on that, we might need
some more comment from him.
 
 
> So I'd much rather prefer that over the current approach as it
> is more localized and will need an mtune-related approach later
> anyway.
 
I know there is some HW implementation that might be faster if the
address is aligned to 128 bit or 256 bit, and some HW implementation
might only get a few penalties from the first iteration if not aligned
to some alignment.
 
Anyway those are all mtune-related, so I guess eventually both
riscv_builtin_vectorization_cost and
riscv_vectorize_preferred_vector_alignment should get info from mtune.
 
 
>
> Regards
>  Robin
  
Robin Dapp May 15, 2023, 8:58 a.m. UTC | #9
> After this patch, RVV GCC by default support alignment of RVV modes
> according to riscv-modes.def. In riscv-modes.def, we define each RVV
> modes are element align which is aligned to RVV ISA spec.
> 
> If you want to support other alignment, you should add tunning info
> for this in the future. And the default behavior in case of alignment
> which is already in this patch should not be changed in the future.

We're changing the preferred_vector_alignment hook, not the
vector_alignment hook.  The way I see it, we will never peel for alignment
when setting the preferred_alignment to the alignment of a single
element, no matter the cost of misaligned loads/stores.  Is the idea
then to adjust the preferred_alignment hook depending on mtune? 

IMHO it's much more "natural" to set the vectorization costs according to
mtune (done anyway at some point), not change the preferred_alignment and
let the vectorizer decide whether is has enough information on whether
to peel or not.

Maybe Richard can comment here on why the vector_alignment_preferred hook
was originally introduced or what wasn't possible without it (i.e. why not
go via costs).  At some point the vectorizer would peel even if the costs
were low for misaligned accesses but that had already been fixed before the
hook was introduced.

And ABI-wise (and that's what the V spec documents) we are already safe
by using MIN (128, TYPE_SIZE (type)).  We could probably even lower that?

Regards
 Robin
  
juzhe.zhong@rivai.ai May 15, 2023, 10:32 a.m. UTC | #10
No, ARM SVE is 128bit alignment instead of element align (in aarch64-modes.def).

If you want to tune the alignment, you should add tunning info into riscv-modes.def
instead of this target hook.

Thanks.


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-05-15 16:58
To: juzhe.zhong@rivai.ai; Kito.cheng; richard.sandiford
CC: rdapp.gcc; gcc-patches; palmer; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization
> After this patch, RVV GCC by default support alignment of RVV modes
> according to riscv-modes.def. In riscv-modes.def, we define each RVV
> modes are element align which is aligned to RVV ISA spec.
> 
> If you want to support other alignment, you should add tunning info
> for this in the future. And the default behavior in case of alignment
> which is already in this patch should not be changed in the future.
 
We're changing the preferred_vector_alignment hook, not the
vector_alignment hook.  The way I see it, we will never peel for alignment
when setting the preferred_alignment to the alignment of a single
element, no matter the cost of misaligned loads/stores.  Is the idea
then to adjust the preferred_alignment hook depending on mtune? 
 
IMHO it's much more "natural" to set the vectorization costs according to
mtune (done anyway at some point), not change the preferred_alignment and
let the vectorizer decide whether is has enough information on whether
to peel or not.
 
Maybe Richard can comment here on why the vector_alignment_preferred hook
was originally introduced or what wasn't possible without it (i.e. why not
go via costs).  At some point the vectorizer would peel even if the costs
were low for misaligned accesses but that had already been fixed before the
hook was introduced.
 
And ABI-wise (and that's what the V spec documents) we are already safe
by using MIN (128, TYPE_SIZE (type)).  We could probably even lower that?
 
Regards
Robin
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..a5776a550b2 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7499,6 +7499,24 @@  riscv_preferred_simd_mode (scalar_mode mode)
   return word_mode;
 }
 
+/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
+
+static poly_uint64
+riscv_vectorize_preferred_vector_alignment (const_tree type)
+{
+  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
+    {
+      /* If the length of the vector is a fixed power of 2, try to align
+	 to that length, otherwise don't try to align at all.  */
+      HOST_WIDE_INT result;
+      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
+	  || !pow2p_hwi (result))
+	result = TYPE_ALIGN (TREE_TYPE (type));
+      return result;
+    }
+  return TYPE_ALIGN (type);
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7771,6 +7789,10 @@  riscv_preferred_simd_mode (scalar_mode mode)
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
 
+#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
+#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
+  riscv_vectorize_preferred_vector_alignment
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
new file mode 100644
index 00000000000..14201e1f7e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
index da0f79a1cf0..d98100b3276 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
@@ -4,8 +4,10 @@ 
 #include "shift-template.h"
 
 /* TODO: For int16_t and uint16_t we need widening/promotion patterns.
-   Therefore, expect only 4 vsll.vv instead of 6 for now.  */
+   We don't check the assembler number since lacking patterns make
+   auto-vectorization inconsistent in LMUL = 1/2/4/8.  */
+
+/* { dg-final { scan-assembler {\tvsll\.vv} } } */
+/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
+/* { dg-final { scan-assembler {\tvsra\.vv} } } */
 
-/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
-/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
-/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */