[V2] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of both VLA && VLS auto-vectorization.

Message ID 20230515031549.242051-1-juzhe.zhong@rivai.ai
State Accepted
Series [V2] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of both VLA && VLS auto-vectorization.

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai May 15, 2023, 3:15 a.m. UTC
  From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch optimizes the codegen of both RVV VLA and VLS auto-vectorization.

Consider the following case:
void __attribute__((noinline, noclone))
f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

VLA:
Before this patch:
	ble	a3,zero,.L1
	srli	a4,a1,2
	negw	a4,a4
	andi	a5,a4,3
	sext.w	a3,a3
	beq	a5,zero,.L3
	lw	a7,0(a1)
	lw	a6,0(a2)
	andi	a4,a4,2
	addw	a6,a6,a7
	sw	a6,0(a0)
	beq	a4,zero,.L3
	lw	a7,4(a1)
	lw	a4,4(a2)
	li	a6,3
	addw	a4,a4,a7
	sw	a4,4(a0)
	bne	a5,a6,.L3
	lw	a6,8(a2)
	lw	a4,8(a1)
	addw	a4,a4,a6
	sw	a4,8(a0)
.L3:
	subw	a3,a3,a5
	slli	a4,a3,32
	csrr	a6,vlenb
	srli	a4,a4,32
	srli	a6,a6,2
	slli	a3,a5,2
	mv	a5,a4
	bgtu	a4,a6,.L17
.L5:
	csrr	a6,vlenb
	add	a1,a1,a3
	add	a2,a2,a3
	add	a0,a0,a3
	srli	a7,a6,2
	li	a3,0
.L8:
	vsetvli	zero,a5,e32,m1,ta,ma
	vle32.v	v1,0(a1)
	vle32.v	v2,0(a2)
	vsetvli	t1,zero,e32,m1,ta,ma
	add	a3,a3,a7
	vadd.vv	v1,v1,v2
	vsetvli	zero,a5,e32,m1,ta,ma
	vse32.v	v1,0(a0)
	mv	a5,a4
	bleu	a4,a3,.L6
	mv	a5,a3
.L6:
	sub	a5,a4,a5
	bleu	a5,a7,.L7
	mv	a5,a7
.L7:
	add	a1,a1,a6
	add	a2,a2,a6
	add	a0,a0,a6
	bne	a5,zero,.L8
.L1:
	ret
.L17:
	mv	a5,a6
	j	.L5

After this patch:
f:
        ble     a3,zero,.L1
        csrr    a4,vlenb
        srli    a4,a4,2
        mv      a5,a3
        bgtu    a3,a4,.L9
.L3:
        csrr    a6,vlenb
        li      a4,0
        srli    a7,a6,2
.L6:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli t1,zero,e32,m1,ta,ma
        add     a4,a4,a7
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a0)
        mv      a5,a3
        bleu    a3,a4,.L4
        mv      a5,a4
.L4:
        sub     a5,a3,a5
        bleu    a5,a7,.L5
        mv      a5,a7
.L5:
        add     a0,a0,a6
        add     a2,a2,a6
        add     a1,a1,a6
        bne     a5,zero,.L6
.L1:
        ret
.L9:
        mv      a5,a4
        j       .L3

VLS:
Before this patch:
f3:
	ble	a3,zero,.L1
	srli	a5,a1,2
	negw	a5,a5
	andi	a4,a5,3
	sext.w	a3,a3
	beq	a4,zero,.L3
	lw	a7,0(a1)
	lw	a6,0(a2)
	andi	a5,a5,2
	addw	a6,a6,a7
	sw	a6,0(a0)
	beq	a5,zero,.L3
	lw	a7,4(a1)
	lw	a5,4(a2)
	li	a6,3
	addw	a5,a5,a7
	sw	a5,4(a0)
	bne	a4,a6,.L3
	lw	a6,8(a2)
	lw	a5,8(a1)
	addw	a5,a5,a6
	sw	a5,8(a0)
.L3:
	subw	a3,a3,a4
	slli	a6,a4,2
	slli	a5,a3,32
	srli	a5,a5,32
	add	a1,a1,a6
	add	a2,a2,a6
	add	a0,a0,a6
	li	a3,4
.L6:
	mv	a4,a5
	bleu	a5,a3,.L5
	li	a4,4
.L5:
	vsetvli	zero,a4,e32,m1,ta,ma
	vle32.v	v1,0(a1)
	vle32.v	v2,0(a2)
	vsetivli	zero,4,e32,m1,ta,ma
	sub	a5,a5,a4
	vadd.vv	v1,v1,v2
	vsetvli	zero,a4,e32,m1,ta,ma
	vse32.v	v1,0(a0)
	addi	a1,a1,16
	addi	a2,a2,16
	addi	a0,a0,16
	bne	a5,zero,.L6
.L1:
	ret

After this patch:
f3:
	ble	a3,zero,.L1
	li	a4,4
.L4:
	mv	a5,a3
	bleu	a3,a4,.L3
	li	a5,4
.L3:
	vsetvli	zero,a5,e32,m1,ta,ma
	vle32.v	v2,0(a1)
	vle32.v	v1,0(a2)
	vsetivli	zero,4,e32,m1,ta,ma
	sub	a3,a3,a5
	vadd.vv	v1,v1,v2
	vsetvli	zero,a5,e32,m1,ta,ma
	vse32.v	v1,0(a0)
	addi	a2,a2,16
	addi	a0,a0,16
	addi	a1,a1,16
	bne	a3,zero,.L4
.L1:
	ret
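
The difference comes from the new TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
hook added by this patch (the full diff is at the bottom of this page).  A
short annotated copy of the hook follows; the comments are editorial notes on
why the scalar prologue disappears and are not part of the committed code:

/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */

static poly_uint64
riscv_vectorize_preferred_vector_alignment (const_tree type)
{
  /* For RVV vector modes, tell the vectorizer that vector accesses only
     need the alignment of the element type (32 bits for the int elements
     in the example above) rather than the alignment of the whole vector.
     The int * arguments already satisfy element alignment, so no peeling
     for alignment is required, which removes the scalar lw/sw prologue
     seen in the "before" code.  */
  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
    return TYPE_ALIGN (TREE_TYPE (type));

  /* For non-RVV vector types, keep the type's own alignment.  */
  return TYPE_ALIGN (type);
}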

gcc/ChangeLog:

        * config/riscv/riscv.cc (riscv_vectorize_preferred_vector_alignment): New function.
        (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
        * gcc.target/riscv/rvv/autovec/align-1.c: New test.
        * gcc.target/riscv/rvv/autovec/align-2.c: New test.

---
 gcc/config/riscv/riscv.cc                          | 14 ++++++++++++++
 .../gcc.target/riscv/rvv/autovec/align-1.c         | 12 ++++++++++++
 .../gcc.target/riscv/rvv/autovec/align-2.c         | 12 ++++++++++++
 .../riscv/rvv/autovec/binop/shift-rv32gcv.c        | 10 ++++++----
 4 files changed, 44 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
  

Comments

Kito Cheng May 15, 2023, 5:01 a.m. UTC | #1
LGTM

juzhe.zhong@rivai.ai May 15, 2023, 5:11 a.m. UTC | #2
Thanks. Could you take a look at this patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618398.html
It has already gone through five rounds of revision, and I have fixed it as you suggested.



juzhe.zhong@rivai.ai
 
Li, Pan2 via Gcc-patches May 15, 2023, 6:18 a.m. UTC | #3
Committed, thanks Kito.

Pan


Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..a770fdfaa0e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7499,6 +7499,16 @@  riscv_preferred_simd_mode (scalar_mode mode)
   return word_mode;
 }
 
+/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
+
+static poly_uint64
+riscv_vectorize_preferred_vector_alignment (const_tree type)
+{
+  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
+    return TYPE_ALIGN (TREE_TYPE (type));
+  return TYPE_ALIGN (type);
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7771,6 +7781,10 @@  riscv_preferred_simd_mode (scalar_mode mode)
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
 
+#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
+#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
+  riscv_vectorize_preferred_vector_alignment
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
new file mode 100644
index 00000000000..14201e1f7e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
new file mode 100644
index 00000000000..812584e9d25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=fixed-vlmax" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
index da0f79a1cf0..d98100b3276 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
@@ -4,8 +4,10 @@ 
 #include "shift-template.h"
 
 /* TODO: For int16_t and uint16_t we need widening/promotion patterns.
-   Therefore, expect only 4 vsll.vv instead of 6 for now.  */
+   We don't check the assembler number since lacking patterns make
+   auto-vectorization inconsistent in LMUL = 1/2/4/8.  */
+
+/* { dg-final { scan-assembler {\tvsll\.vv} } } */
+/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
+/* { dg-final { scan-assembler {\tvsra\.vv} } } */
 
-/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
-/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
-/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */