[V2] RISC-V: Fix PR109615

Message ID 20230505063344.1085156-1-juzhe.zhong@rivai.ai
State Accepted
Headers
Series [V2] RISC-V: Fix PR109615 |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai May 5, 2023, 6:33 a.m. UTC
  From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch is to fix following case:
void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
{
  size_t vl = 101;
  if (cond)
    vl = m * 2;
  else
    vl = m * 2 * vl;

  for (size_t i = 0; i < n; i++)
    {
      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
      __riscv_vse8_v_i8mf8 (out + i, v, vl);

      vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);

      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
      __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
    }

  for (size_t i = 0; i < n; i++)
    {
      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
      __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
    }
}

The value of "vl" is coming from different blocks so it will be wrapped as a PHI node of each
block.

In the first loop, the "vl" source is a PHI node from bb 4.
In the second loop, the "vl" source is a PHI node from bb 5.
since bb 5 is dominated by bb 4, the PHI input of "vl" in the second loop is the PHI node of "vl" 
in bb 4.
So when 2 "vl" PHI node are both degenerate PHI node (the phi->num_inputs () == 1) and their only
input are same, it's safe for us to consider they are compatible.

This patch is only optimize degenerate PHI since it's safe and simple optimization.

non-dengerate PHI are considered as incompatible unless the PHI are the same in RTL_SSA.
TODO: non-generate PHI is complicated, we can support it when it is necessary in the future.

Before this patch:

...
.L2:
        addi    a4,a1,100
        add     t1,a0,a2
        mv      t0,a0
        beq     a2,zero,.L1
        vsetvli zero,a3,e8,mf8,tu,mu
.L4:
        addi    a6,t0,100
        addi    a7,a4,-100
        vle8.v  v1,0(t0)
        addi    t0,t0,1
        vse8.v  v1,0(a7)
        vlm.v   v0,0(a6)
        vle8.v  v1,0(a6),v0.t
        vse8.v  v1,0(a4)
        addi    a4,a4,1
        bne     t0,t1,.L4
        addi    a0,a0,300
        addi    a1,a1,300
        add     a2,a0,a2
        vsetvli zero,a3,e8,mf8,ta,ma
.L5:
        vle8.v  v2,0(a0)
        addi    a0,a0,1
        vse8.v  v2,0(a1)
        addi    a1,a1,1
        bne     a2,a0,.L5
.L1:
        ret

After this patch:

...
.L2:
        addi    a4,a1,100
        add     t1,a0,a2
        mv      t0,a0
        beq     a2,zero,.L1
        vsetvli zero,a3,e8,mf8,tu,mu
.L4:
        addi    a6,t0,100
        addi    a7,a4,-100
        vle8.v  v1,0(t0)
        addi    t0,t0,1
        vse8.v  v1,0(a7)
        vlm.v   v0,0(a6)
        vle8.v  v1,0(a6),v0.t
        vse8.v  v1,0(a4)
        addi    a4,a4,1
        bne     t0,t1,.L4
        addi    a0,a0,300
        addi    a1,a1,300
        add     a2,a0,a2
.L5:
        vle8.v  v2,0(a0)
        addi    a0,a0,1
        vse8.v  v2,0(a1)
        addi    a1,a1,1
        bne     a2,a0,.L5
.L1:
        ret

        PR target/109615

gcc/ChangeLog:

        * config/riscv/riscv-vsetvl.cc (avl_info::multiple_source_equal_p): Add denegrate PHI optmization.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/vsetvl/avl_single-74.c: Adapt testcase.
        * gcc.target/riscv/rvv/vsetvl/vsetvl-11.c: Ditto.
        * gcc.target/riscv/rvv/vsetvl/pr109615.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc              | 81 +++++--------------
 .../riscv/rvv/vsetvl/avl_single-74.c          |  4 +-
 .../gcc.target/riscv/rvv/vsetvl/pr109615.c    | 33 ++++++++
 .../gcc.target/riscv/rvv/vsetvl/vsetvl-11.c   |  2 +-
 4 files changed, 54 insertions(+), 66 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
  

Comments

Kito Cheng May 5, 2023, 1:18 p.m. UTC | #1
LGTM, committed to trunk with few changelog adjustments and few extra comments.


On Fri, May 5, 2023 at 2:33 PM <juzhe.zhong@rivai.ai> wrote:
>
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch is to fix following case:
> void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
> {
>   size_t vl = 101;
>   if (cond)
>     vl = m * 2;
>   else
>     vl = m * 2 * vl;
>
>   for (size_t i = 0; i < n; i++)
>     {
>       vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
>       __riscv_vse8_v_i8mf8 (out + i, v, vl);
>
>       vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
>
>       vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
>       __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
>     }
>
>   for (size_t i = 0; i < n; i++)
>     {
>       vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
>       __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
>     }
> }
>
> The value of "vl" is coming from different blocks so it will be wrapped as a PHI node of each
> block.
>
> In the first loop, the "vl" source is a PHI node from bb 4.
> In the second loop, the "vl" source is a PHI node from bb 5.
> since bb 5 is dominated by bb 4, the PHI input of "vl" in the second loop is the PHI node of "vl"
> in bb 4.
> So when 2 "vl" PHI node are both degenerate PHI node (the phi->num_inputs () == 1) and their only
> input are same, it's safe for us to consider they are compatible.
>
> This patch is only optimize degenerate PHI since it's safe and simple optimization.
>
> non-dengerate PHI are considered as incompatible unless the PHI are the same in RTL_SSA.
> TODO: non-generate PHI is complicated, we can support it when it is necessary in the future.
>
> Before this patch:
>
> ...
> .L2:
>         addi    a4,a1,100
>         add     t1,a0,a2
>         mv      t0,a0
>         beq     a2,zero,.L1
>         vsetvli zero,a3,e8,mf8,tu,mu
> .L4:
>         addi    a6,t0,100
>         addi    a7,a4,-100
>         vle8.v  v1,0(t0)
>         addi    t0,t0,1
>         vse8.v  v1,0(a7)
>         vlm.v   v0,0(a6)
>         vle8.v  v1,0(a6),v0.t
>         vse8.v  v1,0(a4)
>         addi    a4,a4,1
>         bne     t0,t1,.L4
>         addi    a0,a0,300
>         addi    a1,a1,300
>         add     a2,a0,a2
>         vsetvli zero,a3,e8,mf8,ta,ma
> .L5:
>         vle8.v  v2,0(a0)
>         addi    a0,a0,1
>         vse8.v  v2,0(a1)
>         addi    a1,a1,1
>         bne     a2,a0,.L5
> .L1:
>         ret
>
> After this patch:
>
> ...
> .L2:
>         addi    a4,a1,100
>         add     t1,a0,a2
>         mv      t0,a0
>         beq     a2,zero,.L1
>         vsetvli zero,a3,e8,mf8,tu,mu
> .L4:
>         addi    a6,t0,100
>         addi    a7,a4,-100
>         vle8.v  v1,0(t0)
>         addi    t0,t0,1
>         vse8.v  v1,0(a7)
>         vlm.v   v0,0(a6)
>         vle8.v  v1,0(a6),v0.t
>         vse8.v  v1,0(a4)
>         addi    a4,a4,1
>         bne     t0,t1,.L4
>         addi    a0,a0,300
>         addi    a1,a1,300
>         add     a2,a0,a2
> .L5:
>         vle8.v  v2,0(a0)
>         addi    a0,a0,1
>         vse8.v  v2,0(a1)
>         addi    a1,a1,1
>         bne     a2,a0,.L5
> .L1:
>         ret
>
>         PR target/109615
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vsetvl.cc (avl_info::multiple_source_equal_p): Add denegrate PHI optmization.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/vsetvl/avl_single-74.c: Adapt testcase.
>         * gcc.target/riscv/rvv/vsetvl/vsetvl-11.c: Ditto.
>         * gcc.target/riscv/rvv/vsetvl/pr109615.c: New test.
>
> ---
>  gcc/config/riscv/riscv-vsetvl.cc              | 81 +++++--------------
>  .../riscv/rvv/vsetvl/avl_single-74.c          |  4 +-
>  .../gcc.target/riscv/rvv/vsetvl/pr109615.c    | 33 ++++++++
>  .../gcc.target/riscv/rvv/vsetvl/vsetvl-11.c   |  2 +-
>  4 files changed, 54 insertions(+), 66 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
> index 609f86d8704..39b4d21210b 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -1676,72 +1676,27 @@ avl_info::single_source_equal_p (const avl_info &other) const
>  bool
>  avl_info::multiple_source_equal_p (const avl_info &other) const
>  {
> -  /* TODO: We don't do too much optimization here since it's
> -     too complicated in case of analyzing the PHI node.
> +  /* When the def info is same in RTL_SSA namespace, it's safe
> +     to consider they are avl compatible.  */
> +  if (m_source == other.get_source ())
> +    return true;
>
> -     For example:
> -       void f (void * restrict in, void * restrict out, int n, int m, int cond)
> -       {
> -         size_t vl;
> -         switch (cond)
> -         {
> -         case 1:
> -           vl = 100;
> -           break;
> -         case 2:
> -           vl = *(size_t*)(in + 100);
> -           break;
> -         case 3:
> -           {
> -             size_t new_vl = *(size_t*)(in + 500);
> -             size_t new_vl2 = *(size_t*)(in + 600);
> -             vl = new_vl + new_vl2 + 777;
> -             break;
> -           }
> -         default:
> -           vl = 4000;
> -           break;
> -         }
> -         for (size_t i = 0; i < n; i++)
> -           {
> -             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> -             __riscv_vse8_v_i8mf8 (out + i, v, vl);
> +  /* We only consider handle PHI node.  */
> +  if (!m_source->insn ()->is_phi () || !other.get_source ()->insn ()->is_phi ())
> +    return false;
>
> -             vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100, vl);
> -             __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> -           }
> +  phi_info *phi1 = as_a<phi_info *> (m_source);
> +  phi_info *phi2 = as_a<phi_info *> (other.get_source ());
>
> -         size_t vl2;
> -         switch (cond)
> -         {
> -         case 1:
> -           vl2 = 100;
> -           break;
> -         case 2:
> -           vl2 = *(size_t*)(in + 100);
> -           break;
> -         case 3:
> -           {
> -             size_t new_vl = *(size_t*)(in + 500);
> -             size_t new_vl2 = *(size_t*)(in + 600);
> -             vl2 = new_vl + new_vl2 + 777;
> -             break;
> -           }
> -         default:
> -           vl2 = 4000;
> -           break;
> -         }
> -         for (size_t i = 0; i < m; i++)
> -           {
> -             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
> -             __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
> -             vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200, vl2);
> -             __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
> -           }
> -       }
> -     Such case may not be necessary to optimize since the codes of defining
> -     vl and vl2 are redundant.  */
> -  return m_source == other.get_source ();
> +  if (phi1->is_degenerate () && phi2->is_degenerate ())
> +    {
> +      /* Case 1: If both PHI nodes have the same single input in use list.
> +        We consider they are AVL compatible.  */
> +      if (phi1->input_value (0) == phi2->input_value (0))
> +       return true;
> +    }
> +  /* TODO: We can support more optimization cases in the future.  */
> +  return false;
>  }
>
>  avl_info &
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
> index ff540ec792d..cc4f88be888 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
> @@ -23,5 +23,5 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond, size_t vl,
>      }
>  }
>
> -/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> -/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
> new file mode 100644
> index 00000000000..90b0bb79937
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
> +
> +#include "riscv_vector.h"
> +
> +void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
> +{
> +  size_t vl = 101;
> +  if (cond)
> +    vl = m * 2;
> +  else
> +    vl = m * 2 * vl;
> +
> +  for (size_t i = 0; i < n; i++)
> +    {
> +      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> +      __riscv_vse8_v_i8mf8 (out + i, v, vl);
> +
> +      vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
> +
> +      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
> +      __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> +    }
> +
> +  for (size_t i = 0; i < n; i++)
> +    {
> +      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
> +      __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*mu} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
> index fa825f031f9..3ef0fdcb66d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
> @@ -18,4 +18,4 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
>    }
>  }
>
> -/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
> --
> 2.36.3
>
  

Patch

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 609f86d8704..39b4d21210b 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -1676,72 +1676,27 @@  avl_info::single_source_equal_p (const avl_info &other) const
 bool
 avl_info::multiple_source_equal_p (const avl_info &other) const
 {
-  /* TODO: We don't do too much optimization here since it's
-     too complicated in case of analyzing the PHI node.
+  /* When the def info is same in RTL_SSA namespace, it's safe
+     to consider they are avl compatible.  */
+  if (m_source == other.get_source ())
+    return true;
 
-     For example:
-       void f (void * restrict in, void * restrict out, int n, int m, int cond)
-	{
-	  size_t vl;
-	  switch (cond)
-	  {
-	  case 1:
-	    vl = 100;
-	    break;
-	  case 2:
-	    vl = *(size_t*)(in + 100);
-	    break;
-	  case 3:
-	    {
-	      size_t new_vl = *(size_t*)(in + 500);
-	      size_t new_vl2 = *(size_t*)(in + 600);
-	      vl = new_vl + new_vl2 + 777;
-	      break;
-	    }
-	  default:
-	    vl = 4000;
-	    break;
-	  }
-	  for (size_t i = 0; i < n; i++)
-	    {
-	      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
-	      __riscv_vse8_v_i8mf8 (out + i, v, vl);
+  /* We only consider handle PHI node.  */
+  if (!m_source->insn ()->is_phi () || !other.get_source ()->insn ()->is_phi ())
+    return false;
 
-	      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100, vl);
-	      __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
-	    }
+  phi_info *phi1 = as_a<phi_info *> (m_source);
+  phi_info *phi2 = as_a<phi_info *> (other.get_source ());
 
-	  size_t vl2;
-	  switch (cond)
-	  {
-	  case 1:
-	    vl2 = 100;
-	    break;
-	  case 2:
-	    vl2 = *(size_t*)(in + 100);
-	    break;
-	  case 3:
-	    {
-	      size_t new_vl = *(size_t*)(in + 500);
-	      size_t new_vl2 = *(size_t*)(in + 600);
-	      vl2 = new_vl + new_vl2 + 777;
-	      break;
-	    }
-	  default:
-	    vl2 = 4000;
-	    break;
-	  }
-	  for (size_t i = 0; i < m; i++)
-	    {
-	      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
-	      __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
-	      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200, vl2);
-	      __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
-	    }
-	}
-     Such case may not be necessary to optimize since the codes of defining
-     vl and vl2 are redundant.  */
-  return m_source == other.get_source ();
+  if (phi1->is_degenerate () && phi2->is_degenerate ())
+    {
+      /* Case 1: If both PHI nodes have the same single input in use list.
+	 We consider they are AVL compatible.  */
+      if (phi1->input_value (0) == phi2->input_value (0))
+	return true;
+    }
+  /* TODO: We can support more optimization cases in the future.  */
+  return false;
 }
 
 avl_info &
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
index ff540ec792d..cc4f88be888 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
@@ -23,5 +23,5 @@  void f (int8_t * restrict in, int8_t * restrict out, int n, int cond, size_t vl,
     }
 }
 
-/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
new file mode 100644
index 00000000000..90b0bb79937
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
+{
+  size_t vl = 101;
+  if (cond)
+    vl = m * 2;
+  else
+    vl = m * 2 * vl;
+
+  for (size_t i = 0; i < n; i++)
+    {
+      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
+      __riscv_vse8_v_i8mf8 (out + i, v, vl);
+
+      vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
+
+      vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
+      __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
+    }
+
+  for (size_t i = 0; i < n; i++)
+    {
+      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
+      __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
+    }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*mu} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
index fa825f031f9..3ef0fdcb66d 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
@@ -18,4 +18,4 @@  void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
   }
 }
 
-/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */