[V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE

Message ID 20230627064737.16257-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series [V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai June 27, 2023, 6:47 a.m. UTC
  From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richi.

I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.

This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
I have appended testcase in this patch to test VN for LEN_MASK_STORE.

I am not sure whether I am on the same page with you.

Feel free to correct me, Thanks.

gcc/ChangeLog:

        * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.

---
 .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
 gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
 2 files changed, 49 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
  

Comments

Richard Biener June 27, 2023, 7:33 a.m. UTC | #1
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richi.
> 
> I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> 
> This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> 
> I am not sure whether I am on the same page with you.
> 
> Feel free to correct me, Thanks.
> 
> gcc/ChangeLog:
> 
>         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> 
> gcc/testsuite/ChangeLog:
> 
>         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> 
> ---
>  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
>  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
>  2 files changed, 49 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> 
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> new file mode 100644
> index 00000000000..0b2d03693dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> +
> +void __attribute__((noinline,noclone))
> +foo (int *out, int *res)
> +{
> +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> +  int i;
> +  for (i = 0; i < 16; ++i)
> +    {
> +      if (mask[i])
> +        out[i] = i;
> +    }
> +  int o0 = out[0];
> +  int o7 = out[7];
> +  int o14 = out[14];
> +  int o15 = out[15];
> +  res[0] = o0;
> +  res[2] = o7;
> +  res[4] = o14;
> +  res[6] = o15;
> +}
> +
> +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> +   vector iterations.  FRE5 after that should be able to CSE
> +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> index 11061a374a2..242d82d6274 100644
> --- a/gcc/tree-ssa-sccvn.cc
> +++ b/gcc/tree-ssa-sccvn.cc
> @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  	  if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
>  	    return (void *)-1;
>  	  break;
> +	case IFN_LEN_MASK_STORE:
> +	  len = gimple_call_arg (call, 2);
> +	  bias = gimple_call_arg (call, 5);
> +	  if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> +	    return (void *)-1;
> +	  mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> +	  mask = vn_valueize (mask);
> +	  if (TREE_CODE (mask) != VECTOR_CST)
> +	    return (void *)-1;
> +	  break;
>  	default:
>  	  return (void *)-1;
>  	}
> @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  	      tree vectype = TREE_TYPE (def_rhs);
>  	      unsigned HOST_WIDE_INT elsz
>  		= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> +	      /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> +		 is always true for MASK_STORE.  */
> +	      unsigned actual_len = UINT_MAX;
> +	      if (len)
> +		actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> +	      unsigned nunits
> +		= MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);

No, that's not correct and what I meant.  There's
vector_cst_encoded_nelts (mask), but for example for an
all-ones mask that would be 1.  You'd then also not access
VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
is how to recover the implicit walk over all elements for the
purpose of computing start/length and how generally this will
work for variable-length vectors where enumerating "all"
elements touched is required for correctness.

The most simple thing would be to make this all conditional
to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
asking for VL vector testcases.

Richard.

>  	      if (mask)
>  		{
>  		  HOST_WIDE_INT start = 0, length = 0;
> -		  unsigned mask_idx = 0;
> -		  do
> +		  for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
>  		    {
>  		      if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
>  			{
> @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  			}
>  		      else
>  			length += elsz;
> -		      mask_idx++;
>  		    }
> -		  while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
>  		  if (length != 0)
>  		    {
>  		      pd.rhs_off = start;
> @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  		{
>  		  pd.offset = offset2i;
>  		  pd.size = (tree_to_uhwi (len)
> -			     + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> +			     + tree_to_shwi (bias)) * BITS_PER_UNIT;
>  		  if (BYTES_BIG_ENDIAN)
>  		    pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
>  		  else
>
  
juzhe.zhong@rivai.ai June 27, 2023, 7:41 a.m. UTC | #2
Hi, Richi.

When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
then I failed to walk all elements analysis.

>> The most simple thing would be to make this all conditional
>> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
>> asking for VL vector testcases.
Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
in C code.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-27 15:33
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford; pan2.li
Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richi.
> 
> I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> 
> This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> 
> I am not sure whether I am on the same page with you.
> 
> Feel free to correct me, Thanks.
> 
> gcc/ChangeLog:
> 
>         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> 
> gcc/testsuite/ChangeLog:
> 
>         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> 
> ---
>  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
>  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
>  2 files changed, 49 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> 
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> new file mode 100644
> index 00000000000..0b2d03693dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> +
> +void __attribute__((noinline,noclone))
> +foo (int *out, int *res)
> +{
> +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> +  int i;
> +  for (i = 0; i < 16; ++i)
> +    {
> +      if (mask[i])
> +        out[i] = i;
> +    }
> +  int o0 = out[0];
> +  int o7 = out[7];
> +  int o14 = out[14];
> +  int o15 = out[15];
> +  res[0] = o0;
> +  res[2] = o7;
> +  res[4] = o14;
> +  res[6] = o15;
> +}
> +
> +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> +   vector iterations.  FRE5 after that should be able to CSE
> +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> index 11061a374a2..242d82d6274 100644
> --- a/gcc/tree-ssa-sccvn.cc
> +++ b/gcc/tree-ssa-sccvn.cc
> @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
>      return (void *)-1;
>    break;
> + case IFN_LEN_MASK_STORE:
> +   len = gimple_call_arg (call, 2);
> +   bias = gimple_call_arg (call, 5);
> +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> +     return (void *)-1;
> +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> +   mask = vn_valueize (mask);
> +   if (TREE_CODE (mask) != VECTOR_CST)
> +     return (void *)-1;
> +   break;
>  default:
>    return (void *)-1;
>  }
> @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>        tree vectype = TREE_TYPE (def_rhs);
>        unsigned HOST_WIDE_INT elsz
>  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> + is always true for MASK_STORE.  */
> +       unsigned actual_len = UINT_MAX;
> +       if (len)
> + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> +       unsigned nunits
> + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
 
No, that's not correct and what I meant.  There's
vector_cst_encoded_nelts (mask), but for example for an
all-ones mask that would be 1.  You'd then also not access
VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
is how to recover the implicit walk over all elements for the
purpose of computing start/length and how generally this will
work for variable-length vectors where enumerating "all"
elements touched is required for correctness.
 
The most simple thing would be to make this all conditional
to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
asking for VL vector testcases.
 
Richard.
 
>        if (mask)
>  {
>    HOST_WIDE_INT start = 0, length = 0;
> -   unsigned mask_idx = 0;
> -   do
> +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
>      {
>        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
>  {
> @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  }
>        else
>  length += elsz;
> -       mask_idx++;
>      }
> -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
>    if (length != 0)
>      {
>        pd.rhs_off = start;
> @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  {
>    pd.offset = offset2i;
>    pd.size = (tree_to_uhwi (len)
> -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
>    if (BYTES_BIG_ENDIAN)
>      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
>    else
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  
Richard Biener June 27, 2023, 7:47 a.m. UTC | #3
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> 
> When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> then I failed to walk all elements analysis.
> 
> >> The most simple thing would be to make this all conditional
> >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> >> asking for VL vector testcases.
> Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> in C code.

The original motivation was from unrolled vector code for the
conditional masking case.  Does RVV allow MASK_STORE from
intrinsics?

Otherwise how about

 for (i = 0; i < 8; ++i)
   a[i] = 42;
 for (i = 2; i < 6; ++i)
   b[i] = a[i];

with disabling complete unrolling -fdisable-tree-cunrolli both
loops should get vectorized, hopefully not iterating(?)  Then
we end up with the overlapping CSE opportunity, no?  Maybe
it needs support for fixed size vectors and using VL vectors
just for the epilog.

Richard.


> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 15:33
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, Richi.
> > 
> > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > 
> > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > 
> > I am not sure whether I am on the same page with you.
> > 
> > Feel free to correct me, Thanks.
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > 
> > ---
> >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> >  2 files changed, 49 insertions(+), 5 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > 
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > new file mode 100644
> > index 00000000000..0b2d03693dc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > +
> > +void __attribute__((noinline,noclone))
> > +foo (int *out, int *res)
> > +{
> > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > +  int i;
> > +  for (i = 0; i < 16; ++i)
> > +    {
> > +      if (mask[i])
> > +        out[i] = i;
> > +    }
> > +  int o0 = out[0];
> > +  int o7 = out[7];
> > +  int o14 = out[14];
> > +  int o15 = out[15];
> > +  res[0] = o0;
> > +  res[2] = o7;
> > +  res[4] = o14;
> > +  res[6] = o15;
> > +}
> > +
> > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > +   vector iterations.  FRE5 after that should be able to CSE
> > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > index 11061a374a2..242d82d6274 100644
> > --- a/gcc/tree-ssa-sccvn.cc
> > +++ b/gcc/tree-ssa-sccvn.cc
> > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> >      return (void *)-1;
> >    break;
> > + case IFN_LEN_MASK_STORE:
> > +   len = gimple_call_arg (call, 2);
> > +   bias = gimple_call_arg (call, 5);
> > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > +     return (void *)-1;
> > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > +   mask = vn_valueize (mask);
> > +   if (TREE_CODE (mask) != VECTOR_CST)
> > +     return (void *)-1;
> > +   break;
> >  default:
> >    return (void *)-1;
> >  }
> > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >        tree vectype = TREE_TYPE (def_rhs);
> >        unsigned HOST_WIDE_INT elsz
> >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > + is always true for MASK_STORE.  */
> > +       unsigned actual_len = UINT_MAX;
> > +       if (len)
> > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > +       unsigned nunits
> > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
>  
> No, that's not correct and what I meant.  There's
> vector_cst_encoded_nelts (mask), but for example for an
> all-ones mask that would be 1.  You'd then also not access
> VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> is how to recover the implicit walk over all elements for the
> purpose of computing start/length and how generally this will
> work for variable-length vectors where enumerating "all"
> elements touched is required for correctness.
>  
> The most simple thing would be to make this all conditional
> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> asking for VL vector testcases.
>  
> Richard.
>  
> >        if (mask)
> >  {
> >    HOST_WIDE_INT start = 0, length = 0;
> > -   unsigned mask_idx = 0;
> > -   do
> > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> >      {
> >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> >  {
> > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >  }
> >        else
> >  length += elsz;
> > -       mask_idx++;
> >      }
> > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> >    if (length != 0)
> >      {
> >        pd.rhs_off = start;
> > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >  {
> >    pd.offset = offset2i;
> >    pd.size = (tree_to_uhwi (len)
> > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> >    if (BYTES_BIG_ENDIAN)
> >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> >    else
> > 
>  
>
  
juzhe.zhong@rivai.ai June 27, 2023, 8:01 a.m. UTC | #4
Hi, Richi.

>> Does RVV allow MASK_STORE from
>> intrinsics?
No, RVV didn't use any internal_fn in intrinsics.

>>with disabling complete unrolling -fdisable-tree-cunrolli both
>>loops should get vectorized, hopefully not iterating(?)  Then
>>we end up with the overlapping CSE opportunity, no?  Maybe
>>it needs support for fixed size vectors and using VL vectors
>>just for the epilog.

I tried on ARM:
https://godbolt.org/z/cTET8f7W9 

It seems that it can't reproduce CSE opportunity.
I am not sure whether I am doing something wrong.
Besides, RVV currently can't have VLS modes (V4SI) since LTO issue.
I am trying to find a CSE opportunity on VL vectors.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-27 15:47
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford; pan2.li
Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.
> 
> When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> then I failed to walk all elements analysis.
> 
> >> The most simple thing would be to make this all conditional
> >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> >> asking for VL vector testcases.
> Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> in C code.
 
The original motivation was from unrolled vector code for the
conditional masking case.  Does RVV allow MASK_STORE from
intrinsics?
 
Otherwise how about
 
for (i = 0; i < 8; ++i)
   a[i] = 42;
for (i = 2; i < 6; ++i)
   b[i] = a[i];
 
with disabling complete unrolling -fdisable-tree-cunrolli both
loops should get vectorized, hopefully not iterating(?)  Then
we end up with the overlapping CSE opportunity, no?  Maybe
it needs support for fixed size vectors and using VL vectors
just for the epilog.
 
Richard.
 
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 15:33
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, Richi.
> > 
> > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > 
> > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > 
> > I am not sure whether I am on the same page with you.
> > 
> > Feel free to correct me, Thanks.
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > 
> > ---
> >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> >  2 files changed, 49 insertions(+), 5 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > 
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > new file mode 100644
> > index 00000000000..0b2d03693dc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > +
> > +void __attribute__((noinline,noclone))
> > +foo (int *out, int *res)
> > +{
> > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > +  int i;
> > +  for (i = 0; i < 16; ++i)
> > +    {
> > +      if (mask[i])
> > +        out[i] = i;
> > +    }
> > +  int o0 = out[0];
> > +  int o7 = out[7];
> > +  int o14 = out[14];
> > +  int o15 = out[15];
> > +  res[0] = o0;
> > +  res[2] = o7;
> > +  res[4] = o14;
> > +  res[6] = o15;
> > +}
> > +
> > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > +   vector iterations.  FRE5 after that should be able to CSE
> > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > index 11061a374a2..242d82d6274 100644
> > --- a/gcc/tree-ssa-sccvn.cc
> > +++ b/gcc/tree-ssa-sccvn.cc
> > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> >      return (void *)-1;
> >    break;
> > + case IFN_LEN_MASK_STORE:
> > +   len = gimple_call_arg (call, 2);
> > +   bias = gimple_call_arg (call, 5);
> > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > +     return (void *)-1;
> > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > +   mask = vn_valueize (mask);
> > +   if (TREE_CODE (mask) != VECTOR_CST)
> > +     return (void *)-1;
> > +   break;
> >  default:
> >    return (void *)-1;
> >  }
> > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >        tree vectype = TREE_TYPE (def_rhs);
> >        unsigned HOST_WIDE_INT elsz
> >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > + is always true for MASK_STORE.  */
> > +       unsigned actual_len = UINT_MAX;
> > +       if (len)
> > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > +       unsigned nunits
> > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
>  
> No, that's not correct and what I meant.  There's
> vector_cst_encoded_nelts (mask), but for example for an
> all-ones mask that would be 1.  You'd then also not access
> VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> is how to recover the implicit walk over all elements for the
> purpose of computing start/length and how generally this will
> work for variable-length vectors where enumerating "all"
> elements touched is required for correctness.
>  
> The most simple thing would be to make this all conditional
> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> asking for VL vector testcases.
>  
> Richard.
>  
> >        if (mask)
> >  {
> >    HOST_WIDE_INT start = 0, length = 0;
> > -   unsigned mask_idx = 0;
> > -   do
> > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> >      {
> >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> >  {
> > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >  }
> >        else
> >  length += elsz;
> > -       mask_idx++;
> >      }
> > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> >    if (length != 0)
> >      {
> >        pd.rhs_off = start;
> > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >  {
> >    pd.offset = offset2i;
> >    pd.size = (tree_to_uhwi (len)
> > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> >    if (BYTES_BIG_ENDIAN)
> >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> >    else
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  
juzhe.zhong@rivai.ai June 27, 2023, 8:09 a.m. UTC | #5
Hi, Richi.
After several tries, I found a case that is "close to" have CSE opportunity in RVV for VL vectors:

void __attribute__((noinline,noclone))
foo (uint16_t *out, uint16_t *res)
{
  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1 };
  int i;
  for (i = 0; i < 8; ++i)
    {
      if (mask[i])
        out[i] = 33;
    }
  uint16_t o0 = out[0];
  uint16_t o7 = out[3];
  uint16_t o14 = out[6];
  uint16_t o15 = out[7];
  res[0] = o0;
  res[2] = o7;
  res[4] = o14;
  res[6] = o15;
}

The Gimple IR:
  _64 = .SELECT_VL (ivtmp_31, POLY_INT_CST [4, 4]);
  vect__1.15_54 = .LEN_MASK_LOAD (vectp_mask.13_21, 32B, _64, { -1, ... }, 0);
  mask__7.16_56 = vect__1.15_54 != { 0, ... };
  .LEN_MASK_STORE (vectp_out.17_34, 16B, _64, mask__7.16_56, { 33, ... }, 0);

You can see the "len" is always variable produced by SELECT_VL so it failed to have CSE opportunity.
And I tried in ARM SVE: https://godbolt.org/z/63a6WcT9o
It also fail to have CSE opportunity.

It seems that it's difficult to have such CSE opportunity in VL vectors.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-27 15:47
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford; pan2.li
Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.
> 
> When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> then I failed to walk all elements analysis.
> 
> >> The most simple thing would be to make this all conditional
> >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> >> asking for VL vector testcases.
> Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> in C code.
 
The original motivation was from unrolled vector code for the
conditional masking case.  Does RVV allow MASK_STORE from
intrinsics?
 
Otherwise how about
 
for (i = 0; i < 8; ++i)
   a[i] = 42;
for (i = 2; i < 6; ++i)
   b[i] = a[i];
 
with disabling complete unrolling -fdisable-tree-cunrolli both
loops should get vectorized, hopefully not iterating(?)  Then
we end up with the overlapping CSE opportunity, no?  Maybe
it needs support for fixed size vectors and using VL vectors
just for the epilog.
 
Richard.
 
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 15:33
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, Richi.
> > 
> > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > 
> > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > 
> > I am not sure whether I am on the same page with you.
> > 
> > Feel free to correct me, Thanks.
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > 
> > ---
> >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> >  2 files changed, 49 insertions(+), 5 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > 
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > new file mode 100644
> > index 00000000000..0b2d03693dc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > +
> > +void __attribute__((noinline,noclone))
> > +foo (int *out, int *res)
> > +{
> > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > +  int i;
> > +  for (i = 0; i < 16; ++i)
> > +    {
> > +      if (mask[i])
> > +        out[i] = i;
> > +    }
> > +  int o0 = out[0];
> > +  int o7 = out[7];
> > +  int o14 = out[14];
> > +  int o15 = out[15];
> > +  res[0] = o0;
> > +  res[2] = o7;
> > +  res[4] = o14;
> > +  res[6] = o15;
> > +}
> > +
> > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > +   vector iterations.  FRE5 after that should be able to CSE
> > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > index 11061a374a2..242d82d6274 100644
> > --- a/gcc/tree-ssa-sccvn.cc
> > +++ b/gcc/tree-ssa-sccvn.cc
> > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> >      return (void *)-1;
> >    break;
> > + case IFN_LEN_MASK_STORE:
> > +   len = gimple_call_arg (call, 2);
> > +   bias = gimple_call_arg (call, 5);
> > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > +     return (void *)-1;
> > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > +   mask = vn_valueize (mask);
> > +   if (TREE_CODE (mask) != VECTOR_CST)
> > +     return (void *)-1;
> > +   break;
> >  default:
> >    return (void *)-1;
> >  }
> > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >        tree vectype = TREE_TYPE (def_rhs);
> >        unsigned HOST_WIDE_INT elsz
> >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > + is always true for MASK_STORE.  */
> > +       unsigned actual_len = UINT_MAX;
> > +       if (len)
> > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > +       unsigned nunits
> > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
>  
> No, that's not correct and what I meant.  There's
> vector_cst_encoded_nelts (mask), but for example for an
> all-ones mask that would be 1.  You'd then also not access
> VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> is how to recover the implicit walk over all elements for the
> purpose of computing start/length and how generally this will
> work for variable-length vectors where enumerating "all"
> elements touched is required for correctness.
>  
> The most simple thing would be to make this all conditional
> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> asking for VL vector testcases.
>  
> Richard.
>  
> >        if (mask)
> >  {
> >    HOST_WIDE_INT start = 0, length = 0;
> > -   unsigned mask_idx = 0;
> > -   do
> > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> >      {
> >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> >  {
> > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >  }
> >        else
> >  length += elsz;
> > -       mask_idx++;
> >      }
> > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> >    if (length != 0)
> >      {
> >        pd.rhs_off = start;
> > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> >  {
> >    pd.offset = offset2i;
> >    pd.size = (tree_to_uhwi (len)
> > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> >    if (BYTES_BIG_ENDIAN)
> >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> >    else
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  
Richard Biener June 27, 2023, 8:28 a.m. UTC | #6
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> 
> >> Does RVV allow MASK_STORE from
> >> intrinsics?
> No, RVV didn't use any internal_fn in intrinsics.
> 
> >>with disabling complete unrolling -fdisable-tree-cunrolli both
> >>loops should get vectorized, hopefully not iterating(?)  Then
> >>we end up with the overlapping CSE opportunity, no?  Maybe
> >>it needs support for fixed size vectors and using VL vectors
> >>just for the epilog.
> 
> I tried on ARM:
> https://godbolt.org/z/cTET8f7W9 
> 
> It seems that it can't reproduce CSE opportunity.
> I am not sure whether I am doing something wrong.
> Besides, RVV currently can't have VLS modes (V4SI) since LTO issue.
> I am trying to find a CSE opportunity on VL vectors.

So for example

void foo (int * __restrict a, int *b)
{
  for (int i = 0; i < 9; ++i)
    a[i] = 42;
  for (int i = 2; i < 6; ++i)
    b[i] = a[i];
}

with -O3 -fdisable-tree-cunrolli -march=znver4 --param 
vect-partial-vector-usage=1  -fno-tree-loop-distribute-patterns
gets us before FRE4

  .MASK_STORE (a_14(D), 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 
0, 0, 0, 0, 0 }, { 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 
42, 42 });
  vectp_a.10_42 = a_14(D) + 64;
  ivtmp_46 = 249;
  _47 = {ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, 
ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, ivtmp_46, 
ivtmp_46, ivtmp_46, ivtmp_46};
  _48 = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } < _47;
  vectp_a.6_11 = a_14(D) + 8;
  vectp_b.9_34 = b_15(D) + 8;
  vect__8.7_33 = MEM <vector(4) int> [(int *)vectp_a.6_11];
  MEM <vector(4) int> [(int *)vectp_b.9_34] = vect__8.7_33;

where we can optimize the non-VL vector non-masked load:

  MEM <vector(4) int> [(int *)vectp_b.9_34] = { 42, 42, 42, 42 };

We don't optimize a masked load in this place (GIMPLE currently
doesn't define the value of the masked elements - we want to
nail those to zero though).  So what we are after is VLS mode
loads optimized from VLA masked stores.  IIRC there's a
testcase in the testsuite that triggered for power/s390 with
LEN_STORE.

There's always the possibility to write GIMPLE testcases but
you have to expect to run into parser limitations as I never
tried to write VLA vector code there.

I'd say put this patch on hold until we get a motivating testcase.
That's either when you add VLS modes so you could get the above
(but then you will likely not get len_mask_store but at most
mask_store).

Maybe SVE folks can come up with something useful.

Another thing to test is for example scalar element load CSE
from VLA [masked/with-len] stores if we can for example
compute a lower bound on the number of elements accessed.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 15:47
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Hi, Richi.
> > 
> > When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> > then I failed to walk all elements analysis.
> > 
> > >> The most simple thing would be to make this all conditional
> > >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > >> asking for VL vector testcases.
> > Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> > in C code.
>  
> The original motivation was from unrolled vector code for the
> conditional masking case.  Does RVV allow MASK_STORE from
> intrinsics?
>  
> Otherwise how about
>  
> for (i = 0; i < 8; ++i)
>    a[i] = 42;
> for (i = 2; i < 6; ++i)
>    b[i] = a[i];
>  
> with disabling complete unrolling -fdisable-tree-cunrolli both
> loops should get vectorized, hopefully not iterating(?)  Then
> we end up with the overlapping CSE opportunity, no?  Maybe
> it needs support for fixed size vectors and using VL vectors
> just for the epilog.
>  
> Richard.
>  
>  
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-06-27 15:33
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford; pan2.li
> > Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > 
> > > Hi, Richi.
> > > 
> > > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > > 
> > > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > > 
> > > I am not sure whether I am on the same page with you.
> > > 
> > > Feel free to correct me, Thanks.
> > > 
> > > gcc/ChangeLog:
> > > 
> > >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > > 
> > > gcc/testsuite/ChangeLog:
> > > 
> > >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > > 
> > > ---
> > >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> > >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> > >  2 files changed, 49 insertions(+), 5 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > 
> > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > new file mode 100644
> > > index 00000000000..0b2d03693dc
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > > +
> > > +void __attribute__((noinline,noclone))
> > > +foo (int *out, int *res)
> > > +{
> > > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > > +  int i;
> > > +  for (i = 0; i < 16; ++i)
> > > +    {
> > > +      if (mask[i])
> > > +        out[i] = i;
> > > +    }
> > > +  int o0 = out[0];
> > > +  int o7 = out[7];
> > > +  int o14 = out[14];
> > > +  int o15 = out[15];
> > > +  res[0] = o0;
> > > +  res[2] = o7;
> > > +  res[4] = o14;
> > > +  res[6] = o15;
> > > +}
> > > +
> > > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > > +   vector iterations.  FRE5 after that should be able to CSE
> > > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > > index 11061a374a2..242d82d6274 100644
> > > --- a/gcc/tree-ssa-sccvn.cc
> > > +++ b/gcc/tree-ssa-sccvn.cc
> > > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > >      return (void *)-1;
> > >    break;
> > > + case IFN_LEN_MASK_STORE:
> > > +   len = gimple_call_arg (call, 2);
> > > +   bias = gimple_call_arg (call, 5);
> > > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > +     return (void *)-1;
> > > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > > +   mask = vn_valueize (mask);
> > > +   if (TREE_CODE (mask) != VECTOR_CST)
> > > +     return (void *)-1;
> > > +   break;
> > >  default:
> > >    return (void *)-1;
> > >  }
> > > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >        tree vectype = TREE_TYPE (def_rhs);
> > >        unsigned HOST_WIDE_INT elsz
> > >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > > + is always true for MASK_STORE.  */
> > > +       unsigned actual_len = UINT_MAX;
> > > +       if (len)
> > > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > > +       unsigned nunits
> > > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
> >  
> > No, that's not correct and what I meant.  There's
> > vector_cst_encoded_nelts (mask), but for example for an
> > all-ones mask that would be 1.  You'd then also not access
> > VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> > is how to recover the implicit walk over all elements for the
> > purpose of computing start/length and how generally this will
> > work for variable-length vectors where enumerating "all"
> > elements touched is required for correctness.
> >  
> > The most simple thing would be to make this all conditional
> > to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > asking for VL vector testcases.
> >  
> > Richard.
> >  
> > >        if (mask)
> > >  {
> > >    HOST_WIDE_INT start = 0, length = 0;
> > > -   unsigned mask_idx = 0;
> > > -   do
> > > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> > >      {
> > >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> > >  {
> > > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >  }
> > >        else
> > >  length += elsz;
> > > -       mask_idx++;
> > >      }
> > > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> > >    if (length != 0)
> > >      {
> > >        pd.rhs_off = start;
> > > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >  {
> > >    pd.offset = offset2i;
> > >    pd.size = (tree_to_uhwi (len)
> > > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> > >    if (BYTES_BIG_ENDIAN)
> > >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> > >    else
> > > 
> >  
> > 
>  
>
  
Richard Biener June 27, 2023, 8:34 a.m. UTC | #7
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> After several tries, I found a case that is "close to" have CSE opportunity in RVV for VL vectors:
> 
> void __attribute__((noinline,noclone))
> foo (uint16_t *out, uint16_t *res)
> {
>   int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1 };
>   int i;
>   for (i = 0; i < 8; ++i)
>     {
>       if (mask[i])
>         out[i] = 33;
>     }
>   uint16_t o0 = out[0];
>   uint16_t o7 = out[3];
>   uint16_t o14 = out[6];
>   uint16_t o15 = out[7];
>   res[0] = o0;
>   res[2] = o7;
>   res[4] = o14;
>   res[6] = o15;
> }
> 
> The Gimple IR:
>   _64 = .SELECT_VL (ivtmp_31, POLY_INT_CST [4, 4]);
>   vect__1.15_54 = .LEN_MASK_LOAD (vectp_mask.13_21, 32B, _64, { -1, ... }, 0);
>   mask__7.16_56 = vect__1.15_54 != { 0, ... };
>   .LEN_MASK_STORE (vectp_out.17_34, 16B, _64, mask__7.16_56, { 33, ... }, 0);
> 
> You can see the "len" is always variable produced by SELECT_VL so it failed to have CSE opportunity.
> And I tried in ARM SVE: https://godbolt.org/z/63a6WcT9o
> It also fail to have CSE opportunity.
> 
> It seems that it's difficult to have such CSE opportunity in VL vectors.

Ah.  Nice example.  This shows we fail to constant fold
[vec_unpack_lo_expr] { -1, -1, ..., 0, 0, 0, ... } which means we
fail to fold the .MASK_LOAD from 'mask' (that's not something we
support, see my previous answer) and that means the .MASK_STORE mask
doesn't end up constant.

I understant that SVE cannot easily generate all constant [masks]
so we probably shouldn't aggressively perform elimination on
constant foldings but for actual constant folding of dependent stmts
it might be nice to have the constant folded masks.

I will open a bugreport with this testcase.

Richard.

> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 15:47
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Hi, Richi.
> > 
> > When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> > then I failed to walk all elements analysis.
> > 
> > >> The most simple thing would be to make this all conditional
> > >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > >> asking for VL vector testcases.
> > Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> > in C code.
>  
> The original motivation was from unrolled vector code for the
> conditional masking case.  Does RVV allow MASK_STORE from
> intrinsics?
>  
> Otherwise how about
>  
> for (i = 0; i < 8; ++i)
>    a[i] = 42;
> for (i = 2; i < 6; ++i)
>    b[i] = a[i];
>  
> with disabling complete unrolling -fdisable-tree-cunrolli both
> loops should get vectorized, hopefully not iterating(?)  Then
> we end up with the overlapping CSE opportunity, no?  Maybe
> it needs support for fixed size vectors and using VL vectors
> just for the epilog.
>  
> Richard.
>  
>  
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-06-27 15:33
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford; pan2.li
> > Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > 
> > > Hi, Richi.
> > > 
> > > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > > 
> > > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > > 
> > > I am not sure whether I am on the same page with you.
> > > 
> > > Feel free to correct me, Thanks.
> > > 
> > > gcc/ChangeLog:
> > > 
> > >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > > 
> > > gcc/testsuite/ChangeLog:
> > > 
> > >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > > 
> > > ---
> > >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> > >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> > >  2 files changed, 49 insertions(+), 5 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > 
> > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > new file mode 100644
> > > index 00000000000..0b2d03693dc
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > > +
> > > +void __attribute__((noinline,noclone))
> > > +foo (int *out, int *res)
> > > +{
> > > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > > +  int i;
> > > +  for (i = 0; i < 16; ++i)
> > > +    {
> > > +      if (mask[i])
> > > +        out[i] = i;
> > > +    }
> > > +  int o0 = out[0];
> > > +  int o7 = out[7];
> > > +  int o14 = out[14];
> > > +  int o15 = out[15];
> > > +  res[0] = o0;
> > > +  res[2] = o7;
> > > +  res[4] = o14;
> > > +  res[6] = o15;
> > > +}
> > > +
> > > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > > +   vector iterations.  FRE5 after that should be able to CSE
> > > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > > index 11061a374a2..242d82d6274 100644
> > > --- a/gcc/tree-ssa-sccvn.cc
> > > +++ b/gcc/tree-ssa-sccvn.cc
> > > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > >      return (void *)-1;
> > >    break;
> > > + case IFN_LEN_MASK_STORE:
> > > +   len = gimple_call_arg (call, 2);
> > > +   bias = gimple_call_arg (call, 5);
> > > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > +     return (void *)-1;
> > > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > > +   mask = vn_valueize (mask);
> > > +   if (TREE_CODE (mask) != VECTOR_CST)
> > > +     return (void *)-1;
> > > +   break;
> > >  default:
> > >    return (void *)-1;
> > >  }
> > > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >        tree vectype = TREE_TYPE (def_rhs);
> > >        unsigned HOST_WIDE_INT elsz
> > >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > > + is always true for MASK_STORE.  */
> > > +       unsigned actual_len = UINT_MAX;
> > > +       if (len)
> > > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > > +       unsigned nunits
> > > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
> >  
> > No, that's not correct and what I meant.  There's
> > vector_cst_encoded_nelts (mask), but for example for an
> > all-ones mask that would be 1.  You'd then also not access
> > VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> > is how to recover the implicit walk over all elements for the
> > purpose of computing start/length and how generally this will
> > work for variable-length vectors where enumerating "all"
> > elements touched is required for correctness.
> >  
> > The most simple thing would be to make this all conditional
> > to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > asking for VL vector testcases.
> >  
> > Richard.
> >  
> > >        if (mask)
> > >  {
> > >    HOST_WIDE_INT start = 0, length = 0;
> > > -   unsigned mask_idx = 0;
> > > -   do
> > > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> > >      {
> > >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> > >  {
> > > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >  }
> > >        else
> > >  length += elsz;
> > > -       mask_idx++;
> > >      }
> > > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> > >    if (length != 0)
> > >      {
> > >        pd.rhs_off = start;
> > > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >  {
> > >    pd.offset = offset2i;
> > >    pd.size = (tree_to_uhwi (len)
> > > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> > >    if (BYTES_BIG_ENDIAN)
> > >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> > >    else
> > > 
> >  
> > 
>  
>
  
juzhe.zhong@rivai.ai June 27, 2023, 8:47 a.m. UTC | #8
Hi, Richi.  After reading your emails.

Is that correct that I put supporting LEN_MASK_STORE into SCCVN on hold for now ?

Go ahead to the next RVV auto-vectorization support patterns in middle-end (for example I sent to add optabs and internal fn for LEN_MASK_GATHER_LOAD).
Then, after I finish all the RVV auto-vectorization patterns in middle-end, I come back to take a look at LEN_MASK_STORE in SCCVN?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-27 16:34
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford; pan2.li
Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.
> After several tries, I found a case that is "close to" have CSE opportunity in RVV for VL vectors:
> 
> void __attribute__((noinline,noclone))
> foo (uint16_t *out, uint16_t *res)
> {
>   int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1 };
>   int i;
>   for (i = 0; i < 8; ++i)
>     {
>       if (mask[i])
>         out[i] = 33;
>     }
>   uint16_t o0 = out[0];
>   uint16_t o7 = out[3];
>   uint16_t o14 = out[6];
>   uint16_t o15 = out[7];
>   res[0] = o0;
>   res[2] = o7;
>   res[4] = o14;
>   res[6] = o15;
> }
> 
> The Gimple IR:
>   _64 = .SELECT_VL (ivtmp_31, POLY_INT_CST [4, 4]);
>   vect__1.15_54 = .LEN_MASK_LOAD (vectp_mask.13_21, 32B, _64, { -1, ... }, 0);
>   mask__7.16_56 = vect__1.15_54 != { 0, ... };
>   .LEN_MASK_STORE (vectp_out.17_34, 16B, _64, mask__7.16_56, { 33, ... }, 0);
> 
> You can see the "len" is always variable produced by SELECT_VL so it failed to have CSE opportunity.
> And I tried in ARM SVE: https://godbolt.org/z/63a6WcT9o
> It also fail to have CSE opportunity.
> 
> It seems that it's difficult to have such CSE opportunity in VL vectors.
 
Ah.  Nice example.  This shows we fail to constant fold
[vec_unpack_lo_expr] { -1, -1, ..., 0, 0, 0, ... } which means we
fail to fold the .MASK_LOAD from 'mask' (that's not something we
support, see my previous answer) and that means the .MASK_STORE mask
doesn't end up constant.
 
I understant that SVE cannot easily generate all constant [masks]
so we probably shouldn't aggressively perform elimination on
constant foldings but for actual constant folding of dependent stmts
it might be nice to have the constant folded masks.
 
I will open a bugreport with this testcase.
 
Richard.
 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 15:47
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Hi, Richi.
> > 
> > When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> > then I failed to walk all elements analysis.
> > 
> > >> The most simple thing would be to make this all conditional
> > >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > >> asking for VL vector testcases.
> > Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> > in C code.
>  
> The original motivation was from unrolled vector code for the
> conditional masking case.  Does RVV allow MASK_STORE from
> intrinsics?
>  
> Otherwise how about
>  
> for (i = 0; i < 8; ++i)
>    a[i] = 42;
> for (i = 2; i < 6; ++i)
>    b[i] = a[i];
>  
> with disabling complete unrolling -fdisable-tree-cunrolli both
> loops should get vectorized, hopefully not iterating(?)  Then
> we end up with the overlapping CSE opportunity, no?  Maybe
> it needs support for fixed size vectors and using VL vectors
> just for the epilog.
>  
> Richard.
>  
>  
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-06-27 15:33
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford; pan2.li
> > Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > 
> > > Hi, Richi.
> > > 
> > > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > > 
> > > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > > 
> > > I am not sure whether I am on the same page with you.
> > > 
> > > Feel free to correct me, Thanks.
> > > 
> > > gcc/ChangeLog:
> > > 
> > >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > > 
> > > gcc/testsuite/ChangeLog:
> > > 
> > >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > > 
> > > ---
> > >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> > >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> > >  2 files changed, 49 insertions(+), 5 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > 
> > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > new file mode 100644
> > > index 00000000000..0b2d03693dc
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > > +
> > > +void __attribute__((noinline,noclone))
> > > +foo (int *out, int *res)
> > > +{
> > > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > > +  int i;
> > > +  for (i = 0; i < 16; ++i)
> > > +    {
> > > +      if (mask[i])
> > > +        out[i] = i;
> > > +    }
> > > +  int o0 = out[0];
> > > +  int o7 = out[7];
> > > +  int o14 = out[14];
> > > +  int o15 = out[15];
> > > +  res[0] = o0;
> > > +  res[2] = o7;
> > > +  res[4] = o14;
> > > +  res[6] = o15;
> > > +}
> > > +
> > > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > > +   vector iterations.  FRE5 after that should be able to CSE
> > > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > > index 11061a374a2..242d82d6274 100644
> > > --- a/gcc/tree-ssa-sccvn.cc
> > > +++ b/gcc/tree-ssa-sccvn.cc
> > > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > >      return (void *)-1;
> > >    break;
> > > + case IFN_LEN_MASK_STORE:
> > > +   len = gimple_call_arg (call, 2);
> > > +   bias = gimple_call_arg (call, 5);
> > > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > +     return (void *)-1;
> > > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > > +   mask = vn_valueize (mask);
> > > +   if (TREE_CODE (mask) != VECTOR_CST)
> > > +     return (void *)-1;
> > > +   break;
> > >  default:
> > >    return (void *)-1;
> > >  }
> > > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >        tree vectype = TREE_TYPE (def_rhs);
> > >        unsigned HOST_WIDE_INT elsz
> > >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > > + is always true for MASK_STORE.  */
> > > +       unsigned actual_len = UINT_MAX;
> > > +       if (len)
> > > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > > +       unsigned nunits
> > > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
> >  
> > No, that's not correct and what I meant.  There's
> > vector_cst_encoded_nelts (mask), but for example for an
> > all-ones mask that would be 1.  You'd then also not access
> > VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> > is how to recover the implicit walk over all elements for the
> > purpose of computing start/length and how generally this will
> > work for variable-length vectors where enumerating "all"
> > elements touched is required for correctness.
> >  
> > The most simple thing would be to make this all conditional
> > to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > asking for VL vector testcases.
> >  
> > Richard.
> >  
> > >        if (mask)
> > >  {
> > >    HOST_WIDE_INT start = 0, length = 0;
> > > -   unsigned mask_idx = 0;
> > > -   do
> > > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> > >      {
> > >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> > >  {
> > > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >  }
> > >        else
> > >  length += elsz;
> > > -       mask_idx++;
> > >      }
> > > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> > >    if (length != 0)
> > >      {
> > >        pd.rhs_off = start;
> > > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > >  {
> > >    pd.offset = offset2i;
> > >    pd.size = (tree_to_uhwi (len)
> > > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> > >    if (BYTES_BIG_ENDIAN)
> > >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> > >    else
> > > 
> >  
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  
Richard Biener June 27, 2023, 8:56 a.m. UTC | #9
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.  After reading your emails.
> 
> Is that correct that I put supporting LEN_MASK_STORE into SCCVN on hold for now ?
> 
> Go ahead to the next RVV auto-vectorization support patterns in 
> middle-end (for example I sent to add optabs and internal fn for 
> LEN_MASK_GATHER_LOAD). Then, after I finish all the RVV 
> auto-vectorization patterns in middle-end, I come back to take a look at 
> LEN_MASK_STORE in SCCVN?

Yes, I don't think it makes much sense to have the support without
being able to write a testcase that exercises it (and verifies
correctness).

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 16:34
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Hi, Richi.
> > After several tries, I found a case that is "close to" have CSE opportunity in RVV for VL vectors:
> > 
> > void __attribute__((noinline,noclone))
> > foo (uint16_t *out, uint16_t *res)
> > {
> >   int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1 };
> >   int i;
> >   for (i = 0; i < 8; ++i)
> >     {
> >       if (mask[i])
> >         out[i] = 33;
> >     }
> >   uint16_t o0 = out[0];
> >   uint16_t o7 = out[3];
> >   uint16_t o14 = out[6];
> >   uint16_t o15 = out[7];
> >   res[0] = o0;
> >   res[2] = o7;
> >   res[4] = o14;
> >   res[6] = o15;
> > }
> > 
> > The Gimple IR:
> >   _64 = .SELECT_VL (ivtmp_31, POLY_INT_CST [4, 4]);
> >   vect__1.15_54 = .LEN_MASK_LOAD (vectp_mask.13_21, 32B, _64, { -1, ... }, 0);
> >   mask__7.16_56 = vect__1.15_54 != { 0, ... };
> >   .LEN_MASK_STORE (vectp_out.17_34, 16B, _64, mask__7.16_56, { 33, ... }, 0);
> > 
> > You can see the "len" is always variable produced by SELECT_VL so it failed to have CSE opportunity.
> > And I tried in ARM SVE: https://godbolt.org/z/63a6WcT9o
> > It also fail to have CSE opportunity.
> > 
> > It seems that it's difficult to have such CSE opportunity in VL vectors.
>  
> Ah.  Nice example.  This shows we fail to constant fold
> [vec_unpack_lo_expr] { -1, -1, ..., 0, 0, 0, ... } which means we
> fail to fold the .MASK_LOAD from 'mask' (that's not something we
> support, see my previous answer) and that means the .MASK_STORE mask
> doesn't end up constant.
>  
> I understant that SVE cannot easily generate all constant [masks]
> so we probably shouldn't aggressively perform elimination on
> constant foldings but for actual constant folding of dependent stmts
> it might be nice to have the constant folded masks.
>  
> I will open a bugreport with this testcase.
>  
> Richard.
>  
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-06-27 15:47
> > To: juzhe.zhong@rivai.ai
> > CC: gcc-patches; richard.sandiford; pan2.li
> > Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > Hi, Richi.
> > > 
> > > When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> > > then I failed to walk all elements analysis.
> > > 
> > > >> The most simple thing would be to make this all conditional
> > > >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > > >> asking for VL vector testcases.
> > > Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> > > in C code.
> >  
> > The original motivation was from unrolled vector code for the
> > conditional masking case.  Does RVV allow MASK_STORE from
> > intrinsics?
> >  
> > Otherwise how about
> >  
> > for (i = 0; i < 8; ++i)
> >    a[i] = 42;
> > for (i = 2; i < 6; ++i)
> >    b[i] = a[i];
> >  
> > with disabling complete unrolling -fdisable-tree-cunrolli both
> > loops should get vectorized, hopefully not iterating(?)  Then
> > we end up with the overlapping CSE opportunity, no?  Maybe
> > it needs support for fixed size vectors and using VL vectors
> > just for the epilog.
> >  
> > Richard.
> >  
> >  
> > > Thanks.
> > > 
> > > 
> > > juzhe.zhong@rivai.ai
> > >  
> > > From: Richard Biener
> > > Date: 2023-06-27 15:33
> > > To: Ju-Zhe Zhong
> > > CC: gcc-patches; richard.sandiford; pan2.li
> > > Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> > >  
> > > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > > 
> > > > Hi, Richi.
> > > > 
> > > > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > > > 
> > > > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > > > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > > > 
> > > > I am not sure whether I am on the same page with you.
> > > > 
> > > > Feel free to correct me, Thanks.
> > > > 
> > > > gcc/ChangeLog:
> > > > 
> > > >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > > > 
> > > > gcc/testsuite/ChangeLog:
> > > > 
> > > >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > > > 
> > > > ---
> > > >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> > > >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> > > >  2 files changed, 49 insertions(+), 5 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > > 
> > > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > > new file mode 100644
> > > > index 00000000000..0b2d03693dc
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > > @@ -0,0 +1,30 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > > > +
> > > > +void __attribute__((noinline,noclone))
> > > > +foo (int *out, int *res)
> > > > +{
> > > > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > > > +  int i;
> > > > +  for (i = 0; i < 16; ++i)
> > > > +    {
> > > > +      if (mask[i])
> > > > +        out[i] = i;
> > > > +    }
> > > > +  int o0 = out[0];
> > > > +  int o7 = out[7];
> > > > +  int o14 = out[14];
> > > > +  int o15 = out[15];
> > > > +  res[0] = o0;
> > > > +  res[2] = o7;
> > > > +  res[4] = o14;
> > > > +  res[6] = o15;
> > > > +}
> > > > +
> > > > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > > > +   vector iterations.  FRE5 after that should be able to CSE
> > > > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > > > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > > > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > > > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > > > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > > > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > > > index 11061a374a2..242d82d6274 100644
> > > > --- a/gcc/tree-ssa-sccvn.cc
> > > > +++ b/gcc/tree-ssa-sccvn.cc
> > > > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > >      return (void *)-1;
> > > >    break;
> > > > + case IFN_LEN_MASK_STORE:
> > > > +   len = gimple_call_arg (call, 2);
> > > > +   bias = gimple_call_arg (call, 5);
> > > > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > > +     return (void *)-1;
> > > > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > > > +   mask = vn_valueize (mask);
> > > > +   if (TREE_CODE (mask) != VECTOR_CST)
> > > > +     return (void *)-1;
> > > > +   break;
> > > >  default:
> > > >    return (void *)-1;
> > > >  }
> > > > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >        tree vectype = TREE_TYPE (def_rhs);
> > > >        unsigned HOST_WIDE_INT elsz
> > > >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > > > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > > > + is always true for MASK_STORE.  */
> > > > +       unsigned actual_len = UINT_MAX;
> > > > +       if (len)
> > > > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > > > +       unsigned nunits
> > > > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
> > >  
> > > No, that's not correct and what I meant.  There's
> > > vector_cst_encoded_nelts (mask), but for example for an
> > > all-ones mask that would be 1.  You'd then also not access
> > > VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> > > is how to recover the implicit walk over all elements for the
> > > purpose of computing start/length and how generally this will
> > > work for variable-length vectors where enumerating "all"
> > > elements touched is required for correctness.
> > >  
> > > The most simple thing would be to make this all conditional
> > > to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > > asking for VL vector testcases.
> > >  
> > > Richard.
> > >  
> > > >        if (mask)
> > > >  {
> > > >    HOST_WIDE_INT start = 0, length = 0;
> > > > -   unsigned mask_idx = 0;
> > > > -   do
> > > > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> > > >      {
> > > >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> > > >  {
> > > > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >  }
> > > >        else
> > > >  length += elsz;
> > > > -       mask_idx++;
> > > >      }
> > > > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> > > >    if (length != 0)
> > > >      {
> > > >        pd.rhs_off = start;
> > > > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >  {
> > > >    pd.offset = offset2i;
> > > >    pd.size = (tree_to_uhwi (len)
> > > > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > >    if (BYTES_BIG_ENDIAN)
> > > >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> > > >    else
> > > > 
> > >  
> > > 
> >  
> > 
>  
>
  
juzhe.zhong@rivai.ai June 27, 2023, 8:59 a.m. UTC | #10
Thanks so much! Richi, I am gonna open a BUG that I won't forget this issue.

And I am gonna go ahead on LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE:
This is the first patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/622824.html 
which is only adding optabs && internal_fn and documents (No complicate vectorizer support).

Really appreciate your help!


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-27 16:56
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford; pan2.li
Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.  After reading your emails.
> 
> Is that correct that I put supporting LEN_MASK_STORE into SCCVN on hold for now ?
> 
> Go ahead to the next RVV auto-vectorization support patterns in 
> middle-end (for example I sent to add optabs and internal fn for 
> LEN_MASK_GATHER_LOAD). Then, after I finish all the RVV 
> auto-vectorization patterns in middle-end, I come back to take a look at 
> LEN_MASK_STORE in SCCVN?
 
Yes, I don't think it makes much sense to have the support without
being able to write a testcase that exercises it (and verifies
correctness).
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-06-27 16:34
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; pan2.li
> Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Hi, Richi.
> > After several tries, I found a case that is "close to" have CSE opportunity in RVV for VL vectors:
> > 
> > void __attribute__((noinline,noclone))
> > foo (uint16_t *out, uint16_t *res)
> > {
> >   int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1 };
> >   int i;
> >   for (i = 0; i < 8; ++i)
> >     {
> >       if (mask[i])
> >         out[i] = 33;
> >     }
> >   uint16_t o0 = out[0];
> >   uint16_t o7 = out[3];
> >   uint16_t o14 = out[6];
> >   uint16_t o15 = out[7];
> >   res[0] = o0;
> >   res[2] = o7;
> >   res[4] = o14;
> >   res[6] = o15;
> > }
> > 
> > The Gimple IR:
> >   _64 = .SELECT_VL (ivtmp_31, POLY_INT_CST [4, 4]);
> >   vect__1.15_54 = .LEN_MASK_LOAD (vectp_mask.13_21, 32B, _64, { -1, ... }, 0);
> >   mask__7.16_56 = vect__1.15_54 != { 0, ... };
> >   .LEN_MASK_STORE (vectp_out.17_34, 16B, _64, mask__7.16_56, { 33, ... }, 0);
> > 
> > You can see the "len" is always variable produced by SELECT_VL so it failed to have CSE opportunity.
> > And I tried in ARM SVE: https://godbolt.org/z/63a6WcT9o
> > It also fail to have CSE opportunity.
> > 
> > It seems that it's difficult to have such CSE opportunity in VL vectors.
>  
> Ah.  Nice example.  This shows we fail to constant fold
> [vec_unpack_lo_expr] { -1, -1, ..., 0, 0, 0, ... } which means we
> fail to fold the .MASK_LOAD from 'mask' (that's not something we
> support, see my previous answer) and that means the .MASK_STORE mask
> doesn't end up constant.
>  
> I understant that SVE cannot easily generate all constant [masks]
> so we probably shouldn't aggressively perform elimination on
> constant foldings but for actual constant folding of dependent stmts
> it might be nice to have the constant folded masks.
>  
> I will open a bugreport with this testcase.
>  
> Richard.
>  
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-06-27 15:47
> > To: juzhe.zhong@rivai.ai
> > CC: gcc-patches; richard.sandiford; pan2.li
> > Subject: Re: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > Hi, Richi.
> > > 
> > > When I try vector_cst_encoded_nelts (mask), the testcase I append is 2 but actual actual nunits is 8 in that case,
> > > then I failed to walk all elements analysis.
> > > 
> > > >> The most simple thing would be to make this all conditional
> > > >> to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > > >> asking for VL vector testcases.
> > > Ok, I understand your point, but RVV doesn't use LEN_MASK_STORE in intrinsics. I am not sure how to reproduce VL vectors
> > > in C code.
> >  
> > The original motivation was from unrolled vector code for the
> > conditional masking case.  Does RVV allow MASK_STORE from
> > intrinsics?
> >  
> > Otherwise how about
> >  
> > for (i = 0; i < 8; ++i)
> >    a[i] = 42;
> > for (i = 2; i < 6; ++i)
> >    b[i] = a[i];
> >  
> > with disabling complete unrolling -fdisable-tree-cunrolli both
> > loops should get vectorized, hopefully not iterating(?)  Then
> > we end up with the overlapping CSE opportunity, no?  Maybe
> > it needs support for fixed size vectors and using VL vectors
> > just for the epilog.
> >  
> > Richard.
> >  
> >  
> > > Thanks.
> > > 
> > > 
> > > juzhe.zhong@rivai.ai
> > >  
> > > From: Richard Biener
> > > Date: 2023-06-27 15:33
> > > To: Ju-Zhe Zhong
> > > CC: gcc-patches; richard.sandiford; pan2.li
> > > Subject: Re: [PATCH V4] SCCVN: Add LEN_MASK_STORE and fix LEN_STORE
> > > On Tue, 27 Jun 2023, juzhe.zhong@rivai.ai wrote:
> > >  
> > > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > > 
> > > > Hi, Richi.
> > > > 
> > > > I tried to understand your last email and to refactor the do-while loop using VECTOR_CST_NELTS.
> > > > 
> > > > This patch works fine for LEN_MASK_STORE and compiler can CSE redundant store.
> > > > I have appended testcase in this patch to test VN for LEN_MASK_STORE.
> > > > 
> > > > I am not sure whether I am on the same page with you.
> > > > 
> > > > Feel free to correct me, Thanks.
> > > > 
> > > > gcc/ChangeLog:
> > > > 
> > > >         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Add LEN_MASK_STORE and fix LEN_STORE
> > > > 
> > > > gcc/testsuite/ChangeLog:
> > > > 
> > > >         * gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c: New test.
> > > > 
> > > > ---
> > > >  .../rvv/autovec/partial/len_maskstore_vn-1.c  | 30 +++++++++++++++++++
> > > >  gcc/tree-ssa-sccvn.cc                         | 24 +++++++++++----
> > > >  2 files changed, 49 insertions(+), 5 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > > 
> > > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > > new file mode 100644
> > > > index 00000000000..0b2d03693dc
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
> > > > @@ -0,0 +1,30 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
> > > > +
> > > > +void __attribute__((noinline,noclone))
> > > > +foo (int *out, int *res)
> > > > +{
> > > > +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> > > > +  int i;
> > > > +  for (i = 0; i < 16; ++i)
> > > > +    {
> > > > +      if (mask[i])
> > > > +        out[i] = i;
> > > > +    }
> > > > +  int o0 = out[0];
> > > > +  int o7 = out[7];
> > > > +  int o14 = out[14];
> > > > +  int o15 = out[15];
> > > > +  res[0] = o0;
> > > > +  res[2] = o7;
> > > > +  res[4] = o14;
> > > > +  res[6] = o15;
> > > > +}
> > > > +
> > > > +/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
> > > > +   vector iterations.  FRE5 after that should be able to CSE
> > > > +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> > > > +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> > > > +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> > > > +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> > > > +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> > > > diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> > > > index 11061a374a2..242d82d6274 100644
> > > > --- a/gcc/tree-ssa-sccvn.cc
> > > > +++ b/gcc/tree-ssa-sccvn.cc
> > > > @@ -3304,6 +3304,16 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > >      return (void *)-1;
> > > >    break;
> > > > + case IFN_LEN_MASK_STORE:
> > > > +   len = gimple_call_arg (call, 2);
> > > > +   bias = gimple_call_arg (call, 5);
> > > > +   if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> > > > +     return (void *)-1;
> > > > +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> > > > +   mask = vn_valueize (mask);
> > > > +   if (TREE_CODE (mask) != VECTOR_CST)
> > > > +     return (void *)-1;
> > > > +   break;
> > > >  default:
> > > >    return (void *)-1;
> > > >  }
> > > > @@ -3344,11 +3354,17 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >        tree vectype = TREE_TYPE (def_rhs);
> > > >        unsigned HOST_WIDE_INT elsz
> > > >  = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> > > > +       /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
> > > > + is always true for MASK_STORE.  */
> > > > +       unsigned actual_len = UINT_MAX;
> > > > +       if (len)
> > > > + actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
> > > > +       unsigned nunits
> > > > + = MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
> > >  
> > > No, that's not correct and what I meant.  There's
> > > vector_cst_encoded_nelts (mask), but for example for an
> > > all-ones mask that would be 1.  You'd then also not access
> > > VECTOR_CST_ELT but VECTOR_CST_ENCODED_ELT.  What I'm not sure
> > > is how to recover the implicit walk over all elements for the
> > > purpose of computing start/length and how generally this will
> > > work for variable-length vectors where enumerating "all"
> > > elements touched is required for correctness.
> > >  
> > > The most simple thing would be to make this all conditional
> > > to constant TYPE_VECTOR_SUBPARTS.  Which is also why I was
> > > asking for VL vector testcases.
> > >  
> > > Richard.
> > >  
> > > >        if (mask)
> > > >  {
> > > >    HOST_WIDE_INT start = 0, length = 0;
> > > > -   unsigned mask_idx = 0;
> > > > -   do
> > > > +   for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
> > > >      {
> > > >        if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> > > >  {
> > > > @@ -3371,9 +3387,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >  }
> > > >        else
> > > >  length += elsz;
> > > > -       mask_idx++;
> > > >      }
> > > > -   while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> > > >    if (length != 0)
> > > >      {
> > > >        pd.rhs_off = start;
> > > > @@ -3389,7 +3403,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
> > > >  {
> > > >    pd.offset = offset2i;
> > > >    pd.size = (tree_to_uhwi (len)
> > > > -      + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > > +      + tree_to_shwi (bias)) * BITS_PER_UNIT;
> > > >    if (BYTES_BIG_ENDIAN)
> > > >      pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
> > > >    else
> > > > 
> > >  
> > > 
> >  
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  

Patch

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
new file mode 100644
index 00000000000..0b2d03693dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/len_maskstore_vn-1.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax -O3 -fdump-tree-fre5" } */
+
+void __attribute__((noinline,noclone))
+foo (int *out, int *res)
+{
+  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
+  int i;
+  for (i = 0; i < 16; ++i)
+    {
+      if (mask[i])
+        out[i] = i;
+    }
+  int o0 = out[0];
+  int o7 = out[7];
+  int o14 = out[14];
+  int o15 = out[15];
+  res[0] = o0;
+  res[2] = o7;
+  res[4] = o14;
+  res[6] = o15;
+}
+
+/* Vectorization produces .LEN_MASK_STORE, unrolling will unroll the two
+   vector iterations.  FRE5 after that should be able to CSE
+   out[7] and out[15], but leave out[0] and out[14] alone.  */
+/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
+/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
+/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
+/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
index 11061a374a2..242d82d6274 100644
--- a/gcc/tree-ssa-sccvn.cc
+++ b/gcc/tree-ssa-sccvn.cc
@@ -3304,6 +3304,16 @@  vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
 	  if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
 	    return (void *)-1;
 	  break;
+	case IFN_LEN_MASK_STORE:
+	  len = gimple_call_arg (call, 2);
+	  bias = gimple_call_arg (call, 5);
+	  if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
+	    return (void *)-1;
+	  mask = gimple_call_arg (call, internal_fn_mask_index (fn));
+	  mask = vn_valueize (mask);
+	  if (TREE_CODE (mask) != VECTOR_CST)
+	    return (void *)-1;
+	  break;
 	default:
 	  return (void *)-1;
 	}
@@ -3344,11 +3354,17 @@  vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
 	      tree vectype = TREE_TYPE (def_rhs);
 	      unsigned HOST_WIDE_INT elsz
 		= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
+	      /* Set initial len value is the UINT_MAX, so mask_idx < actual_len
+		 is always true for MASK_STORE.  */
+	      unsigned actual_len = UINT_MAX;
+	      if (len)
+		actual_len = tree_to_uhwi (len) + tree_to_shwi (bias);
+	      unsigned nunits
+		= MIN (actual_len, VECTOR_CST_NELTS (mask).coeffs[0]);
 	      if (mask)
 		{
 		  HOST_WIDE_INT start = 0, length = 0;
-		  unsigned mask_idx = 0;
-		  do
+		  for (unsigned mask_idx = 0; mask_idx < nunits; mask_idx++)
 		    {
 		      if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
 			{
@@ -3371,9 +3387,7 @@  vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
 			}
 		      else
 			length += elsz;
-		      mask_idx++;
 		    }
-		  while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
 		  if (length != 0)
 		    {
 		      pd.rhs_off = start;
@@ -3389,7 +3403,7 @@  vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
 		{
 		  pd.offset = offset2i;
 		  pd.size = (tree_to_uhwi (len)
-			     + -tree_to_shwi (bias)) * BITS_PER_UNIT;
+			     + tree_to_shwi (bias)) * BITS_PER_UNIT;
 		  if (BYTES_BIG_ENDIAN)
 		    pd.rhs_off = pd.size - tree_to_uhwi (TYPE_SIZE (vectype));
 		  else