[V2] RISC-V: Fix ICE for the fusion case from vsetvl to scalar move[PR111927]

Message ID 20231023094034.1728130-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series [V2] RISC-V: Fix ICE for the fusion case from vsetvl to scalar move[PR111927] |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai Oct. 23, 2023, 9:40 a.m. UTC
  ICE:

during RTL pass: vsetvl
<source>: In function 'riscv_lms_f32':
<source>:240:1: internal compiler error: in merge, at config/riscv/riscv-vsetvl.cc:1997
  240 | }

In general compatible_p (avl_equal_p) has:

    if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
      return false;

Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions.

It is reasonable to add it into 'can_use_next_avl_p' since we don't want to
fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL.
And after the fusion, we will always use compatible_p to check whether the demand
is correct or not.

	PR target/111927

gcc/ChangeLog:

	* config/riscv/riscv-vsetvl.cc (can_use_next_avl_p): Forbid AVL/VL
	propagation when VL of NEXT is used by non-RVV instructions.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/vsetvl/pr111927.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc              |  23 +++
 .../gcc.target/riscv/rvv/vsetvl/pr111927.c    | 170 ++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
  

Comments

Kito Cheng Oct. 23, 2023, 9:49 a.m. UTC | #1
LGTM

Juzhe-Zhong <juzhe.zhong@rivai.ai> 於 2023年10月23日 週一 17:41 寫道:

> ICE:
>
> during RTL pass: vsetvl
> <source>: In function 'riscv_lms_f32':
> <source>:240:1: internal compiler error: in merge, at
> config/riscv/riscv-vsetvl.cc:1997
>   240 | }
>
> In general compatible_p (avl_equal_p) has:
>
>     if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
>       return false;
>
> Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions.
>
> It is reasonable to add it into 'can_use_next_avl_p' since we don't want to
> fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL.
> And after the fusion, we will alway use compatible_p to check whether the
> demand
> is correct or not.
>
>         PR target/111927
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vsetvl.cc: Fix bug.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/vsetvl/pr111927.c: New test.
>
> ---
>  gcc/config/riscv/riscv-vsetvl.cc              |  23 +++
>  .../gcc.target/riscv/rvv/vsetvl/pr111927.c    | 170 ++++++++++++++++++
>  2 files changed, 193 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc
> b/gcc/config/riscv/riscv-vsetvl.cc
> index 47b459fddd4..f3922a051c5 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -1541,6 +1541,29 @@ private:
>    inline bool can_use_next_avl_p (const vsetvl_info &prev,
>                                   const vsetvl_info &next)
>    {
> +    /* Forbid the AVL/VL propagation if VL of NEXT is used
> +       by non-RVV instructions.  This is because:
> +
> +        bb 2:
> +          PREV: scalar move (no AVL)
> +        bb 3:
> +          NEXT: vsetvl a5(VL), a4(AVL) ...
> +          branch a5,zero
> +
> +       Since user vsetvl instruction is no side effect instruction
> +       which should be placed in the correct and optimal location
> +       of the program by the previous PASS, it is unreasonable that
> +       VSETVL PASS tries to move it to another places if it used by
> +       non-RVV instructions.
> +
> +       Note: We only forbid the cases that VL is used by the following
> +       non-RVV instructions which will cause issues.  We don't forbid
> +       other cases since it won't cause correctness issues and we still
> +       more demand info are fused backward.  The later LCM algorithm
> +       should know the optimal location of the vsetvl.  */
> +    if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
> +      return false;
> +
>      if (!next.has_nonvlmax_reg_avl () && !next.has_vl ())
>        return true;
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
> b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
> new file mode 100644
> index 00000000000..ab599add57f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
> @@ -0,0 +1,170 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +#define RISCV_MATH_LOOPUNROLL
> +#define RISCV_MATH_VECTOR
> +typedef  float float32_t;
> +
> +  typedef struct
> +  {
> +          uint16_t numTaps;    /**< number of coefficients in the filter.
> */
> +          float32_t *pState;   /**< points to the state variable array.
> The array is of length numTaps+blockSize-1. */
> +          float32_t *pCoeffs;  /**< points to the coefficient array. The
> array is of length numTaps. */
> +          float32_t mu;        /**< step size that controls filter
> coefficient updates. */
> +  } riscv_lms_instance_f32;
> +
> +
> +void riscv_lms_f32(
> +  const riscv_lms_instance_f32 * S,
> +  const float32_t * pSrc,
> +        float32_t * pRef,
> +        float32_t * pOut,
> +        float32_t * pErr,
> +        uint32_t blockSize)
> +{
> +        float32_t *pState = S->pState;                 /* State pointer */
> +        float32_t *pCoeffs = S->pCoeffs;               /* Coefficient
> pointer */
> +        float32_t *pStateCurnt;                        /* Points to the
> current sample of the state */
> +        float32_t *px, *pb;                            /* Temporary
> pointers for state and coefficient buffers */
> +        float32_t mu = S->mu;                          /* Adaptive factor
> */
> +        float32_t acc, e;                              /* Accumulator,
> error */
> +        float32_t w;                                   /* Weight factor */
> +        uint32_t numTaps = S->numTaps;                 /* Number of
> filter coefficients in the filter */
> +        uint32_t tapCnt, blkCnt;                       /* Loop counters */
> +
> +  /* Initializations of error,  difference, Coefficient update */
> +  e = 0.0f;
> +  w = 0.0f;
> +
> +  /* S->pState points to state array which contains previous frame
> (numTaps - 1) samples */
> +  /* pStateCurnt points to the location where the new input data should
> be written */
> +  pStateCurnt = &(S->pState[(numTaps - 1U)]);
> +
> +  /* initialise loop count */
> +  blkCnt = blockSize;
> +
> +  while (blkCnt > 0U)
> +  {
> +    /* Copy the new input sample into the state buffer */
> +    *pStateCurnt++ = *pSrc++;
> +
> +    /* Initialize pState pointer */
> +    px = pState;
> +
> +    /* Initialize coefficient pointer */
> +    pb = pCoeffs;
> +
> +    /* Set the accumulator to zero */
> +    acc = 0.0f;
> +    uint32_t vblkCnt = numTaps;                               /* Loop
> counter */
> +    size_t l;
> +    vfloat32m8_t vx, vy;
> +    vfloat32m1_t temp00m1;
> +    l = __riscv_vsetvl_e32m1(1);
> +    temp00m1 = __riscv_vfmv_v_f_f32m1(0, l);
> +    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
> +      vx = __riscv_vle32_v_f32m8(px, l);
> +      px += l;
> +      vy = __riscv_vle32_v_f32m8(pb, l);
> +      pb += l;
> +      temp00m1 =
> __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l),
> temp00m1, l);
> +    }
> +    acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1);
> +
> +    while (tapCnt > 0U)
> +    {
> +      /* Perform the multiply-accumulate */
> +      acc += (*px++) * (*pb++);
> +
> +      /* Decrement the loop counter */
> +      tapCnt--;
> +    }
> +    /* Store the result from accumulator into the destination buffer. */
> +    *pOut++ = acc;
> +
> +    /* Compute and store error */
> +    e = (float32_t) *pRef++ - acc;
> +    *pErr++ = e;
> +
> +    /* Calculation of Weighting factor for updating filter coefficients */
> +    w = e * mu;
> +
> +    /* Initialize pState pointer */
> +    /* Advance state pointer by 1 for the next sample */
> +    px = pState++;
> +
> +    /* Initialize coefficient pointer */
> +    pb = pCoeffs;
> +
> +    vblkCnt = numTaps;
> +    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
> +      vx = __riscv_vle32_v_f32m8(px, l);
> +      px += l;
> +      __riscv_vse32_v_f32m8(pb,
> __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l),
> __riscv_vle32_v_f32m8(pb, l), l) , l);
> +      pb += l;
> +    }
> +    while (tapCnt > 0U)
> +    {
> +      /* Perform the multiply-accumulate */
> +      *pb += w * (*px++);
> +      pb++;
> +
> +      /* Decrement loop counter */
> +      tapCnt--;
> +    }
> +    /* Decrement loop counter */
> +    blkCnt--;
> +  }
> +
> +  /* Processing is complete.
> +     Now copy the last numTaps - 1 samples to the start of the state
> buffer.
> +     This prepares the state buffer for the next function call. */
> +
> +  /* Points to the start of the pState buffer */
> +  pStateCurnt = S->pState;
> +
> +  /* copy data */
> +
> +    uint32_t vblkCnt = (numTaps - 1U);                               /*
> Loop counter */
> +    size_t l;
> +    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
> +      __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l)
> , l);
> +      pState += l;
> +      pStateCurnt += l;
> +    }
> +
> +
> +  /* Loop unrolling: Compute 4 taps at a time. */
> +  tapCnt = (numTaps - 1U) >> 2U;
> +
> +  while (tapCnt > 0U)
> +  {
> +    *pStateCurnt++ = *pState++;
> +    *pStateCurnt++ = *pState++;
> +    *pStateCurnt++ = *pState++;
> +    *pStateCurnt++ = *pState++;
> +
> +    /* Decrement loop counter */
> +    tapCnt--;
> +  }
> +
> +  /* Loop unrolling: Compute remaining taps */
> +  tapCnt = (numTaps - 1U) & 0x3U;
> +
> +
> +
> +  /* Initialize tapCnt with number of samples */
> +  tapCnt = (numTaps - 1U);
> +
> +
> +
> +  while (tapCnt > 0U)
> +  {
> +    *pStateCurnt++ = *pState++;
> +
> +    /* Decrement loop counter */
> +    tapCnt--;
> +  }
> +}
> --
> 2.36.3
>
>
  
Li, Pan2 Oct. 23, 2023, 10:01 a.m. UTC | #2
Committed, thanks Kito.

Pan

From: Kito Cheng <kito.cheng@gmail.com>
Sent: Monday, October 23, 2023 5:50 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; Kito Cheng <kito.cheng@sifive.com>; Jeff Law <jeffreyalaw@gmail.com>; Robin Dapp <rdapp.gcc@gmail.com>
Subject: Re: [PATCH V2] RISC-V: Fix ICE for the fusion case from vsetvl to scalar move[PR111927]

LGTM

Juzhe-Zhong <juzhe.zhong@rivai.ai<mailto:juzhe.zhong@rivai.ai>> 於 2023年10月23日 週一 17:41 寫道:
ICE:

during RTL pass: vsetvl
<source>: In function 'riscv_lms_f32':
<source>:240:1: internal compiler error: in merge, at config/riscv/riscv-vsetvl.cc:1997
  240 | }

In general compatible_p (avl_equal_p) has:

    if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
      return false;

Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions.

It is reasonable to add it into 'can_use_next_avl_p' since we don't want to
fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL.
And after the fusion, we will alway use compatible_p to check whether the demand
is correct or not.

        PR target/111927

gcc/ChangeLog:

        * config/riscv/riscv-vsetvl.cc: Fix bug.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/vsetvl/pr111927.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc              |  23 +++
 .../gcc.target/riscv/rvv/vsetvl/pr111927.c    | 170 ++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 47b459fddd4..f3922a051c5 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -1541,6 +1541,29 @@ private:
   inline bool can_use_next_avl_p (const vsetvl_info &prev,
                                  const vsetvl_info &next)
   {
+    /* Forbid the AVL/VL propagation if VL of NEXT is used
+       by non-RVV instructions.  This is because:
+
+        bb 2:
+          PREV: scalar move (no AVL)
+        bb 3:
+          NEXT: vsetvl a5(VL), a4(AVL) ...
+          branch a5,zero
+
+       Since user vsetvl instruction is no side effect instruction
+       which should be placed in the correct and optimal location
+       of the program by the previous PASS, it is unreasonable that
+       VSETVL PASS tries to move it to another places if it used by
+       non-RVV instructions.
+
+       Note: We only forbid the cases that VL is used by the following
+       non-RVV instructions which will cause issues.  We don't forbid
+       other cases since it won't cause correctness issues and we still
+       more demand info are fused backward.  The later LCM algorithm
+       should know the optimal location of the vsetvl.  */
+    if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
+      return false;
+
     if (!next.has_nonvlmax_reg_avl () && !next.has_vl ())
       return true;

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
new file mode 100644
index 00000000000..ab599add57f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
@@ -0,0 +1,170 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+#define RISCV_MATH_LOOPUNROLL
+#define RISCV_MATH_VECTOR
+typedef  float float32_t;
+
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          float32_t *pState;   /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;  /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;        /**< step size that controls filter coefficient updates. */
+  } riscv_lms_instance_f32;
+
+
+void riscv_lms_f32(
+  const riscv_lms_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize)
+{
+        float32_t *pState = S->pState;                 /* State pointer */
+        float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+        float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
+        float32_t mu = S->mu;                          /* Adaptive factor */
+        float32_t acc, e;                              /* Accumulator, error */
+        float32_t w;                                   /* Weight factor */
+        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+
+  /* Initializations of error,  difference, Coefficient update */
+  e = 0.0f;
+  w = 0.0f;
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  /* initialise loop count */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize coefficient pointer */
+    pb = pCoeffs;
+
+    /* Set the accumulator to zero */
+    acc = 0.0f;
+    uint32_t vblkCnt = numTaps;                               /* Loop counter */
+    size_t l;
+    vfloat32m8_t vx, vy;
+    vfloat32m1_t temp00m1;
+    l = __riscv_vsetvl_e32m1(1);
+    temp00m1 = __riscv_vfmv_v_f_f32m1(0, l);
+    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+      vx = __riscv_vle32_v_f32m8(px, l);
+      px += l;
+      vy = __riscv_vle32_v_f32m8(pb, l);
+      pb += l;
+      temp00m1 = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l), temp00m1, l);
+    }
+    acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1);
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += (*px++) * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = acc;
+
+    /* Compute and store error */
+    e = (float32_t) *pRef++ - acc;
+    *pErr++ = e;
+
+    /* Calculation of Weighting factor for updating filter coefficients */
+    w = e * mu;
+
+    /* Initialize pState pointer */
+    /* Advance state pointer by 1 for the next sample */
+    px = pState++;
+
+    /* Initialize coefficient pointer */
+    pb = pCoeffs;
+
+    vblkCnt = numTaps;
+    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+      vx = __riscv_vle32_v_f32m8(px, l);
+      px += l;
+      __riscv_vse32_v_f32m8(pb, __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l), __riscv_vle32_v_f32m8(pb, l), l) , l);
+      pb += l;
+    }
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      *pb += w * (*px++);
+      pb++;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /* copy data */
+
+    uint32_t vblkCnt = (numTaps - 1U);                               /* Loop counter */
+    size_t l;
+    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+      __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l) , l);
+      pState += l;
+      pStateCurnt += l;
+    }
+
+
+  /* Loop unrolling: Compute 4 taps at a time. */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining taps */
+  tapCnt = (numTaps - 1U) & 0x3U;
+
+
+
+  /* Initialize tapCnt with number of samples */
+  tapCnt = (numTaps - 1U);
+
+
+
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+}
--
2.36.3
  

Patch

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 47b459fddd4..f3922a051c5 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -1541,6 +1541,29 @@  private:
   inline bool can_use_next_avl_p (const vsetvl_info &prev,
 				  const vsetvl_info &next)
   {
+    /* Forbid the AVL/VL propagation if VL of NEXT is used
+       by non-RVV instructions.  This is because:
+
+	 bb 2:
+	   PREV: scalar move (no AVL)
+	 bb 3:
+	   NEXT: vsetvl a5(VL), a4(AVL) ...
+	   branch a5,zero
+
+       Since a user vsetvl instruction has no side effects and
+       should already be placed in the correct and optimal location
+       of the program by the previous passes, it is unreasonable for
+       the VSETVL pass to move it to another place if its VL is used
+       by non-RVV instructions.
+
+       Note: We only forbid the cases where VL is used by following
+       non-RVV instructions, which would cause issues.  We don't forbid
+       other cases since they won't cause correctness issues and we still
+       want more demand info to be fused backward.  The later LCM
+       algorithm should know the optimal location of the vsetvl.  */
+    if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
+      return false;
+
     if (!next.has_nonvlmax_reg_avl () && !next.has_vl ())
       return true;
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
new file mode 100644
index 00000000000..ab599add57f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
@@ -0,0 +1,170 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+#define RISCV_MATH_LOOPUNROLL
+#define RISCV_MATH_VECTOR
+typedef  float float32_t;
+
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          float32_t *pState;   /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;  /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;        /**< step size that controls filter coefficient updates. */
+  } riscv_lms_instance_f32;
+
+
+void riscv_lms_f32(
+  const riscv_lms_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize)
+{
+        float32_t *pState = S->pState;                 /* State pointer */
+        float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+        float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
+        float32_t mu = S->mu;                          /* Adaptive factor */
+        float32_t acc, e;                              /* Accumulator, error */
+        float32_t w;                                   /* Weight factor */
+        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+
+  /* Initializations of error,  difference, Coefficient update */
+  e = 0.0f;
+  w = 0.0f;
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  /* initialise loop count */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize coefficient pointer */
+    pb = pCoeffs;
+
+    /* Set the accumulator to zero */
+    acc = 0.0f;
+    uint32_t vblkCnt = numTaps;                               /* Loop counter */
+    size_t l;
+    vfloat32m8_t vx, vy;
+    vfloat32m1_t temp00m1;
+    l = __riscv_vsetvl_e32m1(1);
+    temp00m1 = __riscv_vfmv_v_f_f32m1(0, l);
+    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+      vx = __riscv_vle32_v_f32m8(px, l);
+      px += l;
+      vy = __riscv_vle32_v_f32m8(pb, l);
+      pb += l;
+      temp00m1 = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l), temp00m1, l);
+    }
+    acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1);
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += (*px++) * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = acc;
+
+    /* Compute and store error */
+    e = (float32_t) *pRef++ - acc;
+    *pErr++ = e;
+
+    /* Calculation of Weighting factor for updating filter coefficients */
+    w = e * mu;
+
+    /* Initialize pState pointer */
+    /* Advance state pointer by 1 for the next sample */
+    px = pState++;
+
+    /* Initialize coefficient pointer */
+    pb = pCoeffs;
+
+    vblkCnt = numTaps;
+    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+      vx = __riscv_vle32_v_f32m8(px, l);
+      px += l;
+      __riscv_vse32_v_f32m8(pb, __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l), __riscv_vle32_v_f32m8(pb, l), l) , l);
+      pb += l;
+    }
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      *pb += w * (*px++);
+      pb++;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /* copy data */
+
+    uint32_t vblkCnt = (numTaps - 1U);                               /* Loop counter */
+    size_t l;
+    for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+      __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l) , l);
+      pState += l;
+      pStateCurnt += l;
+    }
+
+
+  /* Loop unrolling: Compute 4 taps at a time. */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining taps */
+  tapCnt = (numTaps - 1U) & 0x3U;
+
+
+
+  /* Initialize tapCnt with number of samples */
+  tapCnt = (numTaps - 1U);
+
+
+
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+}