[V2] RISC-V: Fix ICE for the fusion case from vsetvl to scalar move[PR111927]
Checks
Commit Message
ICE:
during RTL pass: vsetvl
<source>: In function 'riscv_lms_f32':
<source>:240:1: internal compiler error: in merge, at config/riscv/riscv-vsetvl.cc:1997
240 | }
In general compatible_p (avl_equal_p) has:
if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
return false;
Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions.
It is reasonable to add it into 'can_use_next_avl_p' since we don't want to
fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL.
And after the fusion, we will always use compatible_p to check whether the demand
is correct or not.
PR target/111927
gcc/ChangeLog:
* config/riscv/riscv-vsetvl.cc (can_use_next_avl_p): Forbid AVL/VL propagation when the VL of NEXT is used by non-RVV instructions.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/vsetvl/pr111927.c: New test.
---
gcc/config/riscv/riscv-vsetvl.cc | 23 +++
.../gcc.target/riscv/rvv/vsetvl/pr111927.c | 170 ++++++++++++++++++
2 files changed, 193 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
Comments
LGTM
Juzhe-Zhong <juzhe.zhong@rivai.ai> 於 2023年10月23日 週一 17:41 寫道:
> ICE:
>
> during RTL pass: vsetvl
> <source>: In function 'riscv_lms_f32':
> <source>:240:1: internal compiler error: in merge, at
> config/riscv/riscv-vsetvl.cc:1997
> 240 | }
>
> In general compatible_p (avl_equal_p) has:
>
> if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
> return false;
>
> Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions.
>
> It is reasonable to add it into 'can_use_next_avl_p' since we don't want to
> fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL.
> And after the fusion, we will always use compatible_p to check whether the
> demand
> is correct or not.
>
> PR target/111927
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vsetvl.cc: Fix bug.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/pr111927.c: New test.
>
> ---
> gcc/config/riscv/riscv-vsetvl.cc | 23 +++
> .../gcc.target/riscv/rvv/vsetvl/pr111927.c | 170 ++++++++++++++++++
> 2 files changed, 193 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc
> b/gcc/config/riscv/riscv-vsetvl.cc
> index 47b459fddd4..f3922a051c5 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -1541,6 +1541,29 @@ private:
> inline bool can_use_next_avl_p (const vsetvl_info &prev,
> const vsetvl_info &next)
> {
> + /* Forbid the AVL/VL propagation if VL of NEXT is used
> + by non-RVV instructions. This is because:
> +
> + bb 2:
> + PREV: scalar move (no AVL)
> + bb 3:
> + NEXT: vsetvl a5(VL), a4(AVL) ...
> + branch a5,zero
> +
> + Since a user vsetvl instruction is a no-side-effect instruction
> + which should already be placed in the correct and optimal location
> + of the program by the previous passes, it is unreasonable for the
> + VSETVL pass to move it to another place if its VL is used by
> + non-RVV instructions.
> +
> + Note: We only forbid the cases where VL is used by the following
> + non-RVV instructions, which would cause issues. We don't forbid
> + other cases since they won't cause correctness issues and we still
> + want more demand info to be fused backward. The later LCM algorithm
> + should know the optimal location of the vsetvl. */
> + if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
> + return false;
> +
> if (!next.has_nonvlmax_reg_avl () && !next.has_vl ())
> return true;
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
> b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
> new file mode 100644
> index 00000000000..ab599add57f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
> @@ -0,0 +1,170 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +#define RISCV_MATH_LOOPUNROLL
> +#define RISCV_MATH_VECTOR
> +typedef float float32_t;
> +
> + typedef struct
> + {
> + uint16_t numTaps; /**< number of coefficients in the filter.
> */
> + float32_t *pState; /**< points to the state variable array.
> The array is of length numTaps+blockSize-1. */
> + float32_t *pCoeffs; /**< points to the coefficient array. The
> array is of length numTaps. */
> + float32_t mu; /**< step size that controls filter
> coefficient updates. */
> + } riscv_lms_instance_f32;
> +
> +
> +void riscv_lms_f32(
> + const riscv_lms_instance_f32 * S,
> + const float32_t * pSrc,
> + float32_t * pRef,
> + float32_t * pOut,
> + float32_t * pErr,
> + uint32_t blockSize)
> +{
> + float32_t *pState = S->pState; /* State pointer */
> + float32_t *pCoeffs = S->pCoeffs; /* Coefficient
> pointer */
> + float32_t *pStateCurnt; /* Points to the
> current sample of the state */
> + float32_t *px, *pb; /* Temporary
> pointers for state and coefficient buffers */
> + float32_t mu = S->mu; /* Adaptive factor
> */
> + float32_t acc, e; /* Accumulator,
> error */
> + float32_t w; /* Weight factor */
> + uint32_t numTaps = S->numTaps; /* Number of
> filter coefficients in the filter */
> + uint32_t tapCnt, blkCnt; /* Loop counters */
> +
> + /* Initializations of error, difference, Coefficient update */
> + e = 0.0f;
> + w = 0.0f;
> +
> + /* S->pState points to state array which contains previous frame
> (numTaps - 1) samples */
> + /* pStateCurnt points to the location where the new input data should
> be written */
> + pStateCurnt = &(S->pState[(numTaps - 1U)]);
> +
> + /* initialise loop count */
> + blkCnt = blockSize;
> +
> + while (blkCnt > 0U)
> + {
> + /* Copy the new input sample into the state buffer */
> + *pStateCurnt++ = *pSrc++;
> +
> + /* Initialize pState pointer */
> + px = pState;
> +
> + /* Initialize coefficient pointer */
> + pb = pCoeffs;
> +
> + /* Set the accumulator to zero */
> + acc = 0.0f;
> + uint32_t vblkCnt = numTaps; /* Loop
> counter */
> + size_t l;
> + vfloat32m8_t vx, vy;
> + vfloat32m1_t temp00m1;
> + l = __riscv_vsetvl_e32m1(1);
> + temp00m1 = __riscv_vfmv_v_f_f32m1(0, l);
> + for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
> + vx = __riscv_vle32_v_f32m8(px, l);
> + px += l;
> + vy = __riscv_vle32_v_f32m8(pb, l);
> + pb += l;
> + temp00m1 =
> __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l),
> temp00m1, l);
> + }
> + acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1);
> +
> + while (tapCnt > 0U)
> + {
> + /* Perform the multiply-accumulate */
> + acc += (*px++) * (*pb++);
> +
> + /* Decrement the loop counter */
> + tapCnt--;
> + }
> + /* Store the result from accumulator into the destination buffer. */
> + *pOut++ = acc;
> +
> + /* Compute and store error */
> + e = (float32_t) *pRef++ - acc;
> + *pErr++ = e;
> +
> + /* Calculation of Weighting factor for updating filter coefficients */
> + w = e * mu;
> +
> + /* Initialize pState pointer */
> + /* Advance state pointer by 1 for the next sample */
> + px = pState++;
> +
> + /* Initialize coefficient pointer */
> + pb = pCoeffs;
> +
> + vblkCnt = numTaps;
> + for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
> + vx = __riscv_vle32_v_f32m8(px, l);
> + px += l;
> + __riscv_vse32_v_f32m8(pb,
> __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l),
> __riscv_vle32_v_f32m8(pb, l), l) , l);
> + pb += l;
> + }
> + while (tapCnt > 0U)
> + {
> + /* Perform the multiply-accumulate */
> + *pb += w * (*px++);
> + pb++;
> +
> + /* Decrement loop counter */
> + tapCnt--;
> + }
> + /* Decrement loop counter */
> + blkCnt--;
> + }
> +
> + /* Processing is complete.
> + Now copy the last numTaps - 1 samples to the start of the state
> buffer.
> + This prepares the state buffer for the next function call. */
> +
> + /* Points to the start of the pState buffer */
> + pStateCurnt = S->pState;
> +
> + /* copy data */
> +
> + uint32_t vblkCnt = (numTaps - 1U); /*
> Loop counter */
> + size_t l;
> + for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
> + __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l)
> , l);
> + pState += l;
> + pStateCurnt += l;
> + }
> +
> +
> + /* Loop unrolling: Compute 4 taps at a time. */
> + tapCnt = (numTaps - 1U) >> 2U;
> +
> + while (tapCnt > 0U)
> + {
> + *pStateCurnt++ = *pState++;
> + *pStateCurnt++ = *pState++;
> + *pStateCurnt++ = *pState++;
> + *pStateCurnt++ = *pState++;
> +
> + /* Decrement loop counter */
> + tapCnt--;
> + }
> +
> + /* Loop unrolling: Compute remaining taps */
> + tapCnt = (numTaps - 1U) & 0x3U;
> +
> +
> +
> + /* Initialize tapCnt with number of samples */
> + tapCnt = (numTaps - 1U);
> +
> +
> +
> + while (tapCnt > 0U)
> + {
> + *pStateCurnt++ = *pState++;
> +
> + /* Decrement loop counter */
> + tapCnt--;
> + }
> +}
> --
> 2.36.3
>
>
Committed, thanks Kito.
Pan
From: Kito Cheng <kito.cheng@gmail.com>
Sent: Monday, October 23, 2023 5:50 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; Kito Cheng <kito.cheng@sifive.com>; Jeff Law <jeffreyalaw@gmail.com>; Robin Dapp <rdapp.gcc@gmail.com>
Subject: Re: [PATCH V2] RISC-V: Fix ICE for the fusion case from vsetvl to scalar move[PR111927]
LGTM
Juzhe-Zhong <juzhe.zhong@rivai.ai<mailto:juzhe.zhong@rivai.ai>> 於 2023年10月23日 週一 17:41 寫道:
ICE:
during RTL pass: vsetvl
<source>: In function 'riscv_lms_f32':
<source>:240:1: internal compiler error: in merge, at config/riscv/riscv-vsetvl.cc:1997
240 | }
In general compatible_p (avl_equal_p) has:
if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
return false;
Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions.
It is reasonable to add it into 'can_use_next_avl_p' since we don't want to
fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL.
And after the fusion, we will always use compatible_p to check whether the demand
is correct or not.
PR target/111927
gcc/ChangeLog:
* config/riscv/riscv-vsetvl.cc: Fix bug.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/vsetvl/pr111927.c: New test.
---
gcc/config/riscv/riscv-vsetvl.cc | 23 +++
.../gcc.target/riscv/rvv/vsetvl/pr111927.c | 170 ++++++++++++++++++
2 files changed, 193 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 47b459fddd4..f3922a051c5 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -1541,6 +1541,29 @@ private:
inline bool can_use_next_avl_p (const vsetvl_info &prev,
const vsetvl_info &next)
{
+ /* Forbid the AVL/VL propagation if VL of NEXT is used
+ by non-RVV instructions. This is because:
+
+ bb 2:
+ PREV: scalar move (no AVL)
+ bb 3:
+ NEXT: vsetvl a5(VL), a4(AVL) ...
+ branch a5,zero
+
+ Since a user vsetvl instruction is a no-side-effect instruction
+ which should already be placed in the correct and optimal location
+ of the program by the previous passes, it is unreasonable for the
+ VSETVL pass to move it to another place if its VL is used by
+ non-RVV instructions.
+
+ Note: We only forbid the cases where VL is used by the following
+ non-RVV instructions, which would cause issues. We don't forbid
+ other cases since they won't cause correctness issues and we still
+ want more demand info to be fused backward. The later LCM algorithm
+ should know the optimal location of the vsetvl. */
+ if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
+ return false;
+
if (!next.has_nonvlmax_reg_avl () && !next.has_vl ())
return true;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
new file mode 100644
index 00000000000..ab599add57f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c
@@ -0,0 +1,170 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+#define RISCV_MATH_LOOPUNROLL
+#define RISCV_MATH_VECTOR
+typedef float float32_t;
+
+ typedef struct
+ {
+ uint16_t numTaps; /**< number of coefficients in the filter. */
+ float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+ float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */
+ float32_t mu; /**< step size that controls filter coefficient updates. */
+ } riscv_lms_instance_f32;
+
+
+void riscv_lms_f32(
+ const riscv_lms_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ float32_t acc, e; /* Accumulator, error */
+ float32_t w; /* Weight factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+ /* Initializations of error, difference, Coefficient update */
+ e = 0.0f;
+ w = 0.0f;
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0.0f;
+ uint32_t vblkCnt = numTaps; /* Loop counter */
+ size_t l;
+ vfloat32m8_t vx, vy;
+ vfloat32m1_t temp00m1;
+ l = __riscv_vsetvl_e32m1(1);
+ temp00m1 = __riscv_vfmv_v_f_f32m1(0, l);
+ for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+ vx = __riscv_vle32_v_f32m8(px, l);
+ px += l;
+ vy = __riscv_vle32_v_f32m8(pb, l);
+ pb += l;
+ temp00m1 = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l), temp00m1, l);
+ }
+ acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1);
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = acc;
+
+ /* Compute and store error */
+ e = (float32_t) *pRef++ - acc;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for updating filter coefficients */
+ w = e * mu;
+
+ /* Initialize pState pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ vblkCnt = numTaps;
+ for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+ vx = __riscv_vle32_v_f32m8(px, l);
+ px += l;
+ __riscv_vse32_v_f32m8(pb, __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l), __riscv_vle32_v_f32m8(pb, l), l) , l);
+ pb += l;
+ }
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+
+ uint32_t vblkCnt = (numTaps - 1U); /* Loop counter */
+ size_t l;
+ for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+ __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l) , l);
+ pState += l;
+ pStateCurnt += l;
+ }
+
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) & 0x3U;
+
+
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+}
--
2.36.3
@@ -1541,6 +1541,29 @@ private:
inline bool can_use_next_avl_p (const vsetvl_info &prev,
const vsetvl_info &next)
{
+ /* Forbid the AVL/VL propagation if VL of NEXT is used
+ by non-RVV instructions. This is because:
+
+ bb 2:
+ PREV: scalar move (no AVL)
+ bb 3:
+ NEXT: vsetvl a5(VL), a4(AVL) ...
+ branch a5,zero
+
+ Since a user vsetvl instruction is a no-side-effect instruction
+ which should already be placed in the correct and optimal location
+ of the program by the previous passes, it is unreasonable for the
+ VSETVL pass to move it to another place if its VL is used by
+ non-RVV instructions.
+
+ Note: We only forbid the cases where VL is used by the following
+ non-RVV instructions, which would cause issues. We don't forbid
+ other cases since they won't cause correctness issues and we still
+ want more demand info to be fused backward. The later LCM algorithm
+ should know the optimal location of the vsetvl. */
+ if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ())
+ return false;
+
if (!next.has_nonvlmax_reg_avl () && !next.has_vl ())
return true;
new file mode 100644
@@ -0,0 +1,170 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+#define RISCV_MATH_LOOPUNROLL
+#define RISCV_MATH_VECTOR
+typedef float float32_t;
+
+ typedef struct
+ {
+ uint16_t numTaps; /**< number of coefficients in the filter. */
+ float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+ float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */
+ float32_t mu; /**< step size that controls filter coefficient updates. */
+ } riscv_lms_instance_f32;
+
+
+void riscv_lms_f32(
+ const riscv_lms_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ float32_t acc, e; /* Accumulator, error */
+ float32_t w; /* Weight factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+ /* Initializations of error, difference, Coefficient update */
+ e = 0.0f;
+ w = 0.0f;
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0.0f;
+ uint32_t vblkCnt = numTaps; /* Loop counter */
+ size_t l;
+ vfloat32m8_t vx, vy;
+ vfloat32m1_t temp00m1;
+ l = __riscv_vsetvl_e32m1(1);
+ temp00m1 = __riscv_vfmv_v_f_f32m1(0, l);
+ for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+ vx = __riscv_vle32_v_f32m8(px, l);
+ px += l;
+ vy = __riscv_vle32_v_f32m8(pb, l);
+ pb += l;
+ temp00m1 = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l), temp00m1, l);
+ }
+ acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1);
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = acc;
+
+ /* Compute and store error */
+ e = (float32_t) *pRef++ - acc;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for updating filter coefficients */
+ w = e * mu;
+
+ /* Initialize pState pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ vblkCnt = numTaps;
+ for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+ vx = __riscv_vle32_v_f32m8(px, l);
+ px += l;
+ __riscv_vse32_v_f32m8(pb, __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l), __riscv_vle32_v_f32m8(pb, l), l) , l);
+ pb += l;
+ }
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+
+ uint32_t vblkCnt = (numTaps - 1U); /* Loop counter */
+ size_t l;
+ for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) {
+ __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l) , l);
+ pState += l;
+ pStateCurnt += l;
+ }
+
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) & 0x3U;
+
+
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+}