LoongArch: Micro-optimize sc_save_fcc and sc_restore_fcc for LA464

Message ID 20231214130206.21219-1-xry111@xry111.site
State New
Headers
Series LoongArch: Micro-optimize sc_save_fcc and sc_restore_fcc for LA464 |

Commit Message

Xi Ruoyao Dec. 14, 2023, 1:02 p.m. UTC
  On LA464 movcf2gr is 7 times slower than movcf2fr + movfr2gr, and
movgr2cf is 15 times (!) slower than movgr2fr + movfr2cf.

On LA664 movcf2fr + movfr2gr has a similar performance with movcf2gr,
and movgr2fr + movfr2cf has a similar performance with movgr2cf.

To use FP registers in sc_save_fcc and sc_restore_fcc we need to save
FP/LSX/LASX registers before sc_save_fcc, and restore FP/LSX/LASX
registers after sc_restore_fcc.

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
---
 arch/loongarch/kernel/fpu.S | 94 +++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 40 deletions(-)
  

Comments

Huacai Chen Dec. 14, 2023, 1:10 p.m. UTC | #1
Emmm, I want to keep the code simpler. :)

Huacai

On Thu, Dec 14, 2023 at 9:02 PM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On LA464 movcf2gr is 7 times slower than movcf2fr + movfr2gr, and
> movgr2cf is 15 times (!) slower than movgr2fr + movfr2cf.
>
> On LA664 movcf2fr + movfr2gr has a similar performance with movcf2gr,
> and movgr2fr + movfr2cf has a similar performance with movgr2cf.
>
> To use FP registers in sc_save_fcc and sc_restore_fcc we need to save
> FP/LSX/LASX registers before sc_save_fcc, and restore FP/LSX/LASX
> registers after sc_restore_fcc.
>
> Signed-off-by: Xi Ruoyao <xry111@xry111.site>
> ---
>  arch/loongarch/kernel/fpu.S | 94 +++++++++++++++++++++----------------
>  1 file changed, 54 insertions(+), 40 deletions(-)
>
> diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
> index d53ab10f4644..ecb127f9a673 100644
> --- a/arch/loongarch/kernel/fpu.S
> +++ b/arch/loongarch/kernel/fpu.S
> @@ -96,43 +96,57 @@
>         .endm
>
>         .macro sc_save_fcc base, tmp0, tmp1
> -       movcf2gr        \tmp0, $fcc0
> -       move            \tmp1, \tmp0
> -       movcf2gr        \tmp0, $fcc1
> -       bstrins.d       \tmp1, \tmp0, 15, 8
> -       movcf2gr        \tmp0, $fcc2
> -       bstrins.d       \tmp1, \tmp0, 23, 16
> -       movcf2gr        \tmp0, $fcc3
> -       bstrins.d       \tmp1, \tmp0, 31, 24
> -       movcf2gr        \tmp0, $fcc4
> -       bstrins.d       \tmp1, \tmp0, 39, 32
> -       movcf2gr        \tmp0, $fcc5
> -       bstrins.d       \tmp1, \tmp0, 47, 40
> -       movcf2gr        \tmp0, $fcc6
> -       bstrins.d       \tmp1, \tmp0, 55, 48
> -       movcf2gr        \tmp0, $fcc7
> -       bstrins.d       \tmp1, \tmp0, 63, 56
> -       EX      st.d    \tmp1, \base, 0
> +       movcf2fr        ft0, $fcc0
> +       movcf2fr        ft1, $fcc1
> +       movfr2gr.s      \tmp0, ft0
> +       movfr2gr.s      \tmp1, ft1
> +       EX  st.b        \tmp0, \base, 0
> +       EX  st.b        \tmp1, \base, 1
> +       movcf2fr        ft0, $fcc2
> +       movcf2fr        ft1, $fcc3
> +       movfr2gr.s      \tmp0, ft0
> +       movfr2gr.s      \tmp1, ft1
> +       EX  st.b        \tmp0, \base, 2
> +       EX  st.b        \tmp1, \base, 3
> +       movcf2fr        ft0, $fcc4
> +       movcf2fr        ft1, $fcc5
> +       movfr2gr.s      \tmp0, ft0
> +       movfr2gr.s      \tmp1, ft1
> +       EX  st.b        \tmp0, \base, 4
> +       EX  st.b        \tmp1, \base, 5
> +       movcf2fr        ft0, $fcc6
> +       movcf2fr        ft1, $fcc7
> +       movfr2gr.s      \tmp0, ft0
> +       movfr2gr.s      \tmp1, ft1
> +       EX  st.b        \tmp0, \base, 6
> +       EX  st.b        \tmp1, \base, 7
>         .endm
>
>         .macro sc_restore_fcc base, tmp0, tmp1
> -       EX      ld.d    \tmp0, \base, 0
> -       bstrpick.d      \tmp1, \tmp0, 7, 0
> -       movgr2cf        $fcc0, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 15, 8
> -       movgr2cf        $fcc1, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 23, 16
> -       movgr2cf        $fcc2, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 31, 24
> -       movgr2cf        $fcc3, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 39, 32
> -       movgr2cf        $fcc4, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 47, 40
> -       movgr2cf        $fcc5, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 55, 48
> -       movgr2cf        $fcc6, \tmp1
> -       bstrpick.d      \tmp1, \tmp0, 63, 56
> -       movgr2cf        $fcc7, \tmp1
> +       EX      ld.b    \tmp0, \base, 0
> +       EX      ld.b    \tmp1, \base, 1
> +       movgr2fr.w      ft0, \tmp0
> +       movgr2fr.w      ft1, \tmp1
> +       movfr2cf        $fcc0, ft0
> +       movfr2cf        $fcc1, ft1
> +       EX      ld.b    \tmp0, \base, 2
> +       EX      ld.b    \tmp1, \base, 3
> +       movgr2fr.w      ft0, \tmp0
> +       movgr2fr.w      ft1, \tmp1
> +       movfr2cf        $fcc2, ft0
> +       movfr2cf        $fcc3, ft1
> +       EX      ld.b    \tmp0, \base, 4
> +       EX      ld.b    \tmp1, \base, 5
> +       movgr2fr.w      ft0, \tmp0
> +       movgr2fr.w      ft1, \tmp1
> +       movfr2cf        $fcc4, ft0
> +       movfr2cf        $fcc5, ft1
> +       EX      ld.b    \tmp0, \base, 6
> +       EX      ld.b    \tmp1, \base, 7
> +       movgr2fr.w      ft0, \tmp0
> +       movgr2fr.w      ft1, \tmp1
> +       movfr2cf        $fcc6, ft0
> +       movfr2cf        $fcc7, ft1
>         .endm
>
>         .macro sc_save_fcsr base, tmp0
> @@ -449,9 +463,9 @@ SYM_FUNC_END(_init_fpu)
>   * a2: fcsr
>   */
>  SYM_FUNC_START(_save_fp_context)
> -       sc_save_fcc     a1 t1 t2
>         sc_save_fcsr    a2 t1
>         sc_save_fp      a0
> +       sc_save_fcc     a1 t1 t2
>         li.w            a0, 0                           # success
>         jr              ra
>  SYM_FUNC_END(_save_fp_context)
> @@ -462,8 +476,8 @@ SYM_FUNC_END(_save_fp_context)
>   * a2: fcsr
>   */
>  SYM_FUNC_START(_restore_fp_context)
> -       sc_restore_fp   a0
>         sc_restore_fcc  a1 t1 t2
> +       sc_restore_fp   a0
>         sc_restore_fcsr a2 t1
>         li.w            a0, 0                           # success
>         jr              ra
> @@ -475,9 +489,9 @@ SYM_FUNC_END(_restore_fp_context)
>   * a2: fcsr
>   */
>  SYM_FUNC_START(_save_lsx_context)
> -       sc_save_fcc a1, t0, t1
>         sc_save_fcsr a2, t0
>         sc_save_lsx a0
> +       sc_save_fcc a1, t0, t1
>         li.w    a0, 0                                   # success
>         jr      ra
>  SYM_FUNC_END(_save_lsx_context)
> @@ -488,8 +502,8 @@ SYM_FUNC_END(_save_lsx_context)
>   * a2: fcsr
>   */
>  SYM_FUNC_START(_restore_lsx_context)
> -       sc_restore_lsx a0
>         sc_restore_fcc a1, t1, t2
> +       sc_restore_lsx a0
>         sc_restore_fcsr a2, t1
>         li.w    a0, 0                                   # success
>         jr      ra
> @@ -501,9 +515,9 @@ SYM_FUNC_END(_restore_lsx_context)
>   * a2: fcsr
>   */
>  SYM_FUNC_START(_save_lasx_context)
> -       sc_save_fcc a1, t0, t1
>         sc_save_fcsr a2, t0
>         sc_save_lasx a0
> +       sc_save_fcc a1, t0, t1
>         li.w    a0, 0                                   # success
>         jr      ra
>  SYM_FUNC_END(_save_lasx_context)
> @@ -514,8 +528,8 @@ SYM_FUNC_END(_save_lasx_context)
>   * a2: fcsr
>   */
>  SYM_FUNC_START(_restore_lasx_context)
> -       sc_restore_lasx a0
>         sc_restore_fcc a1, t1, t2
> +       sc_restore_lasx a0
>         sc_restore_fcsr a2, t1
>         li.w    a0, 0                                   # success
>         jr      ra
> --
> 2.43.0
>
  

Patch

diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index d53ab10f4644..ecb127f9a673 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -96,43 +96,57 @@ 
 	.endm
 
 	.macro sc_save_fcc base, tmp0, tmp1
-	movcf2gr	\tmp0, $fcc0
-	move		\tmp1, \tmp0
-	movcf2gr	\tmp0, $fcc1
-	bstrins.d	\tmp1, \tmp0, 15, 8
-	movcf2gr	\tmp0, $fcc2
-	bstrins.d	\tmp1, \tmp0, 23, 16
-	movcf2gr	\tmp0, $fcc3
-	bstrins.d	\tmp1, \tmp0, 31, 24
-	movcf2gr	\tmp0, $fcc4
-	bstrins.d	\tmp1, \tmp0, 39, 32
-	movcf2gr	\tmp0, $fcc5
-	bstrins.d	\tmp1, \tmp0, 47, 40
-	movcf2gr	\tmp0, $fcc6
-	bstrins.d	\tmp1, \tmp0, 55, 48
-	movcf2gr	\tmp0, $fcc7
-	bstrins.d	\tmp1, \tmp0, 63, 56
-	EX	st.d	\tmp1, \base, 0
+	movcf2fr	ft0, $fcc0
+	movcf2fr	ft1, $fcc1
+	movfr2gr.s	\tmp0, ft0
+	movfr2gr.s	\tmp1, ft1
+	EX  st.b	\tmp0, \base, 0
+	EX  st.b	\tmp1, \base, 1
+	movcf2fr	ft0, $fcc2
+	movcf2fr	ft1, $fcc3
+	movfr2gr.s	\tmp0, ft0
+	movfr2gr.s	\tmp1, ft1
+	EX  st.b	\tmp0, \base, 2
+	EX  st.b	\tmp1, \base, 3
+	movcf2fr	ft0, $fcc4
+	movcf2fr	ft1, $fcc5
+	movfr2gr.s	\tmp0, ft0
+	movfr2gr.s	\tmp1, ft1
+	EX  st.b	\tmp0, \base, 4
+	EX  st.b	\tmp1, \base, 5
+	movcf2fr	ft0, $fcc6
+	movcf2fr	ft1, $fcc7
+	movfr2gr.s	\tmp0, ft0
+	movfr2gr.s	\tmp1, ft1
+	EX  st.b	\tmp0, \base, 6
+	EX  st.b	\tmp1, \base, 7
 	.endm
 
 	.macro sc_restore_fcc base, tmp0, tmp1
-	EX	ld.d	\tmp0, \base, 0
-	bstrpick.d	\tmp1, \tmp0, 7, 0
-	movgr2cf	$fcc0, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 15, 8
-	movgr2cf	$fcc1, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 23, 16
-	movgr2cf	$fcc2, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 31, 24
-	movgr2cf	$fcc3, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 39, 32
-	movgr2cf	$fcc4, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 47, 40
-	movgr2cf	$fcc5, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 55, 48
-	movgr2cf	$fcc6, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 63, 56
-	movgr2cf	$fcc7, \tmp1
+	EX	ld.b	\tmp0, \base, 0
+	EX	ld.b	\tmp1, \base, 1
+	movgr2fr.w	ft0, \tmp0
+	movgr2fr.w	ft1, \tmp1
+	movfr2cf	$fcc0, ft0
+	movfr2cf	$fcc1, ft1
+	EX	ld.b	\tmp0, \base, 2
+	EX	ld.b	\tmp1, \base, 3
+	movgr2fr.w	ft0, \tmp0
+	movgr2fr.w	ft1, \tmp1
+	movfr2cf	$fcc2, ft0
+	movfr2cf	$fcc3, ft1
+	EX	ld.b	\tmp0, \base, 4
+	EX	ld.b	\tmp1, \base, 5
+	movgr2fr.w	ft0, \tmp0
+	movgr2fr.w	ft1, \tmp1
+	movfr2cf	$fcc4, ft0
+	movfr2cf	$fcc5, ft1
+	EX	ld.b	\tmp0, \base, 6
+	EX	ld.b	\tmp1, \base, 7
+	movgr2fr.w	ft0, \tmp0
+	movgr2fr.w	ft1, \tmp1
+	movfr2cf	$fcc6, ft0
+	movfr2cf	$fcc7, ft1
 	.endm
 
 	.macro sc_save_fcsr base, tmp0
@@ -449,9 +463,9 @@  SYM_FUNC_END(_init_fpu)
  * a2: fcsr
  */
 SYM_FUNC_START(_save_fp_context)
-	sc_save_fcc	a1 t1 t2
 	sc_save_fcsr	a2 t1
 	sc_save_fp	a0
+	sc_save_fcc	a1 t1 t2
 	li.w		a0, 0				# success
 	jr		ra
 SYM_FUNC_END(_save_fp_context)
@@ -462,8 +476,8 @@  SYM_FUNC_END(_save_fp_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_restore_fp_context)
-	sc_restore_fp	a0
 	sc_restore_fcc	a1 t1 t2
+	sc_restore_fp	a0
 	sc_restore_fcsr	a2 t1
 	li.w		a0, 0				# success
 	jr		ra
@@ -475,9 +489,9 @@  SYM_FUNC_END(_restore_fp_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_save_lsx_context)
-	sc_save_fcc a1, t0, t1
 	sc_save_fcsr a2, t0
 	sc_save_lsx a0
+	sc_save_fcc a1, t0, t1
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_save_lsx_context)
@@ -488,8 +502,8 @@  SYM_FUNC_END(_save_lsx_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_restore_lsx_context)
-	sc_restore_lsx a0
 	sc_restore_fcc a1, t1, t2
+	sc_restore_lsx a0
 	sc_restore_fcsr a2, t1
 	li.w	a0, 0					# success
 	jr	ra
@@ -501,9 +515,9 @@  SYM_FUNC_END(_restore_lsx_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_save_lasx_context)
-	sc_save_fcc a1, t0, t1
 	sc_save_fcsr a2, t0
 	sc_save_lasx a0
+	sc_save_fcc a1, t0, t1
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_save_lasx_context)
@@ -514,8 +528,8 @@  SYM_FUNC_END(_save_lasx_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_restore_lasx_context)
-	sc_restore_lasx a0
 	sc_restore_fcc a1, t1, t2
+	sc_restore_lasx a0
 	sc_restore_fcsr a2, t1
 	li.w	a0, 0					# success
 	jr	ra