[RESEND] crypto: x86 - exit fpu context earlier in ECB/CBC macros

Message ID 20230121183450.14570-1-peter@n8pjl.ca
State New
Headers
Series [RESEND] crypto: x86 - exit fpu context earlier in ECB/CBC macros |

Commit Message

Peter Lafreniere Jan. 21, 2023, 6:34 p.m. UTC
  Currently the ecb/cbc macros hold fpu context unnecessarily when using
scalar cipher routines (e.g. when handling odd sizes of blocks per walk).

Change the macros to drop fpu context as soon as the fpu is out of use.

No performance impact found (on Intel Haswell).

Signed-off-by: Peter Lafreniere <peter@n8pjl.ca>
---
 arch/x86/crypto/ecb_cbc_helpers.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)
  

Comments

Ard Biesheuvel Jan. 31, 2023, 8:30 a.m. UTC | #1
On Sat, 21 Jan 2023 at 19:40, Peter Lafreniere <peter@n8pjl.ca> wrote:
>
> Currently the ecb/cbc macros hold fpu context unnecessarily when using
> scalar cipher routines (e.g. when handling odd sizes of blocks per walk).
>
> Change the macros to drop fpu context as soon as the fpu is out of use.
>
> No performance impact found (on Intel Haswell).
>
> Signed-off-by: Peter Lafreniere <peter@n8pjl.ca>

The patch seems correct to me, so

Acked-by: Ard Biesheuvel <ardb@kernel.org>

although I seriously doubt whether anyone would ever notice the
difference, given that the algorithms that use these macros are
primarily used in legacy block encryption scenarios, where the data is
always presented in multiples of the largest block size.


> ---
>  arch/x86/crypto/ecb_cbc_helpers.h | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
> index eaa15c7b29d6..11955bd01af1 100644
> --- a/arch/x86/crypto/ecb_cbc_helpers.h
> +++ b/arch/x86/crypto/ecb_cbc_helpers.h
> @@ -13,13 +13,14 @@
>
>  #define ECB_WALK_START(req, bsize, fpu_blocks) do {                    \
>         void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));   \
> +       const int __fpu_blocks = (fpu_blocks);                          \
>         const int __bsize = (bsize);                                    \
>         struct skcipher_walk walk;                                      \
>         int err = skcipher_walk_virt(&walk, (req), false);              \
>         while (walk.nbytes > 0) {                                       \
>                 unsigned int nbytes = walk.nbytes;                      \
> -               bool do_fpu = (fpu_blocks) != -1 &&                     \
> -                             nbytes >= (fpu_blocks) * __bsize;         \
> +               bool do_fpu = __fpu_blocks != -1 &&                     \
> +                             nbytes >= __fpu_blocks * __bsize;         \
>                 const u8 *src = walk.src.virt.addr;                     \
>                 u8 *dst = walk.dst.virt.addr;                           \
>                 u8 __maybe_unused buf[(bsize)];                         \
> @@ -35,7 +36,12 @@
>  } while (0)
>
>  #define ECB_BLOCK(blocks, func) do {                                   \
> -       while (nbytes >= (blocks) * __bsize) {                          \
> +       const int __blocks = (blocks);                                  \
> +       if (do_fpu && __blocks < __fpu_blocks) {                        \
> +               kernel_fpu_end();                                       \
> +               do_fpu = false;                                         \
> +       }                                                               \
> +       while (nbytes >= __blocks * __bsize) {                          \
>                 (func)(ctx, dst, src);                                  \
>                 ECB_WALK_ADVANCE(blocks);                               \
>         }                                                               \
> @@ -53,7 +59,12 @@
>  } while (0)
>
>  #define CBC_DEC_BLOCK(blocks, func) do {                               \
> -       while (nbytes >= (blocks) * __bsize) {                          \
> +       const int __blocks = (blocks);                                  \
> +       if (do_fpu && __blocks <  __fpu_blocks) {                       \
> +               kernel_fpu_end();                                       \
> +               do_fpu = false;                                         \
> +       }                                                               \
> +       while (nbytes >= __blocks * __bsize) {                          \
>                 const u8 *__iv = src + ((blocks) - 1) * __bsize;        \
>                 if (dst == src)                                         \
>                         __iv = memcpy(buf, __iv, __bsize);              \
> --
> 2.39.0
>
  
Herbert Xu Feb. 3, 2023, 5:03 a.m. UTC | #2
Peter Lafreniere <peter@n8pjl.ca> wrote:
> Currently the ecb/cbc macros hold fpu context unnecessarily when using
> scalar cipher routines (e.g. when handling odd sizes of blocks per walk).
> 
> Change the macros to drop fpu context as soon as the fpu is out of use.
> 
> No performance impact found (on Intel Haswell).
> 
> Signed-off-by: Peter Lafreniere <peter@n8pjl.ca>
> ---
> arch/x86/crypto/ecb_cbc_helpers.h | 19 +++++++++++++++----
> 1 file changed, 15 insertions(+), 4 deletions(-)

Patch applied.  Thanks.
  

Patch

diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
index eaa15c7b29d6..11955bd01af1 100644
--- a/arch/x86/crypto/ecb_cbc_helpers.h
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -13,13 +13,14 @@ 
 
 #define ECB_WALK_START(req, bsize, fpu_blocks) do {			\
 	void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));	\
+	const int __fpu_blocks = (fpu_blocks);				\
 	const int __bsize = (bsize);					\
 	struct skcipher_walk walk;					\
 	int err = skcipher_walk_virt(&walk, (req), false);		\
 	while (walk.nbytes > 0) {					\
 		unsigned int nbytes = walk.nbytes;			\
-		bool do_fpu = (fpu_blocks) != -1 &&			\
-			      nbytes >= (fpu_blocks) * __bsize;		\
+		bool do_fpu = __fpu_blocks != -1 &&			\
+			      nbytes >= __fpu_blocks * __bsize;		\
 		const u8 *src = walk.src.virt.addr;			\
 		u8 *dst = walk.dst.virt.addr;				\
 		u8 __maybe_unused buf[(bsize)];				\
@@ -35,7 +36,12 @@ 
 } while (0)
 
 #define ECB_BLOCK(blocks, func) do {					\
-	while (nbytes >= (blocks) * __bsize) {				\
+	const int __blocks = (blocks);					\
+	if (do_fpu && __blocks < __fpu_blocks) {			\
+		kernel_fpu_end();					\
+		do_fpu = false;						\
+	}								\
+	while (nbytes >= __blocks * __bsize) {				\
 		(func)(ctx, dst, src);					\
 		ECB_WALK_ADVANCE(blocks);				\
 	}								\
@@ -53,7 +59,12 @@ 
 } while (0)
 
 #define CBC_DEC_BLOCK(blocks, func) do {				\
-	while (nbytes >= (blocks) * __bsize) {				\
+	const int __blocks = (blocks);					\
+	if (do_fpu && __blocks <  __fpu_blocks) {			\
+		kernel_fpu_end();					\
+		do_fpu = false;						\
+	}								\
+	while (nbytes >= __blocks * __bsize) {				\
 		const u8 *__iv = src + ((blocks) - 1) * __bsize;	\
 		if (dst == src)						\
 			__iv = memcpy(buf, __iv, __bsize);		\