[v3,10/17] crypto: x86/*poly* - limit FPU preemption

Message ID 20221103042740.6556-11-elliott@hpe.com
State New
Headers
Series crypt: x86 - fix RCU stalls |

Commit Message

Elliott, Robert (Servers) Nov. 3, 2022, 4:27 a.m. UTC
  Use a static const unsigned int for the limit of the number of bytes
processed between kernel_fpu_begin() and kernel_fpu_end() rather than
using the SZ_4K macro (which is a signed value), or a magic value
of 4096U embedded in the C code.

Use unsigned int rather than size_t for some of the arguments to
avoid typecasting for the min() macro.

Signed-off-by: Robert Elliott <elliott@hpe.com>

---
v3 use static int rather than macro, change to while loops
rather than do/while loops
---
 arch/x86/crypto/nhpoly1305-avx2-glue.c | 11 +++++---
 arch/x86/crypto/nhpoly1305-sse2-glue.c | 11 +++++---
 arch/x86/crypto/poly1305_glue.c        | 37 +++++++++++++++++---------
 arch/x86/crypto/polyval-clmulni_glue.c |  8 ++++--
 4 files changed, 46 insertions(+), 21 deletions(-)
  

Patch

diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index 8ea5ab0f1ca7..f7dc9c563bb5 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -13,6 +13,9 @@ 
 #include <linux/sizes.h>
 #include <asm/simd.h>
 
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 337 * 1024;
+
 asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
 			u8 hash[NH_HASH_BYTES]);
 
@@ -26,18 +29,20 @@  static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
 static int nhpoly1305_avx2_update(struct shash_desc *desc,
 				  const u8 *src, unsigned int srclen)
 {
+	BUILD_BUG_ON(bytes_per_fpu == 0);
+
 	if (srclen < 64 || !crypto_simd_usable())
 		return crypto_nhpoly1305_update(desc, src, srclen);
 
-	do {
-		unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+	while (srclen) {
+		unsigned int n = min(srclen, bytes_per_fpu);
 
 		kernel_fpu_begin();
 		crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
 		kernel_fpu_end();
 		src += n;
 		srclen -= n;
-	} while (srclen);
+	}
 	return 0;
 }
 
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index 2b353d42ed13..daffcc7019ad 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -13,6 +13,9 @@ 
 #include <linux/sizes.h>
 #include <asm/simd.h>
 
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 199 * 1024;
+
 asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 			u8 hash[NH_HASH_BYTES]);
 
@@ -26,18 +29,20 @@  static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 static int nhpoly1305_sse2_update(struct shash_desc *desc,
 				  const u8 *src, unsigned int srclen)
 {
+	BUILD_BUG_ON(bytes_per_fpu == 0);
+
 	if (srclen < 64 || !crypto_simd_usable())
 		return crypto_nhpoly1305_update(desc, src, srclen);
 
-	do {
-		unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+	while (srclen) {
+		unsigned int n = min(srclen, bytes_per_fpu);
 
 		kernel_fpu_begin();
 		crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
 		kernel_fpu_end();
 		src += n;
 		srclen -= n;
-	} while (srclen);
+	}
 	return 0;
 }
 
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 1dfb8af48a3c..16831c036d71 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -15,20 +15,27 @@ 
 #include <asm/intel-family.h>
 #include <asm/simd.h>
 
+#define POLY1305_BLOCK_SIZE_MASK (~(POLY1305_BLOCK_SIZE - 1))
+
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 217 * 1024;
+
 asmlinkage void poly1305_init_x86_64(void *ctx,
 				     const u8 key[POLY1305_BLOCK_SIZE]);
 asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
-				       const size_t len, const u32 padbit);
+				       const unsigned int len,
+				       const u32 padbit);
 asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 				     const u32 nonce[4]);
 asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 				  const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
-				    const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
-				     const u32 padbit);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp,
+				    const unsigned int len, const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
+				     const unsigned int len, const u32 padbit);
 asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
-				       const size_t len, const u32 padbit);
+				       const unsigned int len,
+				       const u32 padbit);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
@@ -86,14 +93,12 @@  static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
 	poly1305_init_x86_64(ctx, key);
 }
 
-static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, unsigned int len,
 				 const u32 padbit)
 {
 	struct poly1305_arch_internal *state = ctx;
 
-	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
-		     SZ_4K % POLY1305_BLOCK_SIZE);
+	BUILD_BUG_ON(bytes_per_fpu < POLY1305_BLOCK_SIZE);
 
 	if (!static_branch_likely(&poly1305_use_avx) ||
 	    (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -103,8 +108,14 @@  static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 		return;
 	}
 
-	do {
-		const size_t bytes = min_t(size_t, len, SZ_4K);
+	while (len) {
+		unsigned int bytes;
+
+		if (len < POLY1305_BLOCK_SIZE)
+			bytes = len;
+		else
+			bytes = min(len,
+				    bytes_per_fpu & POLY1305_BLOCK_SIZE_MASK);
 
 		kernel_fpu_begin();
 		if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
@@ -117,7 +128,7 @@  static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 
 		len -= bytes;
 		inp += bytes;
-	} while (len);
+	}
 }
 
 static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
index b7664d018851..de1c908f7412 100644
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -29,6 +29,9 @@ 
 
 #define NUM_KEY_POWERS	8
 
+/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+static const unsigned int bytes_per_fpu = 393 * 1024;
+
 struct polyval_tfm_ctx {
 	/*
 	 * These powers must be in the order h^8, ..., h^1.
@@ -107,6 +110,8 @@  static int polyval_x86_update(struct shash_desc *desc,
 	unsigned int nblocks;
 	unsigned int n;
 
+	BUILD_BUG_ON(bytes_per_fpu < POLYVAL_BLOCK_SIZE);
+
 	if (dctx->bytes) {
 		n = min(srclen, dctx->bytes);
 		pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
@@ -123,8 +128,7 @@  static int polyval_x86_update(struct shash_desc *desc,
 	}
 
 	while (srclen >= POLYVAL_BLOCK_SIZE) {
-		/* Allow rescheduling every 4K bytes. */
-		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+		nblocks = min(srclen, bytes_per_fpu) / POLYVAL_BLOCK_SIZE;
 		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
 		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
 		src += nblocks * POLYVAL_BLOCK_SIZE;