@@ -41,41 +41,50 @@
#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
-#define PCLMUL_MIN_LEN 64L /* minimum size of buffer
- * for crc32_pclmul_le_16 */
-#define SCALE_F 16L /* size of xmm register */
+#define PCLMUL_MIN_LEN 64U /* minimum size of buffer for crc32_pclmul_le_16 */
+#define SCALE_F 16U /* size of xmm register */
#define SCALE_F_MASK (SCALE_F - 1)
-u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
+asmlinkage u32 crc32_pclmul_le_16(const u8 *buffer, unsigned int len, u32 crc32);
-static u32 __attribute__((pure))
- crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
+static u32 crc32_pclmul_le(u32 crc, const u8 *p, unsigned int len)
{
unsigned int iquotient;
unsigned int iremainder;
- unsigned int prealign;
if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
return crc32_le(crc, p, len);
- if ((long)p & SCALE_F_MASK) {
+ if ((unsigned long)p & SCALE_F_MASK) {
/* align p to 16 byte */
- prealign = SCALE_F - ((long)p & SCALE_F_MASK);
+ unsigned int prealign = SCALE_F - ((unsigned long)p & SCALE_F_MASK);
crc = crc32_le(crc, p, prealign);
len -= prealign;
- p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
- ~SCALE_F_MASK);
+ p += prealign;
}
- iquotient = len & (~SCALE_F_MASK);
+ iquotient = len & ~SCALE_F_MASK;
iremainder = len & SCALE_F_MASK;
- kernel_fpu_begin();
- crc = crc32_pclmul_le_16(p, iquotient, crc);
- kernel_fpu_end();
+ if (iquotient) {
+ kernel_fpu_begin();
+ for (;;) {
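+ /* limit each pass to 4 KiB so the FPU section between kernel_fpu_yield() calls stays short */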
+ const unsigned int chunk = min(iquotient, 4096U);
- if (iremainder)
- crc = crc32_le(crc, p + iquotient, iremainder);
+ crc = crc32_pclmul_le_16(p, chunk, crc);
+ iquotient -= chunk;
+ p += chunk;
+
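+ /* anything below PCLMUL_MIN_LEN is left for the crc32_le() fallback below */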
+ if (iquotient < PCLMUL_MIN_LEN)
+ break;
+
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
+ }
+
+ if (iquotient || iremainder)
+ crc = crc32_le(crc, p, iquotient + iremainder);
return crc;
}
@@ -120,8 +129,7 @@ static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
}
/* No final XOR 0xFFFFFFFF, like crc32_le */
-static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out)
{
*(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
return 0;
@@ -144,8 +152,7 @@ static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
- out);
+ return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, out);
}
static struct shash_alg alg = {
@@ -35,19 +35,24 @@
#ifdef CONFIG_X86_64
/*
- * use carryless multiply version of crc32c when buffer
- * size is >= 512 to account
- * for fpu state save/restore overhead.
+ * only use crc_pcl() (the carryless multiply version of crc32c) when the
+ * buffer size is >= 512 bytes, to account for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
- unsigned int crc_init);
+/*
+ * only pass 16-byte aligned buffers to crc_pcl() to avoid special handling
+ * in each pass
+ */
+#define ALIGN_CRCPCL 16U
+#define ALIGN_CRCPCL_MASK (ALIGN_CRCPCL - 1)
+
+asmlinkage u32 crc_pcl(const u8 *buffer, u64 len, u32 crc_init);
#endif /* CONFIG_X86_64 */
-static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+static u32 crc32c_intel_le_hw_byte(u32 crc, const u8 *data, unsigned int len)
{
- while (length--) {
+ while (len--) {
asm("crc32b %1, %0"
: "+r" (crc) : "rm" (*data));
data++;
@@ -56,7 +61,7 @@ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t le
return crc;
}
-static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+static u32 __pure crc32c_intel_le_hw(u32 crc, const u8 *p, unsigned int len)
{
unsigned int iquotient = len / SCALE_F;
unsigned int iremainder = len % SCALE_F;
@@ -69,8 +74,7 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len
}
if (iremainder)
- crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
- iremainder);
+ crc = crc32c_intel_le_hw_byte(crc, (u8 *)ptmp, iremainder);
return crc;
}
@@ -110,8 +114,8 @@ static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
return 0;
}
-static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __crc32c_intel_finup(const u32 *crcp, const u8 *data,
+ unsigned int len, u8 *out)
{
*(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
@@ -134,8 +138,7 @@ static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
- out);
+ return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, out);
}
static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
@@ -149,47 +152,96 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
#ifdef CONFIG_X86_64
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+ unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
+ u32 crc;
+
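+ /* chunks below are capped at 4096 bytes, so the breakeven size must not exceed that cap */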
+ BUILD_BUG_ON(CRC32C_PCL_BREAKEVEN > 4096U);
/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
- if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
- kernel_fpu_end();
- } else
+ if (len < CRC32C_PCL_BREAKEVEN + ALIGN_CRCPCL_MASK || !crypto_simd_usable()) {
*crcp = crc32c_intel_le_hw(*crcp, data, len);
+ return 0;
+ }
+
+ crc = *crcp;
+ /*
+ * Although crc_pcl() supports unaligned buffers, it is more efficient
+ * when handling a 16-byte aligned buffer.
+ */
+ if ((unsigned long)data & ALIGN_CRCPCL_MASK) {
+ unsigned int prealign = ALIGN_CRCPCL - ((unsigned long)data & ALIGN_CRCPCL_MASK);
+
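+ /* e.g. a buffer at an address ending in 0x5 gives prealign = 16 - 5 = 11 bytes */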
+ crc = crc32c_intel_le_hw(crc, data, prealign);
+ len -= prealign;
+ data += prealign;
+ }
+
+ kernel_fpu_begin();
+ for (;;) {
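+ /* process at most 4096 bytes per pass, yielding the FPU between passes */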
+ const unsigned int chunk = min(len, 4096U);
+
+ crc = crc_pcl(data, chunk, crc);
+ len -= chunk;
+
+ if (!len)
+ break;
+
+ data += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
+
+ *crcp = crc;
return 0;
}
-static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __crc32c_pcl_intel_finup(const u32 *crcp, const u8 *data,
+ unsigned int len, u8 *out)
{
- if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
- kernel_fpu_end();
- } else
- *(__le32 *)out =
- ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ u32 crc;
+
+ BUILD_BUG_ON(CRC32C_PCL_BREAKEVEN > 4096U);
+
+ if (len < CRC32C_PCL_BREAKEVEN || !crypto_simd_usable()) {
+ *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ return 0;
+ }
+
+ crc = *crcp;
+ kernel_fpu_begin();
+ for (;;) {
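+ /* unlike the pclmul path there is no minimum length here, so loop until len reaches zero */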
+ const unsigned int chunk = min(len, 4096U);
+
+ crc = crc_pcl(data, chunk, crc);
+ len -= chunk;
+
+ if (!len)
+ break;
+
+ data += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
+
+ *(__le32 *)out = ~cpu_to_le32(crc);
return 0;
}
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+ unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
}
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+ unsigned int len, u8 *out)
{
- return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
- out);
+ return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, out);
}
#endif /* CONFIG_X86_64 */
@@ -34,6 +34,8 @@
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
+#define PCLMUL_MIN_LEN 16U /* minimum size of buffer for crc_t10dif_pcl */
+
asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
struct chksum_desc_ctx {
@@ -49,17 +51,36 @@ static int chksum_init(struct shash_desc *desc)
return 0;
}
-static int chksum_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
+static int chksum_update(struct shash_desc *desc, const u8 *data, unsigned int len)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+ u16 crc;
+
+ if (len < PCLMUL_MIN_LEN || !crypto_simd_usable()) {
+ ctx->crc = crc_t10dif_generic(ctx->crc, data, len);
+ return 0;
+ }
+
+ crc = ctx->crc;
+ kernel_fpu_begin();
+ for (;;) {
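+ /* hash 4 KiB per pass; any tail below PCLMUL_MIN_LEN is finished by crc_t10dif_generic() below */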
+ const unsigned int chunk = min(len, 4096U);
+
+ crc = crc_t10dif_pcl(crc, data, chunk);
+ len -= chunk;
+ data += chunk;
+
+ if (len < PCLMUL_MIN_LEN)
+ break;
+
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
+
+ if (len)
+ crc = crc_t10dif_generic(crc, data, len);
- if (length >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
- kernel_fpu_end();
- } else
- ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
+ ctx->crc = crc;
return 0;
}
@@ -73,12 +94,30 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
- if (len >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
- kernel_fpu_end();
- } else
+ if (len < PCLMUL_MIN_LEN || !crypto_simd_usable()) {
*(__u16 *)out = crc_t10dif_generic(crc, data, len);
+ return 0;
+ }
+
+ kernel_fpu_begin();
+ for (;;) {
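+ /* same bounded 4 KiB passes as chksum_update() above */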
+ const unsigned int chunk = min(len, 4096U);
+
+ crc = crc_t10dif_pcl(crc, data, chunk);
+ len -= chunk;
+ data += chunk;
+
+ if (len < PCLMUL_MIN_LEN)
+ break;
+
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
+
+ if (len)
+ crc = crc_t10dif_generic(crc, data, len);
+
+ *(__u16 *)out = crc;
return 0;
}