[v2,2/5] crypto/pool: Add crypto_pool_reserve_scratch()

Message ID 20230103184257.118069-3-dima@arista.com
State New
Series net/crypto: Introduce crypto_pool

Commit Message

Dmitry Safonov Jan. 3, 2023, 6:42 p.m. UTC
  Instead of having a build-time hardcoded constant, reallocate the
scratch area if a user needs more. Different algorithms and different
users may need different sizes for the temporary per-CPU buffer. Only
up-sizing is supported, for simplicity.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 crypto/Kconfig        |  6 ++++
 crypto/crypto_pool.c  | 77 ++++++++++++++++++++++++++++++++++---------
 include/crypto/pool.h |  3 +-
 3 files changed, 69 insertions(+), 17 deletions(-)
  

Comments

Jakub Kicinski Jan. 7, 2023, 2:04 a.m. UTC | #1
On Tue,  3 Jan 2023 18:42:54 +0000 Dmitry Safonov wrote:
> Instead of having a build-time hardcoded constant, reallocate the
> scratch area if a user needs more. Different algorithms and different
> users may need different sizes for the temporary per-CPU buffer. Only
> up-sizing is supported, for simplicity.

> -static int crypto_pool_scratch_alloc(void)
> +/* Slow-path */
> +/**
> + * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
> + * @size: request size for the scratch/temp buffer
> + */
> +int crypto_pool_reserve_scratch(unsigned long size)

Does this have to be a separate call? Can't we make it part of
the pool allocation? AFAICT the scratch gets freed when the last
pool is freed, so the user needs to know to allocate the pool
_first_, otherwise there's a potential race:

 CPU 1               CPU 2

 alloc pool
                    set scratch
 free pool
 [frees scratch]
                    alloc pool

>  {
> -	int cpu;
> -
> -	lockdep_assert_held(&cpool_mutex);
> +#define FREE_BATCH_SIZE		64
> +	void *free_batch[FREE_BATCH_SIZE];
> +	int cpu, err = 0;
> +	unsigned int i = 0;
>  
> +	mutex_lock(&cpool_mutex);
> +	if (size == scratch_size) {
> +		for_each_possible_cpu(cpu) {
> +			if (per_cpu(crypto_pool_scratch, cpu))
> +				continue;
> +			goto allocate_scratch;
> +		}
> +		mutex_unlock(&cpool_mutex);
> +		return 0;
> +	}
> +allocate_scratch:
> +	size = max(size, scratch_size);
> +	cpus_read_lock();
>  	for_each_possible_cpu(cpu) {
> -		void *scratch = per_cpu(crypto_pool_scratch, cpu);
> +		void *scratch, *old_scratch;
>  
> -		if (scratch)
> +		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
> +		if (!scratch) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +
> +		old_scratch = per_cpu(crypto_pool_scratch, cpu);
> +		/* Pairs with crypto_pool_get() */
> +		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);

You're using RCU for protection here, please use rcu accessors.
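
(For illustration only, a minimal sketch of what the RCU-accessor variant
could look like, assuming the per-CPU pointer gains an __rcu annotation and
the reader runs inside an RCU read-side section; this is not from the patch,
rcu_replace_pointer()/rcu_dereference() are the stock kernel helpers:)

static DEFINE_PER_CPU(void __rcu *, crypto_pool_scratch);

	/* writer side, in crypto_pool_reserve_scratch(), under cpool_mutex */
	old_scratch = rcu_replace_pointer(*per_cpu_ptr(&crypto_pool_scratch, cpu),
					  scratch, lockdep_is_held(&cpool_mutex));

	/* reader side, in crypto_pool_get(); rcu_dereference_bh() if the
	 * read side only disables BHs
	 */
	ret->base.scratch = rcu_dereference(*this_cpu_ptr(&crypto_pool_scratch));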

> +		if (!cpu_online(cpu)) {
> +			kfree(old_scratch);
>  			continue;
> +		}
> +		free_batch[i++] = old_scratch;
> +		if (i == FREE_BATCH_SIZE) {
> +			cpus_read_unlock();
> +			synchronize_rcu();
> +			while (i > 0)
> +				kfree(free_batch[--i]);
> +			cpus_read_lock();
> +		}

This is a memory allocation routine, can we simplify this by
dynamically sizing "free_batch" and using call_rcu()?

struct humf_blah {
	struct rcu_head rcu;
	unsigned int cnt;
	void *data[];
};

cheezit = kmalloc(struct_size(blah, data, num_possible_cpus()));

for_each ..
	cheezit->data[cheezit->cnt++] = old_scratch;

call_rcu(&cheezit->rcu, my_free_them_scratches)

etc.

Feels like that'd be much less locking, unlocking and general
carefully'ing.
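
(A fleshed-out version of that idea, purely as a sketch; the struct and
callback names below are made up, and the allocation/publish of the new
scratch stays exactly as in the patch:)

struct scratch_batch {
	struct rcu_head	rcu;
	unsigned int	cnt;
	void		*data[];
};

static void crypto_pool_scratch_free_batch(struct rcu_head *head)
{
	struct scratch_batch *b = container_of(head, struct scratch_batch, rcu);

	while (b->cnt)
		kfree(b->data[--b->cnt]);
	kfree(b);
}

	/* in crypto_pool_reserve_scratch(): one batch sized for all CPUs */
	batch = kmalloc(struct_size(batch, data, num_possible_cpus()),
			GFP_KERNEL);
	if (!batch)
		return -ENOMEM;
	batch->cnt = 0;

	for_each_possible_cpu(cpu) {
		/* allocate and publish the new scratch as in the patch,
		 * then stash the old pointer instead of batching on-stack
		 */
		batch->data[batch->cnt++] = old_scratch;
	}

	/* old buffers are freed after a grace period, no synchronize_rcu() */
	call_rcu(&batch->rcu, crypto_pool_scratch_free_batch);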

> +	}
> +	cpus_read_unlock();
> +	if (!err)
> +		scratch_size = size;
  
Dmitry Safonov Jan. 9, 2023, 9:08 p.m. UTC | #2
On 1/7/23 02:04, Jakub Kicinski wrote:
> On Tue,  3 Jan 2023 18:42:54 +0000 Dmitry Safonov wrote:
>> Instead of having a build-time hardcoded constant, reallocate the
>> scratch area if a user needs more. Different algorithms and different
>> users may need different sizes for the temporary per-CPU buffer. Only
>> up-sizing is supported, for simplicity.
> 
>> -static int crypto_pool_scratch_alloc(void)
>> +/* Slow-path */
>> +/**
>> + * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
>> + * @size: request size for the scratch/temp buffer
>> + */
>> +int crypto_pool_reserve_scratch(unsigned long size)
> 
> Does this have to be a separate call? Can't we make it part of
> the pool allocation? AFAICT the scratch gets freed when the last
> pool is freed, so the user needs to know to allocate the pool
> _first_, otherwise there's a potential race:
> 
>  CPU 1               CPU 2
> 
>  alloc pool
>                     set scratch
>  free pool
>  [frees scratch]
>                     alloc pool

Yeah, I think it will be cleaner as an argument to
crypto_pool_alloc_*(), and it would prevent the potential misuse you
describe. That also means I don't have to declare
crypto_pool_scratch_alloc() in patch 1; I'll just add a new parameter
to the alloc function in this patch.
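
(Hypothetically, for v3 that could end up looking something like the below;
the signature is not final and MY_SCRATCH_SIZE is made up, this is only to
illustrate the direction:)

/* scratch size becomes a property of pool allocation */
int crypto_pool_alloc_ahash(const char *alg, unsigned long scratch_size);

	/* a user then makes a single call, e.g.: */
	id = crypto_pool_alloc_ahash("hmac(sha1)", MY_SCRATCH_SIZE);
	if (id < 0)
		return id;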

> 
>>  {
>> -	int cpu;
>> -
>> -	lockdep_assert_held(&cpool_mutex);
>> +#define FREE_BATCH_SIZE		64
>> +	void *free_batch[FREE_BATCH_SIZE];
>> +	int cpu, err = 0;
>> +	unsigned int i = 0;
>>  
>> +	mutex_lock(&cpool_mutex);
>> +	if (size == scratch_size) {
>> +		for_each_possible_cpu(cpu) {
>> +			if (per_cpu(crypto_pool_scratch, cpu))
>> +				continue;
>> +			goto allocate_scratch;
>> +		}
>> +		mutex_unlock(&cpool_mutex);
>> +		return 0;
>> +	}
>> +allocate_scratch:
>> +	size = max(size, scratch_size);
>> +	cpus_read_lock();
>>  	for_each_possible_cpu(cpu) {
>> -		void *scratch = per_cpu(crypto_pool_scratch, cpu);
>> +		void *scratch, *old_scratch;
>>  
>> -		if (scratch)
>> +		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
>> +		if (!scratch) {
>> +			err = -ENOMEM;
>> +			break;
>> +		}
>> +
>> +		old_scratch = per_cpu(crypto_pool_scratch, cpu);
>> +		/* Pairs with crypto_pool_get() */
>> +		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);
> 
> You're using RCU for protection here, please use rcu accessors.

Will do.

> 
>> +		if (!cpu_online(cpu)) {
>> +			kfree(old_scratch);
>>  			continue;
>> +		}
>> +		free_batch[i++] = old_scratch;
>> +		if (i == FREE_BATCH_SIZE) {
>> +			cpus_read_unlock();
>> +			synchronize_rcu();
>> +			while (i > 0)
>> +				kfree(free_batch[--i]);
>> +			cpus_read_lock();
>> +		}
> 
> This is a memory allocation routine, can we simplify this by
> dynamically sizing "free_batch" and using call_rcu()?
> 
> struct humf_blah {
> 	struct rcu_head rcu;
> 	unsigned int cnt;
> 	void *data[];
> };
> 
> cheezit = kmalloc(struct_size(blah, data, num_possible_cpus()));
> 
> for_each ..
> 	cheezit->data[cheezit->cnt++] = old_scratch;
> 
> call_rcu(&cheezit->rcu, my_free_them_scratches)
> 
> etc.
> 
> Feels like that'd be much less locking, unlocking and general
> carefully'ing.

Will give it a try for v3, thanks for the idea and review,
          Dmitry
  

Patch

diff --git a/crypto/Kconfig b/crypto/Kconfig
index ba8d4a1f10f9..0614c2acfffa 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1394,6 +1394,12 @@  config CRYPTO_POOL
 	help
 	  Per-CPU pool of crypto requests ready for usage in atomic contexts.
 
+config CRYPTO_POOL_DEFAULT_SCRATCH_SIZE
+	hex "Per-CPU default scratch area size"
+	depends on CRYPTO_POOL
+	default 0x100
+	range 0x100 0x10000
+
 if !KMSAN # avoid false positives from assembly
 if ARM
 source "arch/arm/crypto/Kconfig"
diff --git a/crypto/crypto_pool.c b/crypto/crypto_pool.c
index 37131952c5a7..0cd9eade7b73 100644
--- a/crypto/crypto_pool.c
+++ b/crypto/crypto_pool.c
@@ -1,13 +1,14 @@ 
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <crypto/pool.h>
+#include <linux/cpu.h>
 #include <linux/kref.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/workqueue.h>
 
-static unsigned long scratch_size = DEFAULT_CRYPTO_POOL_SCRATCH_SZ;
+static unsigned long scratch_size = CONFIG_CRYPTO_POOL_DEFAULT_SCRATCH_SIZE;
 static DEFINE_PER_CPU(void *, crypto_pool_scratch);
 
 struct crypto_pool_entry {
@@ -22,26 +23,69 @@  static struct crypto_pool_entry cpool[CPOOL_SIZE];
 static unsigned int cpool_populated;
 static DEFINE_MUTEX(cpool_mutex);
 
-static int crypto_pool_scratch_alloc(void)
+/* Slow-path */
+/**
+ * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
+ * @size: request size for the scratch/temp buffer
+ */
+int crypto_pool_reserve_scratch(unsigned long size)
 {
-	int cpu;
-
-	lockdep_assert_held(&cpool_mutex);
+#define FREE_BATCH_SIZE		64
+	void *free_batch[FREE_BATCH_SIZE];
+	int cpu, err = 0;
+	unsigned int i = 0;
 
+	mutex_lock(&cpool_mutex);
+	if (size == scratch_size) {
+		for_each_possible_cpu(cpu) {
+			if (per_cpu(crypto_pool_scratch, cpu))
+				continue;
+			goto allocate_scratch;
+		}
+		mutex_unlock(&cpool_mutex);
+		return 0;
+	}
+allocate_scratch:
+	size = max(size, scratch_size);
+	cpus_read_lock();
 	for_each_possible_cpu(cpu) {
-		void *scratch = per_cpu(crypto_pool_scratch, cpu);
+		void *scratch, *old_scratch;
 
-		if (scratch)
+		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+		if (!scratch) {
+			err = -ENOMEM;
+			break;
+		}
+
+		old_scratch = per_cpu(crypto_pool_scratch, cpu);
+		/* Pairs with crypto_pool_get() */
+		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);
+		if (!cpu_online(cpu)) {
+			kfree(old_scratch);
 			continue;
+		}
+		free_batch[i++] = old_scratch;
+		if (i == FREE_BATCH_SIZE) {
+			cpus_read_unlock();
+			synchronize_rcu();
+			while (i > 0)
+				kfree(free_batch[--i]);
+			cpus_read_lock();
+		}
+	}
+	cpus_read_unlock();
+	if (!err)
+		scratch_size = size;
+	mutex_unlock(&cpool_mutex);
 
-		scratch = kmalloc_node(scratch_size, GFP_KERNEL,
-				       cpu_to_node(cpu));
-		if (!scratch)
-			return -ENOMEM;
-		per_cpu(crypto_pool_scratch, cpu) = scratch;
+	if (i > 0) {
+		synchronize_rcu();
+		while (i > 0)
+			kfree(free_batch[--i]);
 	}
-	return 0;
+	return err;
 }
+EXPORT_SYMBOL_GPL(crypto_pool_reserve_scratch);
 
 static void crypto_pool_scratch_free(void)
 {
@@ -138,7 +182,6 @@  int crypto_pool_alloc_ahash(const char *alg)
 
 	/* slow-path */
 	mutex_lock(&cpool_mutex);
-
 	for (i = 0; i < cpool_populated; i++) {
 		if (cpool[i].alg && !strcmp(cpool[i].alg, alg)) {
 			if (kref_read(&cpool[i].kref) > 0) {
@@ -263,7 +306,11 @@  int crypto_pool_get(unsigned int id, struct crypto_pool *c)
 		return -EINVAL;
 	}
 	ret->req = *this_cpu_ptr(cpool[id].req);
-	ret->base.scratch = this_cpu_read(crypto_pool_scratch);
+	/*
+	 * Pairs with crypto_pool_reserve_scratch(), scratch area is
+	 * valid (allocated) until crypto_pool_put().
+	 */
+	ret->base.scratch = READ_ONCE(*this_cpu_ptr(&crypto_pool_scratch));
 	return 0;
 }
 EXPORT_SYMBOL_GPL(crypto_pool_get);
diff --git a/include/crypto/pool.h b/include/crypto/pool.h
index 2c61aa45faff..c7d817860cc3 100644
--- a/include/crypto/pool.h
+++ b/include/crypto/pool.h
@@ -4,8 +4,6 @@ 
 
 #include <crypto/hash.h>
 
-#define DEFAULT_CRYPTO_POOL_SCRATCH_SZ	128
-
 struct crypto_pool {
 	void *scratch;
 };
@@ -20,6 +18,7 @@  struct crypto_pool_ahash {
 	struct ahash_request *req;
 };
 
+int crypto_pool_reserve_scratch(unsigned long size);
 int crypto_pool_alloc_ahash(const char *alg);
 void crypto_pool_add(unsigned int id);
 void crypto_pool_release(unsigned int id);
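
(For context, a rough sketch of how a user of this API is expected to tie
the pieces together, based on the declarations above and the crypto_pool_get()
hunk; the algorithm name and MY_SCRATCH_SIZE are only examples, the "base"
member and the crypto_pool_put() signature are assumptions, and error paths
are trimmed:)

	struct crypto_pool_ahash hp;
	int id, err;

	/* setup, sleepable context */
	id = crypto_pool_alloc_ahash("hmac(sha256)");
	if (id < 0)
		return id;
	err = crypto_pool_reserve_scratch(MY_SCRATCH_SIZE);	/* hypothetical size */
	if (err)
		return err;

	/* fast path, may run in atomic context */
	err = crypto_pool_get(id, &hp.base);
	if (err)
		return err;
	/* ... hash via hp.req, using hp.base.scratch as the temp buffer ... */
	crypto_pool_put(&hp.base);	/* put() signature assumed */

	/* teardown, when the pool is no longer needed */
	crypto_pool_release(id);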