mm/vmalloc: lock contention optimization under multi-threading

Message ID 20240207033059.1565623-1-rulin.huang@intel.com
State: New
Series: mm/vmalloc: lock contention optimization under multi-threading

Commit Message

rulinhuang Feb. 7, 2024, 3:30 a.m. UTC
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.
The first acquisition occurs in the alloc_vmap_area() function when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function when updating
properties of the vm, such as its flags and address.
Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended: the lock then only
needs to be acquired once per allocation.
With the above change, tested on an Intel Icelake platform (160 vCPUs,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot were measured with stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
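
For illustration only, below is a minimal userspace model of the locking
pattern described above. It is not the kernel code: a pthread mutex stands
in for vmap_area_lock, a linked list stands in for the busy red-black tree,
and the names (struct area, insert_then_setup(), setup_then_insert()) are
made up for the sketch.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* stand-in for vmap_area_lock */

struct area {
	void *addr;
	unsigned long flags;
	struct area *next;
};

static struct area *busy_list;	/* stand-in for the busy red-black tree */

/* Before the change: two separate critical sections per allocation. */
static void insert_then_setup(struct area *a, void *addr, unsigned long flags)
{
	pthread_mutex_lock(&lock);	/* first acquisition: insert the area */
	a->next = busy_list;
	busy_list = a;
	pthread_mutex_unlock(&lock);

	pthread_mutex_lock(&lock);	/* second acquisition: set its properties */
	a->addr = addr;
	a->flags = flags;
	pthread_mutex_unlock(&lock);
}

/* After the change: properties are filled in first, then one insertion under the lock. */
static void setup_then_insert(struct area *a, void *addr, unsigned long flags)
{
	a->addr = addr;			/* no lock needed: the area is not yet visible */
	a->flags = flags;

	pthread_mutex_lock(&lock);	/* single acquisition */
	a->next = busy_list;
	busy_list = a;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct area a = { 0 }, b = { 0 };

	insert_then_setup(&a, (void *)0x1000, 0x1);
	setup_then_insert(&b, (void *)0x2000, 0x2);
	printf("inserted %p and %p\n", a.addr, b.addr);
	return 0;
}

Under contention, each allocation now takes the lock once instead of twice,
which is where the reduction in the spinlock hotspot comes from.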

Reviewed-by: "Chen, Tim C" <tim.c.chen@intel.com>
Reviewed-by: "King, Colin" <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
 mm/vmalloc.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)


base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
  

Comments

Uladzislau Rezki Feb. 7, 2024, 9:24 a.m. UTC | #1
On Tue, Feb 06, 2024 at 10:30:59PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
> 
> Reviewed-by: "Chen, Tim C" <tim.c.chen@intel.com>
> Reviewed-by: "King, Colin" <colin.king@intel.com>
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
>  mm/vmalloc.c | 31 +++++++++++++++----------------
>  1 file changed, 15 insertions(+), 16 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c171..3b1f616e8ecf0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
>  
>  /*
>   * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
>   */
>  static struct vmap_area *alloc_vmap_area(unsigned long size,
>  				unsigned long align,
>  				unsigned long vstart, unsigned long vend,
>  				int node, gfp_t gfp_mask,
> -				unsigned long va_flags)
> +				unsigned long va_flags, struct vm_struct *vm)
>  {
>  	struct vmap_area *va;
>  	unsigned long freed;
> @@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  
>  	va->va_start = addr;
>  	va->va_end = addr + size;
> -	va->vm = NULL;
> +	va->vm = vm;
>  	va->flags = va_flags;
>  
> +	if (vm != NULL)
> +		vm->addr = (void *)addr;
> +
>  	spin_lock(&vmap_area_lock);
>  	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>  	spin_unlock(&vmap_area_lock);
> @@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
>  	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
>  					VMALLOC_START, VMALLOC_END,
>  					node, gfp_mask,
> -					VMAP_RAM|VMAP_BLOCK);
> +					VMAP_RAM|VMAP_BLOCK,
> +					NULL);
>  	if (IS_ERR(va)) {
>  		kfree(vb);
>  		return ERR_CAST(va);
> @@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
>  		struct vmap_area *va;
>  		va = alloc_vmap_area(size, PAGE_SIZE,
>  				VMALLOC_START, VMALLOC_END,
> -				node, GFP_KERNEL, VMAP_RAM);
> +				node, GFP_KERNEL, VMAP_RAM,
> +				NULL);
>  		if (IS_ERR(va))
>  			return NULL;
>  
> @@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
>  	va->vm = vm;
>  }
>  
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> -			      unsigned long flags, const void *caller)
> -{
> -	spin_lock(&vmap_area_lock);
> -	setup_vmalloc_vm_locked(vm, va, flags, caller);
> -	spin_unlock(&vmap_area_lock);
> -}
> -
>  static void clear_vm_uninitialized_flag(struct vm_struct *vm)
>  {
>  	/*
> @@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>  	if (!(flags & VM_NO_GUARD))
>  		size += PAGE_SIZE;
>  
> -	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> +	area->flags = flags;
> +	area->caller = caller;
> +	area->size = size;
> +
> +	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
>  	if (IS_ERR(va)) {
>  		kfree(area);
>  		return NULL;
>  	}
>  
> -	setup_vmalloc_vm(area, va, flags, caller);
> -
>  	/*
>
Thank you for improving this! That was on my radar quite a long time ago :)

Some comments, though. I think that such a "partial" way of initializing
the "vm_struct" (or "vm") is not optimal, because the initialization
becomes spread between two places.

Also, I think that we should not remove the setup_vmalloc_vm() function;
instead we can move it to a place where the VA has not yet been inserted
into the tree, i.e. initialize before the insert.

The "locked" variant can then be renamed to setup_vmalloc_vm().

Also, could you please post how you tested this patch and the stress-ng
figures? This is really good for other people to know.

Thanks!

--
Uladzislau Rezki
  
rulinhuang Feb. 9, 2024, 11:51 a.m. UTC | #2
Hi Rezki, thanks so much for your review. Exactly, your suggestions
would effectively enhance the maintainability and readability of this
code change as well as the whole vmalloc implementation. To avoid the
partial initialization issue, we are trying to refine this patch by
separating insert_vmap_area(), the insertion of the VA into the tree,
from alloc_vmap_area(), so that setup_vmalloc_vm() can be invoked
between them. However, our initial attempt ran into a boot-time error,
which we are still debugging, and it may take a little longer than
expected as the coming week is the Lunar New Year public holiday in
China. We will share the latest version of the patch with you once it
is ready for review.
For the performance test, we first build stress-ng following the
instructions at https://github.com/ColinIanKing/stress-ng, and then
launch the pthread stressor (--pthread) for 30 seconds (-t 30) via the
command below:
./stress-ng -t 30 --metrics-brief --pthread -1 --no-rand-seed
The aggregated count of spawned threads per second (Bogo ops/s) is
taken as the score of this workload. We evaluated the performance
impact of this patch on the Ice Lake server with 40, 80, 120 and 160
online cores respectively. As shown in the table below, as the number
of online cores grows, this patch relieves the increasingly severe
lock contention and achieves a significant performance improvement of
around 5.5% at 160 cores.

vcpu number           40        80        120       160
patched/original      100.5%    100.8%    105.2%    105.5%
 
Thanks again for your help and please let us know if more details 
are needed.
  
rulinhuang Feb. 20, 2024, 9:12 a.m. UTC | #3
Hi Rezki, we have submitted patch v2 to avoid the partial
initialization issue of the vm's members and separated
insert_vmap_area() from alloc_vmap_area(), so that setup_vmalloc_vm()
no longer needs lock protection. We have also verified an improvement
of 6.3% at 160 vCPUs on the Intel Icelake platform with this patch.
Thank you for your patience.
  
Uladzislau Rezki Feb. 21, 2024, 8:38 a.m. UTC | #4
Hello, Rulinhuang!

>
> Hi Rezki, we have submitted patch v2 to avoid the partial 
> initialization issue of vm's members and separated insert_vmap_area() 
> from alloc_vmap_area() so that setup_vmalloc_vm() has no need to 
> require lock protection. We have also verified the improvement of 
> 6.3% at 160 vcpu on intel icelake platform with this patch.
> Thank you for your patience.
>
Please work on mm-unstable and try to measure it on the latest tip.

--
Uladzislau Rezki
  

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..3b1f616e8ecf0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,13 +1577,13 @@  preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
  */
 static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
 				int node, gfp_t gfp_mask,
-				unsigned long va_flags)
+				unsigned long va_flags, struct vm_struct *vm)
 {
 	struct vmap_area *va;
 	unsigned long freed;
@@ -1627,9 +1627,12 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 
 	va->va_start = addr;
 	va->va_end = addr + size;
-	va->vm = NULL;
+	va->vm = vm;
 	va->flags = va_flags;
 
+	if (vm != NULL)
+		vm->addr = (void *)addr;
+
 	spin_lock(&vmap_area_lock);
 	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
 	spin_unlock(&vmap_area_lock);
@@ -2039,7 +2042,8 @@  static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
 					VMALLOC_START, VMALLOC_END,
 					node, gfp_mask,
-					VMAP_RAM|VMAP_BLOCK);
+					VMAP_RAM|VMAP_BLOCK,
+					NULL);
 	if (IS_ERR(va)) {
 		kfree(vb);
 		return ERR_CAST(va);
@@ -2394,7 +2398,8 @@  void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		struct vmap_area *va;
 		va = alloc_vmap_area(size, PAGE_SIZE,
 				VMALLOC_START, VMALLOC_END,
-				node, GFP_KERNEL, VMAP_RAM);
+				node, GFP_KERNEL, VMAP_RAM,
+				NULL);
 		if (IS_ERR(va))
 			return NULL;
 
@@ -2548,14 +2553,6 @@  static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 	va->vm = vm;
 }
 
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
-			      unsigned long flags, const void *caller)
-{
-	spin_lock(&vmap_area_lock);
-	setup_vmalloc_vm_locked(vm, va, flags, caller);
-	spin_unlock(&vmap_area_lock);
-}
-
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 {
 	/*
@@ -2592,14 +2589,16 @@  static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (!(flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
 
-	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+	area->flags = flags;
+	area->caller = caller;
+	area->size = size;
+
+	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
 	if (IS_ERR(va)) {
 		kfree(area);
 		return NULL;
 	}
 
-	setup_vmalloc_vm(area, va, flags, caller);
-
 	/*
 	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
 	 * best-effort approach, as they can be mapped outside of vmalloc code.