[1/2] maple_tree: optimize mas_wr_append(), also improve duplicating VMAs

Message ID 20230602075353.5917-1-zhangpeng.00@bytedance.com
State New
Headers
Series [1/2] maple_tree: optimize mas_wr_append(), also improve duplicating VMAs |

Commit Message

Peng Zhang June 2, 2023, 7:53 a.m. UTC
  When the new range can be completely covered by the original last range
without touching the boundary on either side, two new entries can be
appended to the end as a fast path. The original last pivot is updated
last, and the two newly appended entries will not be accessed before
that update, so this is also safe in RCU mode.

This is useful for sequential insertion, which is what we do in
dup_mmap(). Enabling BENCH_FORK in test_maple_tree and running only
bench_forking() gives the following run times:

before:               after:
17,874.83 msec        15,738.38 msec

It shows about a 12% performance improvement for duplicating VMAs.

Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
---
 lib/maple_tree.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)
  

Comments

Liam R. Howlett June 2, 2023, 4:59 p.m. UTC | #1
* Peng Zhang <zhangpeng.00@bytedance.com> [230602 03:54]:
> When the new range can be completely covered by the original last range
> without touching the boundaries on both sides, two new entries can be
> appended to the end as a fast path. We update the original last pivot at
> the end, and the newly appended two entries will not be accessed before
> this, so it is also safe in RCU mode.
> 
> This is useful for sequential insertion, which is what we do in
> dup_mmap(). Enabling BENCH_FORK in test_maple_tree and just running
> bench_forking() gives the following time-consuming numbers:
> 
> before:               after:
> 17,874.83 msec        15,738.38 msec
> 
> It shows about a 12% performance improvement for duplicating VMAs.

byte-unixbench [1] has a 'spawn' benchmark for forking. It's not perfect, as
the number of VMAs is rather low (21 VMAs).  So we're talking about a
height 2 tree with 2 nodes - basically one of the worst scenarios and
highly unlikely.  We're going to over-allocate nodes and return most of
them, we're going to balance to the left and have to rebalance as well.
For comparison, even cat /proc/self/maps returns 24 VMAs.

Anyway, with your change I see an average improvement of 2.1% (over
4 runs) on this benchmark.  This will obviously scale up with the
VMA count, so it is promising.

Another user of many forks is kernel builds, so we should try an mmtests
run on the kernbuild benchmark.

[1] https://github.com/kdlucas/byte-unixbench

> 
> Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>

> ---
>  lib/maple_tree.c | 33 ++++++++++++++++++++++-----------
>  1 file changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/maple_tree.c b/lib/maple_tree.c
> index 7dd54afe66ed..cfd9fad308a2 100644
> --- a/lib/maple_tree.c
> +++ b/lib/maple_tree.c
> @@ -4199,10 +4199,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
>   *
>   * Return: True if appended, false otherwise
>   */
> -static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
> +static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
> +				 unsigned char new_end)
>  {
>  	unsigned char end = wr_mas->node_end;
> -	unsigned char new_end = end + 1;
>  	struct ma_state *mas = wr_mas->mas;
>  	unsigned char node_pivots = mt_pivots[wr_mas->type];
>  
> @@ -4214,16 +4214,27 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
>  		ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end);
>  	}
>  
> -	if (mas->last == wr_mas->r_max) {
> -		/* Append to end of range */
> -		rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry);
> -		wr_mas->pivots[end] = mas->index - 1;
> -		mas->offset = new_end;
> +	if (new_end == wr_mas->node_end + 1) {
> +		if (mas->last == wr_mas->r_max) {
> +			/* Append to end of range */
> +			rcu_assign_pointer(wr_mas->slots[new_end],
> +					   wr_mas->entry);
> +			wr_mas->pivots[end] = mas->index - 1;
> +			mas->offset = new_end;
> +		} else {
> +			/* Append to start of range */
> +			rcu_assign_pointer(wr_mas->slots[new_end],
> +					   wr_mas->content);
> +			wr_mas->pivots[end] = mas->last;
> +			rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry);
> +		}
>  	} else {
> -		/* Append to start of range */
> +		/* Append to the range without touching any boundaries. */
>  		rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content);
> -		wr_mas->pivots[end] = mas->last;
> -		rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry);
> +		wr_mas->pivots[end + 1] = mas->last;
> +		rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry);
> +		wr_mas->pivots[end] = mas->index - 1;
> +		mas->offset = end + 1;
>  	}
>  
>  	if (!wr_mas->content || !wr_mas->entry)
> @@ -4270,7 +4281,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
>  		goto slow_path;
>  
>  	/* Attempt to append */
> -	if (new_end == wr_mas->node_end + 1 && mas_wr_append(wr_mas))
> +	if (mas_wr_append(wr_mas, new_end))
>  		return;
>  
>  	if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas))
> -- 
> 2.20.1
>
  

Patch

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 7dd54afe66ed..cfd9fad308a2 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4199,10 +4199,10 @@  static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
  *
  * Return: True if appended, false otherwise
  */
-static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
+static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
+				 unsigned char new_end)
 {
 	unsigned char end = wr_mas->node_end;
-	unsigned char new_end = end + 1;
 	struct ma_state *mas = wr_mas->mas;
 	unsigned char node_pivots = mt_pivots[wr_mas->type];
 
@@ -4214,16 +4214,27 @@  static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
 		ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end);
 	}
 
-	if (mas->last == wr_mas->r_max) {
-		/* Append to end of range */
-		rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry);
-		wr_mas->pivots[end] = mas->index - 1;
-		mas->offset = new_end;
+	if (new_end == wr_mas->node_end + 1) {
+		if (mas->last == wr_mas->r_max) {
+			/* Append to end of range */
+			rcu_assign_pointer(wr_mas->slots[new_end],
+					   wr_mas->entry);
+			wr_mas->pivots[end] = mas->index - 1;
+			mas->offset = new_end;
+		} else {
+			/* Append to start of range */
+			rcu_assign_pointer(wr_mas->slots[new_end],
+					   wr_mas->content);
+			wr_mas->pivots[end] = mas->last;
+			rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry);
+		}
 	} else {
-		/* Append to start of range */
+		/* Append to the range without touching any boundaries. */
 		rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content);
-		wr_mas->pivots[end] = mas->last;
-		rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry);
+		wr_mas->pivots[end + 1] = mas->last;
+		rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry);
+		wr_mas->pivots[end] = mas->index - 1;
+		mas->offset = end + 1;
 	}
 
 	if (!wr_mas->content || !wr_mas->entry)
@@ -4270,7 +4281,7 @@  static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
 		goto slow_path;
 
 	/* Attempt to append */
-	if (new_end == wr_mas->node_end + 1 && mas_wr_append(wr_mas))
+	if (mas_wr_append(wr_mas, new_end))
 		return;
 
 	if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas))