diff mbox series

[v4,3/7] padata: dispatch works on different nodes

Message ID	20240118123911.88833-4-gang.li@linux.dev
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel+bounces-30106-ouuuleilei=gmail.com@vger.kernel.org designates 2604:1380:45e3:2400::1 as permitted sender) client-ip=2604:1380:45e3:2400::1; From: Gang Li <gang.li@linux.dev> To: David Hildenbrand <david@redhat.com>, David Rientjes <rientjes@google.com>, Mike Kravetz <mike.kravetz@oracle.com>, Muchun Song <muchun.song@linux.dev>, Andrew Morton <akpm@linux-foundation.org>, Tim Chen <tim.c.chen@linux.intel.com> Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, ligang.bdlg@bytedance.com, Gang Li <gang.li@linux.dev> Subject: [PATCH v4 3/7] padata: dispatch works on different nodes Date: Thu, 18 Jan 2024 20:39:07 +0800 Message-Id: <20240118123911.88833-4-gang.li@linux.dev> In-Reply-To: <20240118123911.88833-1-gang.li@linux.dev> References: <20240118123911.88833-1-gang.li@linux.dev> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	hugetlb: parallelize hugetlb page init on boot \| [RESEND,v4,0/7] hugetlb: parallelize hugetlb page init on boot [v4,1/7] hugetlb: code clean for hugetlb_hstate_alloc_pages [v4,2/7] hugetlb: split hugetlb_hstate_alloc_pages [v4,3/7] padata: dispatch works on different nodes [v4,4/7] hugetlb: pass *next_nid_to_alloc directly to for_each_node_mask_to_alloc [v4,5/7] hugetlb: have CONFIG_HUGETLBFS select CONFIG_PADATA [v4,6/7] hugetlb: parallelize 2M hugetlb allocation and initialization [v4,7/7] hugetlb: parallelize 1G hugetlb initialization

Commit Message

Gang Li Jan. 18, 2024, 12:39 p.m. UTC

  When a group of tasks that access different nodes are scheduled on the
same node, they may encounter bandwidth bottlenecks and increased access
latency.

Thus, a numa_aware flag is introduced here, allowing tasks to be
distributed across different nodes to fully utilize the advantages of
multi-node systems.

Signed-off-by: Gang Li <gang.li@linux.dev>
Tested-by: David Rientjes <rientjes@google.com>
---
 include/linux/padata.h |  3 +++
 kernel/padata.c        | 14 ++++++++++++--
 mm/mm_init.c           |  1 +
 3 files changed, 16 insertions(+), 2 deletions(-)

Comments

Tim Chen Jan. 18, 2024, 11:04 p.m. UTC | #1

On Thu, 2024-01-18 at 20:39 +0800, Gang Li wrote:
> When a group of tasks that access different nodes are scheduled on the
> same node, they may encounter bandwidth bottlenecks and access latency.
> 
> Thus, numa_aware flag is introduced here, allowing tasks to be
> distributed across different nodes to fully utilize the advantage of
> multi-node systems.
> 
> Signed-off-by: Gang Li <gang.li@linux.dev>
> Tested-by: David Rientjes <rientjes@google.com>
> ---
>  include/linux/padata.h |  3 +++
>  kernel/padata.c        | 14 ++++++++++++--
>  mm/mm_init.c           |  1 +
>  3 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/padata.h b/include/linux/padata.h
> index 495b16b6b4d7..f79ccd50e7f4 100644
> --- a/include/linux/padata.h
> +++ b/include/linux/padata.h
> @@ -137,6 +137,8 @@ struct padata_shell {
>   *             appropriate for one worker thread to do at once.
>   * @max_threads: Max threads to use for the job, actual number may be less
>   *               depending on task size and minimum chunk size.
> + * @numa_aware: Dispatch jobs to different nodes. If a node only has memory but
> + *              no CPU, dispatch its jobs to a random CPU.

Suggest:
Distribute jobs to different nodes with CPU in a round robin fashion.

>   */
>  struct padata_mt_job {

Muchun Song Jan. 19, 2024, 2:59 a.m. UTC | #2

On 2024/1/18 20:39, Gang Li wrote:
> When a group of tasks that access different nodes are scheduled on the
> same node, they may encounter bandwidth bottlenecks and access latency.
>
> Thus, numa_aware flag is introduced here, allowing tasks to be
> distributed across different nodes to fully utilize the advantage of
> multi-node systems.
>
> Signed-off-by: Gang Li <gang.li@linux.dev>
> Tested-by: David Rientjes <rientjes@google.com>
> ---
>   include/linux/padata.h |  3 +++
>   kernel/padata.c        | 14 ++++++++++++--
>   mm/mm_init.c           |  1 +
>   3 files changed, 16 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/padata.h b/include/linux/padata.h
> index 495b16b6b4d7..f79ccd50e7f4 100644
> --- a/include/linux/padata.h
> +++ b/include/linux/padata.h
> @@ -137,6 +137,8 @@ struct padata_shell {
>    *             appropriate for one worker thread to do at once.
>    * @max_threads: Max threads to use for the job, actual number may be less
>    *               depending on task size and minimum chunk size.
> + * @numa_aware: Dispatch jobs to different nodes. If a node only has memory but
> + *              no CPU, dispatch its jobs to a random CPU.
>    */
>   struct padata_mt_job {
>   	void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
> @@ -146,6 +148,7 @@ struct padata_mt_job {
>   	unsigned long		align;
>   	unsigned long		min_chunk;
>   	int			max_threads;
> +	bool			numa_aware;
>   };
>   
>   /**
> diff --git a/kernel/padata.c b/kernel/padata.c
> index 179fb1518070..10eae3f59203 100644
> --- a/kernel/padata.c
> +++ b/kernel/padata.c
> @@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
>   	struct padata_work my_work, *pw;
>   	struct padata_mt_job_state ps;
>   	LIST_HEAD(works);
> -	int nworks;
> +	int nworks, nid;
> +	static atomic_t last_used_nid = ATOMIC_INIT(0);
last_used_nid is only used during boot time, so it could be marked
__initdata. Otherwise, LGTM.

Reviewed-by: Muchun Song <muchun.song@linux.dev>

>   
>   	if (job->size == 0)
>   		return;
> @@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
>   	ps.chunk_size = roundup(ps.chunk_size, job->align);
>   
>   	list_for_each_entry(pw, &works, pw_list)
> -		queue_work(system_unbound_wq, &pw->pw_work);
> +		if (job->numa_aware) {
> +			int old_node = atomic_read(&last_used_nid);
> +
> +			do {
> +				nid = next_node_in(old_node, node_states[N_CPU]);
> +			} while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
> +			queue_work_node(nid, system_unbound_wq, &pw->pw_work);
> +		} else {
> +			queue_work(system_unbound_wq, &pw->pw_work);
> +		}
>   
>   	/* Use the current thread, which saves starting a workqueue worker. */
>   	padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 2c19f5515e36..549e76af8f82 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -2231,6 +2231,7 @@ static int __init deferred_init_memmap(void *data)
>   			.align       = PAGES_PER_SECTION,
>   			.min_chunk   = PAGES_PER_SECTION,
>   			.max_threads = max_threads,
> +			.numa_aware  = false,
>   		};
>   
>   		padata_do_multithreaded(&job);

Gang Li Jan. 19, 2024, 3:04 p.m. UTC | #3

On 2024/1/19 10:59, Muchun Song wrote:
> On 2024/1/18 20:39, Gang Li wrote:
>> --- a/kernel/padata.c
>> +++ b/kernel/padata.c
>> @@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct 
>> padata_mt_job *job)
>>       struct padata_work my_work, *pw;
>>       struct padata_mt_job_state ps;
>>       LIST_HEAD(works);
>> -    int nworks;
>> +    int nworks, nid;
>> +    static atomic_t last_used_nid = ATOMIC_INIT(0);
> last_used_nid is only used during boot time so it could be
> __init_data. Otherwise, LGTM.
> 
> Reviewed-by: Muchun Song <muchun.song@linux.dev>
> 

OK, thanks.

Gang Li Jan. 19, 2024, 3:05 p.m. UTC | #4

On 2024/1/19 07:04, Tim Chen wrote:
> On Thu, 2024-01-18 at 20:39 +0800, Gang Li wrote:
>> When a group of tasks that access different nodes are scheduled on the
>> same node, they may encounter bandwidth bottlenecks and access latency.
>>
>> Thus, numa_aware flag is introduced here, allowing tasks to be
>> distributed across different nodes to fully utilize the advantage of
>> multi-node systems.
>>
>> Signed-off-by: Gang Li <gang.li@linux.dev>
>> Tested-by: David Rientjes <rientjes@google.com>
>> ---
>>   include/linux/padata.h |  3 +++
>>   kernel/padata.c        | 14 ++++++++++++--
>>   mm/mm_init.c           |  1 +
>>   3 files changed, 16 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/padata.h b/include/linux/padata.h
>> index 495b16b6b4d7..f79ccd50e7f4 100644
>> --- a/include/linux/padata.h
>> +++ b/include/linux/padata.h
>> @@ -137,6 +137,8 @@ struct padata_shell {
>>    *             appropriate for one worker thread to do at once.
>>    * @max_threads: Max threads to use for the job, actual number may be less
>>    *               depending on task size and minimum chunk size.
>> + * @numa_aware: Dispatch jobs to different nodes. If a node only has memory but
>> + *              no CPU, dispatch its jobs to a random CPU.
> 
> Suggest:
> Distribute jobs to different nodes with CPU in a round robin fashion.
> 

Good idea.
Thanks.

>>    */
>>   struct padata_mt_job {
>

diff mbox series

Patch

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 495b16b6b4d7..f79ccd50e7f4 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -137,6 +137,8 @@  struct padata_shell {
  *             appropriate for one worker thread to do at once.
  * @max_threads: Max threads to use for the job, actual number may be less
  *               depending on task size and minimum chunk size.
+ * @numa_aware: Distribute jobs to different nodes with CPU in a round robin
+ *              fashion; nodes that have memory but no CPU are skipped.
  */
 struct padata_mt_job {
 	void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
@@ -146,6 +148,7 @@  struct padata_mt_job {
 	unsigned long		align;
 	unsigned long		min_chunk;
 	int			max_threads;
+	bool			numa_aware;
 };
 
 /**
diff --git a/kernel/padata.c b/kernel/padata.c
index 179fb1518070..10eae3f59203 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -485,7 +485,8 @@  void __init padata_do_multithreaded(struct padata_mt_job *job)
 	struct padata_work my_work, *pw;
 	struct padata_mt_job_state ps;
 	LIST_HEAD(works);
-	int nworks;
+	int nworks, nid;
+	static atomic_t last_used_nid = ATOMIC_INIT(0);
 
 	if (job->size == 0)
 		return;
@@ -517,7 +518,16 @@  void __init padata_do_multithreaded(struct padata_mt_job *job)
 	ps.chunk_size = roundup(ps.chunk_size, job->align);
 
 	list_for_each_entry(pw, &works, pw_list)
-		queue_work(system_unbound_wq, &pw->pw_work);
+		if (job->numa_aware) {
+			int old_node = atomic_read(&last_used_nid);
+
+			do {
+				nid = next_node_in(old_node, node_states[N_CPU]);
+			} while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
+			queue_work_node(nid, system_unbound_wq, &pw->pw_work);
+		} else {
+			queue_work(system_unbound_wq, &pw->pw_work);
+		}
 
 	/* Use the current thread, which saves starting a workqueue worker. */
 	padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2c19f5515e36..549e76af8f82 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2231,6 +2231,7 @@  static int __init deferred_init_memmap(void *data)
 			.align       = PAGES_PER_SECTION,
 			.min_chunk   = PAGES_PER_SECTION,
 			.max_threads = max_threads,
+			.numa_aware  = false,
 		};
 
 		padata_do_multithreaded(&job);