[V2,2/3] sched/numa: Enhance vma scanning logic

Message ID 5f0872657ddb164aa047a2231f8dc1086fe6adf6.1675159422.git.raghavendra.kt@amd.com
State New
Series: sched/numa: Enhance vma scanning

Commit Message

Raghavendra K T Feb. 1, 2023, 8:02 a.m. UTC
  During NUMA scanning, make sure tasks scan only the VMAs that are
relevant to them.

Before:
 All the tasks of a process participate in scanning every VMA, even if
they never access the VMA during their lifetime.

Now:
 Except for the first few unconditional scans, a task no longer scans
VMAs it has not touched (barring false positives from PID collisions).

Logic used:
1) The low 6 bits of the PID are used to set an active bit in the VMA's
 numab state at fault time, to remember which PIDs accessed the VMA.
 (Thanks Mel)

2) Subsequently, in the scan path, a VMA is skipped if the current PID
has not accessed it.

3) The first two scan passes remain unconditional, to preserve the
 earlier scanning behaviour.
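
As a concrete example: on a 64-bit kernel, a task with PID 1234 sets bit
1234 % 64 = 18 in the VMA's accessing_pids word at fault time. A later
scan by a task whose PID maps to a different bit skips that VMA; a PID
colliding on bit 18 only causes an extra (false positive) scan, never a
missed one.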

Acknowledgement to Bharata B Rao <bharata@amd.com> for initial patch
to store pid information.

Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
 include/linux/mm.h       | 14 ++++++++++++++
 include/linux/mm_types.h |  1 +
 kernel/sched/fair.c      | 15 +++++++++++++++
 mm/huge_memory.c         |  1 +
 mm/memory.c              |  1 +
 5 files changed, 32 insertions(+)
  

Comments

Peter Zijlstra Feb. 3, 2023, 11:15 a.m. UTC | #1
On Wed, Feb 01, 2023 at 01:32:21PM +0530, Raghavendra K T wrote:
>  During the Numa scanning make sure only relevant vmas of the
> tasks are scanned.
> 
> Before:
>  All the tasks of a process participate in scanning the vma
> even if they do not access vma in it's lifespan.
> 
> Now:
>  Except cases of first few unconditional scans, if a process do
> not touch vma (exluding false positive cases of PID collisions)
> tasks no longer scan all vma.
> 
> Logic used:
> 1) 6 bits of PID used to mark active bit in vma numab status during
>  fault to remember PIDs accessing vma. (Thanks Mel)
> 
> 2) Subsequently in scan path, vma scanning is skipped if current PID
> had not accessed vma.
> 
> 3) First two times we do allow unconditional scan to preserve earlier
>  behaviour of scanning.
> 
> Acknowledgement to Bharata B Rao <bharata@amd.com> for initial patch
> to store pid information.
> 
> Suggested-by: Mel Gorman <mgorman@techsingularity.net>
> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
> ---
>  include/linux/mm.h       | 14 ++++++++++++++
>  include/linux/mm_types.h |  1 +
>  kernel/sched/fair.c      | 15 +++++++++++++++
>  mm/huge_memory.c         |  1 +
>  mm/memory.c              |  1 +
>  5 files changed, 32 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 74d9df1d8982..489422942482 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1381,6 +1381,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
>  	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
>  	return last_time << PAGE_ACCESS_TIME_BUCKETS;
>  }
> +
> +static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
> +{
> +	unsigned int active_pid_bit;
> +
> +	if (vma->numab) {
> +		active_pid_bit = current->pid % BITS_PER_LONG;
> +		vma->numab->accessing_pids |= 1UL << active_pid_bit;
> +	}
> +}

Perhaps:

	if (vma->numab)
		__set_bit(current->pid % BITS_PER_LONG, &vma->numab->pids);

?

Or maybe even:

	bit = current->pid % BITS_PER_LONG;
	if (vma->numab && !__test_bit(bit, &vma->numab->pids))
		__set_bit(bit, &vma->numab->pids);


> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 060b241ce3c5..3505ae57c07c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2916,6 +2916,18 @@ static void reset_ptenuma_scan(struct task_struct *p)
>  	p->mm->numa_scan_offset = 0;
>  }
>  
> +static bool vma_is_accessed(struct vm_area_struct *vma)
> +{
> +	unsigned int active_pid_bit;
> +
	/*
	 * Tell us why 2....
	 */
> +	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
> +		return true;
> +
> +	active_pid_bit = current->pid % BITS_PER_LONG;
> +
> +	return vma->numab->accessing_pids & (1UL << active_pid_bit);
	return __test_bit(current->pid % BITS_PER_LONG, &vma->numab->pids)
> +}
> +
>  /*
>   * The expensive part of numa migration is done from task_work context.
>   * Triggered from task_tick_numa().
> @@ -3032,6 +3044,9 @@ static void task_numa_work(struct callback_head *work)
>  		if (mm->numa_scan_seq && time_before(jiffies, vma->numab->next_scan))
>  			continue;
>  
		/*
		 * tell us more...
		 */
> +		if (!vma_is_accessed(vma))
> +			continue;
> +
>  		do {
>  			start = max(start, vma->vm_start);
>  			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);


This feels wrong, specifically we track numa_scan_offset per mm, now, if
we divide the threads into two dis-joint groups each only using their
own set of vmas (in fact quite common for workloads with proper data
partitioning) it is possible to consistently sample one set of threads
and thus not scan the other set of vmas.

It seems somewhat unlikely, but not impossible to create significant
unfairness.

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 811d19b5c4f6..d908aa95f3c3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1485,6 +1485,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>  	bool was_writable = pmd_savedwrite(oldpmd);
>  	int flags = 0;
>  
> +	vma_set_active_pid_bit(vma);
>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>  	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>  		spin_unlock(vmf->ptl);
> diff --git a/mm/memory.c b/mm/memory.c
> index 8c8420934d60..2ec3045cb8b3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4718,6 +4718,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	bool was_writable = pte_savedwrite(vmf->orig_pte);
>  	int flags = 0;
>  
> +	vma_set_active_pid_bit(vma);
>  	/*
>  	 * The "pte" at this point cannot be used safely without
>  	 * validation through pte_unmap_same(). It's of NUMA type but

Urghh... do_*numa_page() is two near identical functions.. is there
really no sane way to de-duplicate at least some of that?

Also, is this placement right, you're marking the thread even before we
know there's even a page there. I would expect this somewhere around
where we track lastpid.

Maybe numa_migrate_prep() ?
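
For illustration, a minimal sketch of what hooking the marking into
numa_migrate_prep() might look like; the placement is only a suggestion,
and the unchanged lines are paraphrased from the existing helper, so
treat them as approximate:

	int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
			      unsigned long addr, int page_nid, int *flags)
	{
		/*
		 * Both do_numa_page() and do_huge_pmd_numa_page() funnel
		 * through here, so recording the accessing PID at this
		 * point marks the task only once a hinting fault is
		 * actually being processed, without duplicating the hook
		 * in both fault paths.
		 */
		vma_set_active_pid_bit(vma);

		get_page(page);

		count_vm_numa_event(NUMA_HINT_FAULTS);
		if (page_nid == numa_node_id()) {
			count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
			*flags |= TNF_FAULT_LOCAL;
		}

		return mpol_misplaced(page, vma, addr);
	}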
  
Peter Zijlstra Feb. 3, 2023, 11:27 a.m. UTC | #2
On Fri, Feb 03, 2023 at 12:15:48PM +0100, Peter Zijlstra wrote:

> > +static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
> > +{
> > +	unsigned int active_pid_bit;
> > +
> > +	if (vma->numab) {
> > +		active_pid_bit = current->pid % BITS_PER_LONG;
> > +		vma->numab->accessing_pids |= 1UL << active_pid_bit;
> > +	}
> > +}
> 
> Perhaps:
> 
> 	if (vma->numab)
> 		__set_bit(current->pid % BITS_PER_LONG, &vma->numab->pids);
> 
> ?
> 
> Or maybe even:
> 
> 	bit = current->pid % BITS_PER_LONG;
> 	if (vma->numab && !__test_bit(bit, &vma->numab->pids))
> 		__set_bit(bit, &vma->numab->pids);
> 

The alternative to just taking the low n bits is to use:

  hash_32(current->pid, BITS_PER_LONG)

That mixes things up a bit.
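
As a sketch of the hashed variant (assuming ilog2(BITS_PER_LONG) as the
bit-count argument, since hash_32()'s second parameter is the number of
bits in the result; uses hash_32() from <linux/hash.h> and ilog2() from
<linux/log2.h>):

	static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
	{
		unsigned int bit;

		if (!vma->numab)
			return;

		/* Hash the PID into a bit index instead of taking its low bits. */
		bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
		if (!test_bit(bit, &vma->numab->accessing_pids))
			__set_bit(bit, &vma->numab->accessing_pids);
	}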
  
Raghavendra K T Feb. 4, 2023, 6:14 p.m. UTC | #3
On 2/3/2023 4:45 PM, Peter Zijlstra wrote:
> On Wed, Feb 01, 2023 at 01:32:21PM +0530, Raghavendra K T wrote:
>>   During the Numa scanning make sure only relevant vmas of the
>> tasks are scanned.
>>
>> Before:
>>   All the tasks of a process participate in scanning the vma
>> even if they do not access vma in it's lifespan.
>>
>> Now:
>>   Except cases of first few unconditional scans, if a process do
>> not touch vma (exluding false positive cases of PID collisions)
>> tasks no longer scan all vma.
>>
>> Logic used:
>> 1) 6 bits of PID used to mark active bit in vma numab status during
>>   fault to remember PIDs accessing vma. (Thanks Mel)
>>
>> 2) Subsequently in scan path, vma scanning is skipped if current PID
>> had not accessed vma.
>>
>> 3) First two times we do allow unconditional scan to preserve earlier
>>   behaviour of scanning.
>>
>> Acknowledgement to Bharata B Rao <bharata@amd.com> for initial patch
>> to store pid information.
>>
>> Suggested-by: Mel Gorman <mgorman@techsingularity.net>
>> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
>> ---
>>   include/linux/mm.h       | 14 ++++++++++++++
>>   include/linux/mm_types.h |  1 +
>>   kernel/sched/fair.c      | 15 +++++++++++++++
>>   mm/huge_memory.c         |  1 +
>>   mm/memory.c              |  1 +
>>   5 files changed, 32 insertions(+)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 74d9df1d8982..489422942482 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1381,6 +1381,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
>>   	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
>>   	return last_time << PAGE_ACCESS_TIME_BUCKETS;
>>   }
>> +
>> +static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
>> +{
>> +	unsigned int active_pid_bit;
>> +
>> +	if (vma->numab) {
>> +		active_pid_bit = current->pid % BITS_PER_LONG;
>> +		vma->numab->accessing_pids |= 1UL << active_pid_bit;
>> +	}
>> +}
> 
> Perhaps:
> 
> 	if (vma->numab)
> 		__set_bit(current->pid % BITS_PER_LONG, &vma->numab->pids);
> 
> ?
> 
> Or maybe even:
> 
> 	bit = current->pid % BITS_PER_LONG;
> 	if (vma->numab && !__test_bit(bit, &vma->numab->pids))
> 		__set_bit(bit, &vma->numab->pids);
> 
> 

Sure, will use one of the above.

>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 060b241ce3c5..3505ae57c07c 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2916,6 +2916,18 @@ static void reset_ptenuma_scan(struct task_struct *p)
>>   	p->mm->numa_scan_offset = 0;
>>   }
>>   
>> +static bool vma_is_accessed(struct vm_area_struct *vma)
>> +{
>> +	unsigned int active_pid_bit;
>> +
> 	/*
> 	 * Tell us why 2....
> 	 */

Agree. The intent is to allow unconditional scans for the first two
passes so that the task/page relation gets built up. I will experiment
with whether we further need to allow two full passes (i.e. a threshold
of 4) to line up with "multi-stage node selection" and take care of
early migration.

The only doubt I have is that numa_scan_seq is per mm: will that create
corner cases, or do we need a separate per-VMA count for when a new VMA
is created?
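
Purely to illustrate that per-VMA alternative (the field name and
threshold below are hypothetical, not part of this patch), the
unconditional phase could be tracked in vma->numab itself, so a VMA
created late in a process' life still gets its initial scans:

	/* Hypothetical: per-VMA budget of unconditional scans. */
	#define VMA_UNCOND_SCANS	2

	struct vma_numab {
		unsigned long next_scan;
		unsigned long accessing_pids;
		unsigned int uncond_scans;	/* hypothetical field */
	};

	static bool vma_is_accessed(struct vm_area_struct *vma)
	{
		if (vma->numab->uncond_scans < VMA_UNCOND_SCANS) {
			vma->numab->uncond_scans++;
			return true;
		}

		return test_bit(current->pid % BITS_PER_LONG,
				&vma->numab->accessing_pids);
	}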

>> +	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
>> +		return true;
>> +
>> +	active_pid_bit = current->pid % BITS_PER_LONG;
>> +
>> +	return vma->numab->accessing_pids & (1UL << active_pid_bit);
> 	return __test_bit(current->pid % BITS_PER_LONG, &vma->numab->pids)
>> +}
>> +
>>   /*
>>    * The expensive part of numa migration is done from task_work context.
>>    * Triggered from task_tick_numa().
>> @@ -3032,6 +3044,9 @@ static void task_numa_work(struct callback_head *work)
>>   		if (mm->numa_scan_seq && time_before(jiffies, vma->numab->next_scan))
>>   			continue;
>>   
> 		/*
> 		 * tell us more...
> 		 */

Sure. This is the core of the whole logic, where we mostly want to
confine VMA scanning to the PIDs of interest.

>> +		if (!vma_is_accessed(vma))
>> +			continue;
>> +
>>   		do {
>>   			start = max(start, vma->vm_start);
>>   			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
> 
> 
> This feels wrong, specifically we track numa_scan_offset per mm, now, if
> we divide the threads into two dis-joint groups each only using their
> own set of vmas (in fact quite common for workloads with proper data
> partitioning) it is possible to consistently sample one set of threads
> and thus not scan the other set of vmas.
> 
> It seems somewhat unlikely, but not impossible to create significant
> unfairness.
> 

Agree, but that is the reason why we allow the first few unconditional
scans. Or am I missing something?

>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 811d19b5c4f6..d908aa95f3c3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1485,6 +1485,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>>   	bool was_writable = pmd_savedwrite(oldpmd);
>>   	int flags = 0;
>>   
>> +	vma_set_active_pid_bit(vma);
>>   	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>   	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
>>   		spin_unlock(vmf->ptl);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 8c8420934d60..2ec3045cb8b3 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4718,6 +4718,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>   	bool was_writable = pte_savedwrite(vmf->orig_pte);
>>   	int flags = 0;
>>   
>> +	vma_set_active_pid_bit(vma);
>>   	/*
>>   	 * The "pte" at this point cannot be used safely without
>>   	 * validation through pte_unmap_same(). It's of NUMA type but
> 
> Urghh... do_*numa_page() is two near identical functions.. is there
> really no sane way to de-duplicate at least some of that?
> 

Agree. I will explore and will take that as a separate TODO.

> Also, is this placement right, you're marking the thread even before we
> know there's even a page there. I would expect this somewhere around
> where we track lastpid.
> 

Good point. I will check this again.

> Maybe numa_migrate_prep() ?

Yes, there was no hurry to record the accessing PID that early above.
  
Raghavendra K T Feb. 4, 2023, 6:18 p.m. UTC | #4
On 2/3/2023 4:57 PM, Peter Zijlstra wrote:
> On Fri, Feb 03, 2023 at 12:15:48PM +0100, Peter Zijlstra wrote:
> 
>>> +static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
>>> +{
>>> +	unsigned int active_pid_bit;
>>> +
>>> +	if (vma->numab) {
>>> +		active_pid_bit = current->pid % BITS_PER_LONG;
>>> +		vma->numab->accessing_pids |= 1UL << active_pid_bit;
>>> +	}
>>> +}
>>
>> Perhaps:
>>
>> 	if (vma->numab)
>> 		__set_bit(current->pid % BITS_PER_LONG, &vma->numab->pids);
>>
>> ?
>>
>> Or maybe even:
>>
>> 	bit = current->pid % BITS_PER_LONG;
>> 	if (vma->numab && !__test_bit(bit, &vma->numab->pids))
>> 		__set_bit(bit, &vma->numab->pids);
>>
> 
> The alternative to just taking the low n bits is to use:
> 
>    hash_32(current->pid, BITS_PER_LONG)
> 
> That mixes things up a bit.

Good idea. For workloads that create a smaller number of threads in
quick succession, the current (low-bits) approach might have been
simpler, but when thread creation happens over a period of time the hash
function mixes things up and avoids collisions. Will experiment with
this option.
  
Raghavendra K T Feb. 7, 2023, 6:41 a.m. UTC | #5
On 2/4/2023 11:44 PM, Raghavendra K T wrote:
> On 2/3/2023 4:45 PM, Peter Zijlstra wrote:
>> On Wed, Feb 01, 2023 at 01:32:21PM +0530, Raghavendra K T wrote:
[...]
> 
>>> +        if (!vma_is_accessed(vma))
>>> +            continue;
>>> +
>>>           do {
>>>               start = max(start, vma->vm_start);
>>>               end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
>>
>>
>> This feels wrong, specifically we track numa_scan_offset per mm, now, if
>> we divide the threads into two dis-joint groups each only using their
>> own set of vmas (in fact quite common for workloads with proper data
>> partitioning) it is possible to consistently sample one set of threads
>> and thus not scan the other set of vmas.
>>
>> It seems somewhat unlikely, but not impossible to create significant
>> unfairness.
>>
> 
> Agree, But that is the reason why we want to allow first few
> unconditional scans Or am I missing something?
> 

Thinking further, maybe we can summarize the different aspects of the
two-disjoint-thread-set case as:

1) Unfairness because of the way in which threads get the opportunity
to scan.

2) The disjoint sets of VMAs in the partitions could be of different
sizes.

3) The disjoint sets of VMAs could be associated with different numbers
of threads.

Each of the above can potentially relieve some threads or make others do
the heavy lifting.

(2) and (3) are what I think we are trying to be okay with, by making
sure tasks mostly do not scan others' VMAs.

(1) could be a real issue (though I know there are many places where we
have mitigated this by introducing an offset in p->numa_next_scan), but
how the distribution looks in practice I will take as a TODO and post.
  
Raghavendra K T Feb. 27, 2023, 6:40 a.m. UTC | #6
On 2/7/2023 12:11 PM, Raghavendra K T wrote:
> On 2/4/2023 11:44 PM, Raghavendra K T wrote:
>> On 2/3/2023 4:45 PM, Peter Zijlstra wrote:
>>> On Wed, Feb 01, 2023 at 01:32:21PM +0530, Raghavendra K T wrote:
> [...]
>>
>>>> +        if (!vma_is_accessed(vma))
>>>> +            continue;
>>>> +
>>>>           do {
>>>>               start = max(start, vma->vm_start);
>>>>               end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
>>>
>>>
>>> This feels wrong, specifically we track numa_scan_offset per mm, now, if
>>> we divide the threads into two dis-joint groups each only using their
>>> own set of vmas (in fact quite common for workloads with proper data
>>> partitioning) it is possible to consistently sample one set of threads
>>> and thus not scan the other set of vmas.
>>>
>>> It seems somewhat unlikely, but not impossible to create significant
>>> unfairness.
>>>
>>
>> Agree, But that is the reason why we want to allow first few
>> unconditional scans Or am I missing something?
>>
> 
> Thinking further, may be we can summarize the different aspects of 
> thread/ two disjoint set case itself into:
> 
> 1) Unfairness because of way in which threads gets opportunity
> to scan.
> 
> 2) Disjoint set of vmas in the partition set could be of different sizes
> 
> 3) Disjoint set of vmas could be associated with different number of
> threads
> 
> Each of above can potentially help or make some thread do heavy lifting
> 
> but (2), and (3). is what I think we are trying to be Okay with by
> making sure tasks mostly do not scan others' vmas
> 
> (1) could be a real issue (though I know that there are many places we
>   have corrected the issue by introducing offset in p->numa_next_scan)
> but how the distribution looks now practically, I take it as a TODO and
> post.


Hello PeterZ,
Sorry to come back a little late with data on how unfair we are when we
have the two disjoint groups mentioned above.

Program I tested with:
1. A 128/256-thread program divided into two sets, each accessing a
different region of memory (set0, set1) from node1 and node2 to induce
migration (8GB, divided into 4GB for each set).
Each thread ran around 2048 iterations, so that the
128-thread case ran for around 8 min and the
256-thread case for around 17-18 min.

SUT: 128-core, 256-CPU Milan with 2 nodes.

Some of the observations:
1) The number of threads that got a chance to scan over the entire
program span was around 50-62% of the total:
e.g. in the 128-thread case, 74-84 tasks got to scan;
in the 256-thread case, the range was 123-146.

2) There was always a little bias towards set 1 (the threads where the
remote faults occurred).

In summary: I do see that access to VMAs from disjoint sets is not fully
fair, but on the other hand it is not very bad either. There is
definitely some scope to explore/improve fairness in this area further.

Posting the result for one of the 128-thread runs (base 6.1.0 vanilla):

column1: frequency
column2: PID

set 0 threads
       1 5906
       1 5908
       2 5912
       1 5914
       1 5916
       1 5918
       1 5926
       2 5928
       3 5932
       3 5938
       1 5940
       1 5944
       2 5948
       2 5950
       1 5956
       1 5962
       1 5974
       1 5978
       1 5992
       1 6004
       1 6006
       1 6008
       1 6012
       3 6014
       1 6016
       2 6026
       2 6028
       1 6030

set 1 threads
       3 5907
       5 5909
       2 5911
       4 5913
       7 5915
       2 5917
       3 5919
       5 5921
       3 5923
       2 5925
       4 5927
       4 5929
       3 5931
       3 5933
       2 5935
       7 5937
       4 5939
       3 5941
       4 5943
       1 5945
       2 5947
       1 5949
       1 5951
       2 5953
       1 5955
       1 5957
       1 5959
       1 5961
       1 5963
       1 5965
       1 5967
       1 5969
       1 5971
       2 5975
       2 5979
       2 5981
       2 5983
       5 5993
       5 5995
      11 5997
       1 5999
       6 6003
       4 6005
       3 6007
       1 6013
       1 6015
       3 6017
       1 6019
       1 6021
       1 6023
       6 6027
       4 6029
       4 6031
       1 6033

PS: I have also run the above test with the V3 patch applied (which
incorporates your suggestions) and have not seen much deviation in the
observations.

Thanks
- Raghu
  
Peter Zijlstra Feb. 27, 2023, 10:06 a.m. UTC | #7
On Mon, Feb 27, 2023 at 12:10:41PM +0530, Raghavendra K T wrote:
> In summary: I do see that access to VMAs from disjoint sets is not fully
>  fair, But on the other hand it is not very bad too. There is definitely
> some scope or possibility to explore/improve fairness in this area
> further.

Ok, might be good to summarize some of this in a comment near here, so
that readers are aware of the caveat of this code.

> PS: I have also tested above applying V3 patch (which incorporates your
> suggestions), have not seen much deviation in observation with patch.

I'll see if I can find it in this dumpster fire I call my inbox :-)
  
Raghavendra K T Feb. 27, 2023, 10:12 a.m. UTC | #8
On 2/27/2023 3:36 PM, Peter Zijlstra wrote:
> On Mon, Feb 27, 2023 at 12:10:41PM +0530, Raghavendra K T wrote:
>> In summary: I do see that access to VMAs from disjoint sets is not fully
>>   fair, But on the other hand it is not very bad too. There is definitely
>> some scope or possibility to explore/improve fairness in this area
>> further.
> 
> Ok, might be good to summarize some of this in a comment near here, so
> that readers are aware of the caveat of this code.
> 

Sure will do.
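
For instance, the caveat comment above the vma_is_accessed() check in
task_numa_work() might look something like this (wording is only a draft
based on the numbers above):

		/*
		 * Skip VMAs the current task has not faulted on, after the
		 * first couple of unconditional scan passes.  Caveat:
		 * numa_scan_offset is per mm, so when disjoint groups of
		 * threads work on disjoint sets of VMAs, one group can get
		 * more scan opportunities than the other.  Measurements on
		 * a two-group workload showed a modest bias rather than
		 * gross unfairness, but the limitation is real.
		 */
		if (!vma_is_accessed(vma))
			continue;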

>> PS: I have also tested above applying V3 patch (which incorporates your
>> suggestions), have not seen much deviation in observation with patch.
> 
> I'll see if I can find it in this dumpester fire I call inbox :-)

Sorry, I wasn't clear there. It is still in my inbox and I am about to
post it; that was just a heads-up.
  
Raghavendra K T Feb. 28, 2023, 4:59 a.m. UTC | #9
On 2/4/2023 11:44 PM, Raghavendra K T wrote:
> On 2/3/2023 4:45 PM, Peter Zijlstra wrote:
>> On Wed, Feb 01, 2023 at 01:32:21PM +0530, Raghavendra K T wrote:
[...]
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 8c8420934d60..2ec3045cb8b3 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4718,6 +4718,7 @@ static vm_fault_t do_numa_page(struct vm_fault 
>>> *vmf)
>>>       bool was_writable = pte_savedwrite(vmf->orig_pte);
>>>       int flags = 0;
>>> +    vma_set_active_pid_bit(vma);
>>>       /*
>>>        * The "pte" at this point cannot be used safely without
>>>        * validation through pte_unmap_same(). It's of NUMA type but
>>
>> Urghh... do_*numa_page() is two near identical functions.. is there
>> really no sane way to de-duplicate at least some of that?
>>
> 
> Agree. I will explore and will take that as a separate TODO.
> 

I did spend some time looking at whether there is a better way of
merging these two.
The code looks similar, as you noted, with very subtle differences (pte
vs. pmd and the different unlock paths).
Only some parts of the code can be changed easily; rewriting the whole
thing did not look worthwhile for now (unless we come up with a better
idea).

This is the only comment I feel has perhaps not been addressed properly :(

Thanks
  

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 74d9df1d8982..489422942482 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1381,6 +1381,16 @@  static inline int xchg_page_access_time(struct page *page, int time)
 	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
+
+static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
+{
+	unsigned int active_pid_bit;
+
+	if (vma->numab) {
+		active_pid_bit = current->pid % BITS_PER_LONG;
+		vma->numab->accessing_pids |= 1UL << active_pid_bit;
+	}
+}
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -1430,6 +1440,10 @@  static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 {
 	return false;
 }
+
+static inline void vma_set_active_pid_bit(struct vm_area_struct *vma)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e84f95a77321..980a6a4308b6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -437,6 +437,7 @@  struct anon_vma_name {
 
 struct vma_numab {
 	unsigned long next_scan;
+	unsigned long accessing_pids;
 };
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 060b241ce3c5..3505ae57c07c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2916,6 +2916,18 @@  static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+	unsigned int active_pid_bit;
+
+	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+		return true;
+
+	active_pid_bit = current->pid % BITS_PER_LONG;
+
+	return vma->numab->accessing_pids & (1UL << active_pid_bit);
+}
+
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
@@ -3032,6 +3044,9 @@  static void task_numa_work(struct callback_head *work)
 		if (mm->numa_scan_seq && time_before(jiffies, vma->numab->next_scan))
 			continue;
 
+		if (!vma_is_accessed(vma))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 811d19b5c4f6..d908aa95f3c3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1485,6 +1485,7 @@  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	bool was_writable = pmd_savedwrite(oldpmd);
 	int flags = 0;
 
+	vma_set_active_pid_bit(vma);
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
 		spin_unlock(vmf->ptl);
diff --git a/mm/memory.c b/mm/memory.c
index 8c8420934d60..2ec3045cb8b3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4718,6 +4718,7 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	bool was_writable = pte_savedwrite(vmf->orig_pte);
 	int flags = 0;
 
+	vma_set_active_pid_bit(vma);
 	/*
 	 * The "pte" at this point cannot be used safely without
 	 * validation through pte_unmap_same(). It's of NUMA type but