[2/2] mm/readahead: limit sync readahead while too many active refault

Message ID 20240201100835.1626685-3-liushixin2@huawei.com
State New
Series [1/2] mm/readahead: stop readahead loop if memcg charge fails

Commit Message

Liu Shixin Feb. 1, 2024, 10:08 a.m. UTC
When the page fault is not for write and the refault distance is small,
the page will be activated directly. If there are too many such pages in
a file, it means the pages may be reclaimed again immediately.
In that situation read-ahead has no benefit and only wastes IO. So count
such pages, and when the count becomes too large, stop bothering with
read-ahead for a while until the count decreases automatically.

Define 'too large' as 10000, chosen empirically; this solves the problem
and is not affected by the occasional active refault.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/fs.h      |  2 ++
 include/linux/pagemap.h |  1 +
 mm/filemap.c            | 16 ++++++++++++++++
 mm/readahead.c          |  4 ++++
 4 files changed, 23 insertions(+)
  

Comments

Jan Kara Feb. 1, 2024, 9:37 a.m. UTC | #1
On Thu 01-02-24 18:08:35, Liu Shixin wrote:
> When the pagefault is not for write and the refault distance is close,
> the page will be activated directly. If there are too many such pages in
> a file, that means the pages may be reclaimed immediately.
> In such situation, there is no positive effect to read-ahead since it will
> only waste IO. So collect the number of such pages and when the number is
> too large, stop bothering with read-ahead for a while until it decreased
> automatically.
> 
> Define 'too large' as 10000 experientially, which can solves the problem
> and does not affect by the occasional active refault.
> 
> Signed-off-by: Liu Shixin <liushixin2@huawei.com>

So I'm not convinced this new logic is needed. We already have
ra->mmap_miss, which gets incremented when a page fault has to read the page
(and decremented when a page fault finds the page already in cache). This
should already work to detect thrashing as well, shouldn't it? If it does
not, why?

								Honza

  
Liu Shixin Feb. 1, 2024, 10:41 a.m. UTC | #2
On 2024/2/1 17:37, Jan Kara wrote:
> On Thu 01-02-24 18:08:35, Liu Shixin wrote:
>> When the pagefault is not for write and the refault distance is close,
>> the page will be activated directly. If there are too many such pages in
>> a file, that means the pages may be reclaimed immediately.
>> In such situation, there is no positive effect to read-ahead since it will
>> only waste IO. So collect the number of such pages and when the number is
>> too large, stop bothering with read-ahead for a while until it decreased
>> automatically.
>>
>> Define 'too large' as 10000 experientially, which can solves the problem
>> and does not affect by the occasional active refault.
>>
>> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> So I'm not convinced this new logic is needed. We already have
> ra->mmap_miss which gets incremented when a page fault has to read the page
> (and decremented when a page fault found the page already in cache). This
> should already work to detect trashing as well, shouldn't it? If it does
> not, why?
>
> 								Honza
ra->mmap_miss doesn't help: it is incremented only once in do_sync_mmap_readahead()
and then decremented once for every page in filemap_map_pages(). So in this
scenario it can't exceed MMAP_LOTSAMISS.
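
A minimal userspace model of that accounting (an illustration only, not
kernel code; the 32-page read-around window is an arbitrary example):

#include <stdio.h>

#define MMAP_LOTSAMISS 100

int main(void)
{
    unsigned int mmap_miss = 0;

    for (int fault = 0; fault < 1000000; fault++) {
        mmap_miss++;                          /* do_sync_mmap_readahead() */
        for (int page = 0; page < 32; page++) /* pages later found in cache */
            if (mmap_miss)
                mmap_miss--;                  /* filemap_map_pages() */
    }
    printf("mmap_miss after 1000000 faults: %u (limit %d)\n",
           mmap_miss, MMAP_LOTSAMISS);
    return 0;
}

It always prints 0, no matter how many faults happen.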

Thanks,
  
Jan Kara Feb. 1, 2024, 5:31 p.m. UTC | #3
On Thu 01-02-24 18:41:30, Liu Shixin wrote:
> On 2024/2/1 17:37, Jan Kara wrote:
> > On Thu 01-02-24 18:08:35, Liu Shixin wrote:
> >> When the pagefault is not for write and the refault distance is close,
> >> the page will be activated directly. If there are too many such pages in
> >> a file, that means the pages may be reclaimed immediately.
> >> In such situation, there is no positive effect to read-ahead since it will
> >> only waste IO. So collect the number of such pages and when the number is
> >> too large, stop bothering with read-ahead for a while until it decreased
> >> automatically.
> >>
> >> Define 'too large' as 10000 experientially, which can solves the problem
> >> and does not affect by the occasional active refault.
> >>
> >> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> > So I'm not convinced this new logic is needed. We already have
> > ra->mmap_miss which gets incremented when a page fault has to read the page
> > (and decremented when a page fault found the page already in cache). This
> > should already work to detect trashing as well, shouldn't it? If it does
> > not, why?
> >
> > 								Honza
> ra->mmap_miss doesn't help, it increased only one in do_sync_mmap_readahead()
> and then decreased one for every page in filemap_map_pages(). So in this scenario,
> it can't exceed MMAP_LOTSAMISS.

I see, OK. But that's a (longstanding) bug in how mmap_miss is handled. Can
you please test whether the attached patches fix the thrashing for you? At
least now I can see mmap_miss properly incrementing when we are hitting
uncached pages...  Thanks!

								Honza
  
Liu Shixin Feb. 2, 2024, 1:25 a.m. UTC | #4
On 2024/2/2 1:31, Jan Kara wrote:
> On Thu 01-02-24 18:41:30, Liu Shixin wrote:
>> On 2024/2/1 17:37, Jan Kara wrote:
>>> On Thu 01-02-24 18:08:35, Liu Shixin wrote:
>>>> When the pagefault is not for write and the refault distance is close,
>>>> the page will be activated directly. If there are too many such pages in
>>>> a file, that means the pages may be reclaimed immediately.
>>>> In such situation, there is no positive effect to read-ahead since it will
>>>> only waste IO. So collect the number of such pages and when the number is
>>>> too large, stop bothering with read-ahead for a while until it decreased
>>>> automatically.
>>>>
>>>> Define 'too large' as 10000 experientially, which can solves the problem
>>>> and does not affect by the occasional active refault.
>>>>
>>>> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
>>> So I'm not convinced this new logic is needed. We already have
>>> ra->mmap_miss which gets incremented when a page fault has to read the page
>>> (and decremented when a page fault found the page already in cache). This
>>> should already work to detect trashing as well, shouldn't it? If it does
>>> not, why?
>>>
>>> 								Honza
>> ra->mmap_miss doesn't help, it increased only one in do_sync_mmap_readahead()
>> and then decreased one for every page in filemap_map_pages(). So in this scenario,
>> it can't exceed MMAP_LOTSAMISS.
> I see, OK. But that's a (longstanding) bug in how mmap_miss is handled. Can
> you please test whether attached patches fix the trashing for you? At least
> now I can see mmap_miss properly increments when we are hitting uncached
> pages...  Thanks!
>
> 								Honza
Thanks for the patch, I will test it.
>
  
Liu Shixin Feb. 2, 2024, 9:02 a.m. UTC | #5
On 2024/2/2 1:31, Jan Kara wrote:
> On Thu 01-02-24 18:41:30, Liu Shixin wrote:
>> On 2024/2/1 17:37, Jan Kara wrote:
>>> On Thu 01-02-24 18:08:35, Liu Shixin wrote:
>>>> When the pagefault is not for write and the refault distance is close,
>>>> the page will be activated directly. If there are too many such pages in
>>>> a file, that means the pages may be reclaimed immediately.
>>>> In such situation, there is no positive effect to read-ahead since it will
>>>> only waste IO. So collect the number of such pages and when the number is
>>>> too large, stop bothering with read-ahead for a while until it decreased
>>>> automatically.
>>>>
>>>> Define 'too large' as 10000 experientially, which can solves the problem
>>>> and does not affect by the occasional active refault.
>>>>
>>>> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
>>> So I'm not convinced this new logic is needed. We already have
>>> ra->mmap_miss which gets incremented when a page fault has to read the page
>>> (and decremented when a page fault found the page already in cache). This
>>> should already work to detect trashing as well, shouldn't it? If it does
>>> not, why?
>>>
>>> 								Honza
>> ra->mmap_miss doesn't help, it increased only one in do_sync_mmap_readahead()
>> and then decreased one for every page in filemap_map_pages(). So in this scenario,
>> it can't exceed MMAP_LOTSAMISS.
> I see, OK. But that's a (longstanding) bug in how mmap_miss is handled. Can
> you please test whether attached patches fix the trashing for you? At least
> now I can see mmap_miss properly increments when we are hitting uncached
> pages...  Thanks!
>
> 								Honza
The patch doesn't seem to have much effect. I will try to analyze why it doesn't work.
The attached files are my test case.

Thanks,
>
#!/bin/bash

while true; do
    # Keep the memory hog running so memory stays under reclaim pressure.
    flag=$(ps -ef | grep -v grep | grep alloc_page | wc -l)
    if [ "$flag" -eq 0 ]; then
        /alloc_page &
    fi

    sleep 30

    # Time a package install, which maps and faults in a lot of file pages.
    start_time=$(date +%s)
    yum install -y expect > /dev/null 2>&1
    end_time=$(date +%s)

    elapsed_time=$((end_time - start_time))
    echo "$elapsed_time seconds"

    yum remove -y expect > /dev/null 2>&1
done
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

#define SIZE (1*1024*1024)  /* 1 MiB per allocation */

int main(void)
{
    void *ptr = NULL;
    int i;

    /*
     * Allocate and touch roughly 6GB minus 50MB of anonymous memory and
     * keep it resident, so that file pages have to be reclaimed and
     * refaulted. The blocks are intentionally held while the program
     * sleeps.
     */
    for (i = 0; i < 1024 * 6 - 50; i++) {
        ptr = malloc(SIZE);
        if (ptr == NULL) {
            printf("malloc err!\n");
            return -1;
        }

        memset(ptr, 0, SIZE);  /* fault the pages in */
    }

    sleep(99999);

    free(ptr);  /* only the last block; the rest is released on exit */
    return 0;
}
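
The memory hog is assumed to be built with something like
"gcc -O2 -o alloc_page alloc_page.c" and installed as /alloc_page, which is
where the script above looks for it.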
  
Liu Shixin Feb. 29, 2024, 9:01 a.m. UTC | #6
On 2024/2/2 17:02, Liu Shixin wrote:
>
> On 2024/2/2 1:31, Jan Kara wrote:
>> On Thu 01-02-24 18:41:30, Liu Shixin wrote:
>>> On 2024/2/1 17:37, Jan Kara wrote:
>>>> On Thu 01-02-24 18:08:35, Liu Shixin wrote:
>>>>> When the pagefault is not for write and the refault distance is close,
>>>>> the page will be activated directly. If there are too many such pages in
>>>>> a file, that means the pages may be reclaimed immediately.
>>>>> In such situation, there is no positive effect to read-ahead since it will
>>>>> only waste IO. So collect the number of such pages and when the number is
>>>>> too large, stop bothering with read-ahead for a while until it decreased
>>>>> automatically.
>>>>>
>>>>> Define 'too large' as 10000 experientially, which can solves the problem
>>>>> and does not affect by the occasional active refault.
>>>>>
>>>>> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
>>>> So I'm not convinced this new logic is needed. We already have
>>>> ra->mmap_miss which gets incremented when a page fault has to read the page
>>>> (and decremented when a page fault found the page already in cache). This
>>>> should already work to detect trashing as well, shouldn't it? If it does
>>>> not, why?
>>>>
>>>> 								Honza
>>> ra->mmap_miss doesn't help, it increased only one in do_sync_mmap_readahead()
>>> and then decreased one for every page in filemap_map_pages(). So in this scenario,
>>> it can't exceed MMAP_LOTSAMISS.
>> I see, OK. But that's a (longstanding) bug in how mmap_miss is handled. Can
>> you please test whether attached patches fix the trashing for you? At least
>> now I can see mmap_miss properly increments when we are hitting uncached
>> pages...  Thanks!
>>
>> 								Honza
> The patch doesn't seem to have much effect. I will try to analyze why it doesn't work.
> The attached file is my testcase.
>
> Thanks,

I think I figured out why mmap_miss doesn't work. After do_sync_mmap_readahead(),
there is a __filemap_get_folio() call to make sure the page is ready. The page is
then present in filemap_map_pages() as well, so mmap_miss is decremented once and
goes back to 0, and read-ahead is never stopped.
Overall, I don't think mmap_miss can solve this problem.
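
Roughly, the sequence for one major fault in this scenario is (a paraphrased
sketch of the call flow, not verbatim kernel code):

filemap_fault()
    do_sync_mmap_readahead()
        ra->mmap_miss++                 /* +1, still far below MMAP_LOTSAMISS */
        page_cache_ra_order()           /* read-around I/O is issued */
    __filemap_get_folio()               /* waits until the faulted page is ready */

filemap_map_pages()                     /* for this and the surrounding pages */
    ra->mmap_miss--                     /* -1 for every page found in cache */

Net effect: +1 followed by -1 (or more), so mmap_miss hovers around 0 and the
mmap_miss > MMAP_LOTSAMISS cutoff never fires, even though the data keeps
being read from disk.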

  

Patch

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a704951..f2a1825442f5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -960,6 +960,7 @@  struct fown_struct {
  *      the first of these pages is accessed.
  * @ra_pages: Maximum size of a readahead request, copied from the bdi.
  * @mmap_miss: How many mmap accesses missed in the page cache.
+ * @active_refault: Number of active page refault.
  * @prev_pos: The last byte in the most recent read request.
  *
  * When this structure is passed to ->readahead(), the "most recent"
@@ -971,6 +972,7 @@  struct file_ra_state {
 	unsigned int async_size;
 	unsigned int ra_pages;
 	unsigned int mmap_miss;
+	unsigned int active_refault;
 	loff_t prev_pos;
 };
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2df35e65557d2..da9eaf985dec4 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1256,6 +1256,7 @@  struct readahead_control {
 	pgoff_t _index;
 	unsigned int _nr_pages;
 	unsigned int _batch_count;
+	unsigned int _active_refault;
 	bool _workingset;
 	unsigned long _pflags;
 };
diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db7..4de80592ab270 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3037,6 +3037,7 @@  loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
 
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
+#define ACTIVE_REFAULT_LIMIT	(10000)
 /*
  * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
  * @vmf - the vm_fault for this fault.
@@ -3142,6 +3143,18 @@  static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	if (mmap_miss > MMAP_LOTSAMISS)
 		return fpin;
 
+	ractl._active_refault = READ_ONCE(ra->active_refault);
+	if (ractl._active_refault)
+		WRITE_ONCE(ra->active_refault, --ractl._active_refault);
+
+	/*
+	 * If there are a lot of refault of active pages in this file,
+	 * that means the memory reclaim is ongoing. Stop bothering with
+	 * read-ahead since it will only waste IO.
+	 */
+	if (ractl._active_refault >= ACTIVE_REFAULT_LIMIT)
+		return fpin;
+
 	/*
 	 * mmap read-around
 	 */
@@ -3151,6 +3164,9 @@  static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	ra->async_size = ra->ra_pages / 4;
 	ractl._index = ra->start;
 	page_cache_ra_order(&ractl, ra, 0);
+
+	WRITE_ONCE(ra->active_refault, ractl._active_refault);
+
 	return fpin;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index cc4abb67eb223..d79bb70a232c4 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -263,6 +263,10 @@  void page_cache_ra_unbounded(struct readahead_control *ractl,
 			folio_set_readahead(folio);
 		ractl->_workingset |= folio_test_workingset(folio);
 		ractl->_nr_pages++;
+		if (unlikely(folio_test_workingset(folio)))
+			ractl->_active_refault++;
+		else if (unlikely(ractl->_active_refault))
+			ractl->_active_refault--;
 	}
 
 	/*