diff mbox series

[08/24] mm/swap: check readahead policy per entry

Message ID	20231119194740.94101-9-ryncsn@gmail.com
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.37 as permitted sender) client-ip=23.128.96.37; From: Kairui Song <ryncsn@gmail.com> To: linux-mm@kvack.org Cc: Andrew Morton <akpm@linux-foundation.org>, "Huang, Ying" <ying.huang@intel.com>, David Hildenbrand <david@redhat.com>, Hugh Dickins <hughd@google.com>, Johannes Weiner <hannes@cmpxchg.org>, Matthew Wilcox <willy@infradead.org>, Michal Hocko <mhocko@suse.com>, linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com> Subject: [PATCH 08/24] mm/swap: check readahead policy per entry Date: Mon, 20 Nov 2023 03:47:24 +0800 Message-ID: <20231119194740.94101-9-ryncsn@gmail.com> In-Reply-To: <20231119194740.94101-1-ryncsn@gmail.com> References: <20231119194740.94101-1-ryncsn@gmail.com> Reply-To: Kairui Song <kasong@tencent.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	Swapin path refactor for optimization and bugfix \| [00/24] Swapin path refactor for optimization and bugfix [01/24] mm/swap: fix a potential undefined behavior issue [02/24] mm/swapfile.c: add back some comment [03/24] mm/swap: move no readahead swapin code to a stand alone helper [04/24] mm/swap: avoid setting page lock bit and doing extra unlock check [05/24] mm/swap: move readahead policy checking into swapin_readahead [06/24] swap: rework swapin_no_readahead arguments [07/24] mm/swap: move swap_count to header to be shared [08/24] mm/swap: check readahead policy per entry [09/24] mm/swap: inline __swap_count [10/24] mm/swap: remove nr_rotate_swap and related code [11/24] mm/swap: also handle swapcache lookup in swapin_readahead [12/24] mm/swap: simplify arguments for swap_cache_get_folio [13/24] swap: simplify swap_cache_get_folio [14/24] mm/swap: do shadow lookup as well when doing swap cache lookup [15/24] mm/swap: avoid an duplicated swap cache lookup for SYNCHRONOUS_IO device [16/24] mm/swap: reduce scope of get_swap_device in swapin path [17/24] mm/swap: fix false error when swapoff race with swapin [18/24] mm/swap: introduce a helper non fault swapin [19/24] shmem, swap: refactor error check on OOM or race [20/24] swap: simplify and make swap_find_cache static [21/24] swap: make swapin_readahead result checking argument mandatory [22/24] swap: make swap_cluster_readahead static [23/24] swap: fix multiple swap leak when after cgroup migrate [24/24] mm/swap: change swapin_readahead to swapin_page_fault

Commit Message

Kairui Song Nov. 19, 2023, 7:47 p.m. UTC

  From: Kairui Song <kasong@tencent.com>

Currently VMA readahead is globally disabled when any rotating disk is
used as a swap backend. So when multiple swap devices are enabled, if a
slower hard disk is set as a low-priority fallback and a high-performance
SSD is used as the high-priority swap device, VMA readahead is still
disabled globally, and the SSD swap device's performance drops by a lot.

Check readahead policy per entry to avoid such problem.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swap_state.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

Comments

Huang, Ying Nov. 20, 2023, 6:04 a.m. UTC | #1

Kairui Song <ryncsn@gmail.com> writes:

> From: Kairui Song <kasong@tencent.com>
>
> Currently VMA readahead is globally disabled when any rotate disk is
> used as swap backend. So multiple swap devices are enabled, if a slower
> hard disk is set as a low priority fallback, and a high performance SSD
> is used and high priority swap device, vma readahead is disabled globally.
> The SSD swap device performance will drop by a lot.
>
> Check readahead policy per entry to avoid such problem.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/swap_state.c | 12 +++++++-----
>  1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index ff6756f2e8e4..fb78f7f18ed7 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -321,9 +321,9 @@ static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_
>  	return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
>  }
>  
> -static inline bool swap_use_vma_readahead(void)
> +static inline bool swap_use_vma_readahead(struct swap_info_struct *si)
>  {
> -	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
> +	return data_race(si->flags & SWP_SOLIDSTATE) && READ_ONCE(enable_vma_readahead);
>  }
>  
>  /*
> @@ -341,7 +341,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
>  
>  	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
>  	if (!IS_ERR(folio)) {
> -		bool vma_ra = swap_use_vma_readahead();
> +		bool vma_ra = swap_use_vma_readahead(swp_swap_info(entry));
>  		bool readahead;
>  
>  		/*
> @@ -920,16 +920,18 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  			      struct vm_fault *vmf, bool *swapcached)
>  {
> +	struct swap_info_struct *si;
>  	struct mempolicy *mpol;
>  	struct page *page;
>  	pgoff_t ilx;
>  	bool cached;
>  
> +	si = swp_swap_info(entry);
>  	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
> -	if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> +	if (swap_use_no_readahead(si, entry)) {
>  		page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
>  		cached = false;
> -	} else if (swap_use_vma_readahead()) {
> +	} else if (swap_use_vma_readahead(si)) {

It's possible that some pages are swapped out to SSD while others are
swapped out to HDD in a readahead window.

I suspect that there are practical requirements to use swap on SSD and
HDD at the same time.

>  		page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
>  		cached = true;
>  	} else {

--
Best Regards,
Huang, Ying

Kairui Song Nov. 20, 2023, 11:17 a.m. UTC | #2

Huang, Ying <ying.huang@intel.com> 于2023年11月20日周一 14:07写道：
>
> Kairui Song <ryncsn@gmail.com> writes:
>
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently VMA readahead is globally disabled when any rotate disk is
> > used as swap backend. So multiple swap devices are enabled, if a slower
> > hard disk is set as a low priority fallback, and a high performance SSD
> > is used and high priority swap device, vma readahead is disabled globally.
> > The SSD swap device performance will drop by a lot.
> >
> > Check readahead policy per entry to avoid such problem.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/swap_state.c | 12 +++++++-----
> >  1 file changed, 7 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index ff6756f2e8e4..fb78f7f18ed7 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -321,9 +321,9 @@ static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_
> >       return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
> >  }
> >
> > -static inline bool swap_use_vma_readahead(void)
> > +static inline bool swap_use_vma_readahead(struct swap_info_struct *si)
> >  {
> > -     return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
> > +     return data_race(si->flags & SWP_SOLIDSTATE) && READ_ONCE(enable_vma_readahead);
> >  }
> >
> >  /*
> > @@ -341,7 +341,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
> >
> >       folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
> >       if (!IS_ERR(folio)) {
> > -             bool vma_ra = swap_use_vma_readahead();
> > +             bool vma_ra = swap_use_vma_readahead(swp_swap_info(entry));
> >               bool readahead;
> >
> >               /*
> > @@ -920,16 +920,18 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >                             struct vm_fault *vmf, bool *swapcached)
> >  {
> > +     struct swap_info_struct *si;
> >       struct mempolicy *mpol;
> >       struct page *page;
> >       pgoff_t ilx;
> >       bool cached;
> >
> > +     si = swp_swap_info(entry);
> >       mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
> > -     if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> > +     if (swap_use_no_readahead(si, entry)) {
> >               page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
> >               cached = false;
> > -     } else if (swap_use_vma_readahead()) {
> > +     } else if (swap_use_vma_readahead(si)) {
>
> It's possible that some pages are swapped out to SSD while others are
> swapped out to HDD in a readahead window.
>
> I suspect that there are practical requirements to use swap on SSD and
> HDD at the same time.

Hi Ying,

Thanks for the review!

For the first issue "fragmented readahead window", I was planning to
do an extra check in readahead path to skip readahead entries that are
on different swap devices, which is not hard to do, but this series is
growing too long so I thought it will be better done later.

For the second issue, "is there any practical use for multiple swap",
I think actually there are. For example we are trying to use multi
layer swap for offloading memory of different hotness on servers. And
we also tried to implement a mechanism to migrate long sleep swap
entries from high performance SSD/RAMDISK swap to cheap HDD swap
device, with more than two layers of swap, which worked except the
upstream issue, that readahead policy will no longer work as expected.


>
> >               page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
> >               cached = true;
> >       } else {
>
> --
> Best Regards,
> Huang, Ying

Huang, Ying Nov. 21, 2023, 1:10 a.m. UTC | #3

Kairui Song <ryncsn@gmail.com> writes:

> Huang, Ying <ying.huang@intel.com> 于2023年11月20日周一 14:07写道：
>>
>> Kairui Song <ryncsn@gmail.com> writes:
>>
>> > From: Kairui Song <kasong@tencent.com>
>> >
>> > Currently VMA readahead is globally disabled when any rotate disk is
>> > used as swap backend. So multiple swap devices are enabled, if a slower
>> > hard disk is set as a low priority fallback, and a high performance SSD
>> > is used and high priority swap device, vma readahead is disabled globally.
>> > The SSD swap device performance will drop by a lot.
>> >
>> > Check readahead policy per entry to avoid such problem.
>> >
>> > Signed-off-by: Kairui Song <kasong@tencent.com>
>> > ---
>> >  mm/swap_state.c | 12 +++++++-----
>> >  1 file changed, 7 insertions(+), 5 deletions(-)
>> >
>> > diff --git a/mm/swap_state.c b/mm/swap_state.c
>> > index ff6756f2e8e4..fb78f7f18ed7 100644
>> > --- a/mm/swap_state.c
>> > +++ b/mm/swap_state.c
>> > @@ -321,9 +321,9 @@ static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_
>> >       return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
>> >  }
>> >
>> > -static inline bool swap_use_vma_readahead(void)
>> > +static inline bool swap_use_vma_readahead(struct swap_info_struct *si)
>> >  {
>> > -     return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
>> > +     return data_race(si->flags & SWP_SOLIDSTATE) && READ_ONCE(enable_vma_readahead);
>> >  }
>> >
>> >  /*
>> > @@ -341,7 +341,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
>> >
>> >       folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
>> >       if (!IS_ERR(folio)) {
>> > -             bool vma_ra = swap_use_vma_readahead();
>> > +             bool vma_ra = swap_use_vma_readahead(swp_swap_info(entry));
>> >               bool readahead;
>> >
>> >               /*
>> > @@ -920,16 +920,18 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
>> >  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
>> >                             struct vm_fault *vmf, bool *swapcached)
>> >  {
>> > +     struct swap_info_struct *si;
>> >       struct mempolicy *mpol;
>> >       struct page *page;
>> >       pgoff_t ilx;
>> >       bool cached;
>> >
>> > +     si = swp_swap_info(entry);
>> >       mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
>> > -     if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
>> > +     if (swap_use_no_readahead(si, entry)) {
>> >               page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
>> >               cached = false;
>> > -     } else if (swap_use_vma_readahead()) {
>> > +     } else if (swap_use_vma_readahead(si)) {
>>
>> It's possible that some pages are swapped out to SSD while others are
>> swapped out to HDD in a readahead window.
>>
>> I suspect that there are practical requirements to use swap on SSD and
>> HDD at the same time.
>
> Hi Ying,
>
> Thanks for the review!
>
> For the first issue "fragmented readahead window", I was planning to
> do an extra check in readahead path to skip readahead entries that are
> on different swap devices, which is not hard to do,

This is a possible solution.

> but this series is growing too long so I thought it will be better
> done later.

You don't need to keep everything in one series.  Just use multiple
series, even if they are all swap-related.  They are, in fact, dealing
with different problems.

> For the second issue, "is there any practical use for multiple swap",
> I think actually there are. For example we are trying to use multi
> layer swap for offloading memory of different hotness on servers. And
> we also tried to implement a mechanism to migrate long sleep swap
> entries from high performance SSD/RAMDISK swap to cheap HDD swap
> device, with more than two layers of swap, which worked except the
> upstream issue, that readahead policy will no longer work as expected.

Thanks for your information.

>> >               page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
>> >               cached = true;
>> >       } else {

--
Best Regards,
Huang, Ying

Chris Li Nov. 21, 2023, 5:13 a.m. UTC | #4

On Mon, Nov 20, 2023 at 3:17 AM Kairui Song <ryncsn@gmail.com> wrote:
> > HDD at the same time.
>
> Hi Ying,
>
> Thanks for the review!
>
> For the first issue "fragmented readahead window", I was planning to
> do an extra check in readahead path to skip readahead entries that are

That makes sense. Readahead is an optional speed optimization. If the
readahead window crosses swap device boundaries, the readahead portion
can be capped.

> on different swap devices, which is not hard to do, but this series is
> growing too long so I thought it will be better done later.
>
> For the second issue, "is there any practical use for multiple swap",
> I think actually there are. For example we are trying to use multi
> layer swap for offloading memory of different hotness on servers. And
> we also tried to implement a mechanism to migrate long sleep swap
> entries from high performance SSD/RAMDISK swap to cheap HDD swap
> device, with more than two layers of swap, which worked except the
> upstream issue, that readahead policy will no longer work as expected.

Thank you very much for sharing your usage case. I am proposing
"memory.swap.tiers"  in this email thread:
https://lore.kernel.org/linux-mm/CAF8kJuOD6zq2VPcVdoZGvkzYX8iXn1akuYhNDJx-LUdS+Sx3GA@mail.gmail.com/
It allows memcg to select which swap device/tiers it wants to opt in.
Your SSD and HDD swap combination is what I have in mind as well.

Chris

Chris Li Nov. 21, 2023, 5:20 a.m. UTC | #5

On Mon, Nov 20, 2023 at 5:12 PM Huang, Ying <ying.huang@intel.com> wrote:
> > but this series is growing too long so I thought it will be better
> > done later.
>
> You don't need to keep everything in one series.  Just use multiple
> series.  Even if they are all swap-related.  They are dealing with
> different problem in fact.

I second that. Actually having multiple smaller series is *preferred*
over one long series.
Shorter series are easier to review.

Chris

Chris Li Nov. 21, 2023, 7:54 a.m. UTC | #6

On Sun, Nov 19, 2023 at 11:48 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Currently VMA readahead is globally disabled when any rotate disk is
> used as swap backend. So multiple swap devices are enabled, if a slower
> hard disk is set as a low priority fallback, and a high performance SSD
> is used and high priority swap device, vma readahead is disabled globally.
> The SSD swap device performance will drop by a lot.
>
> Check readahead policy per entry to avoid such problem.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/swap_state.c | 12 +++++++-----
>  1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index ff6756f2e8e4..fb78f7f18ed7 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -321,9 +321,9 @@ static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_
>         return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
>  }
>
> -static inline bool swap_use_vma_readahead(void)
> +static inline bool swap_use_vma_readahead(struct swap_info_struct *si)
>  {
> -       return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
> +       return data_race(si->flags & SWP_SOLIDSTATE) && READ_ONCE(enable_vma_readahead);

A very minor point:
I notice you changed the order so that enable_vma_readahead is checked last.
Normally, if enable_vma_readahead == 0, there is no need to check si->flags.
The si->flags check is more expensive than a simple memory load.
You might want to check enable_vma_readahead first so that you can
short-circuit the more expensive part.

Chris

Kairui Song Nov. 23, 2023, 10:52 a.m. UTC | #7

Chris Li <chrisl@kernel.org> 于2023年11月21日周二 15:54写道：
>
> On Sun, Nov 19, 2023 at 11:48 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently VMA readahead is globally disabled when any rotate disk is
> > used as swap backend. So multiple swap devices are enabled, if a slower
> > hard disk is set as a low priority fallback, and a high performance SSD
> > is used and high priority swap device, vma readahead is disabled globally.
> > The SSD swap device performance will drop by a lot.
> >
> > Check readahead policy per entry to avoid such problem.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/swap_state.c | 12 +++++++-----
> >  1 file changed, 7 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index ff6756f2e8e4..fb78f7f18ed7 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -321,9 +321,9 @@ static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_
> >         return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
> >  }
> >
> > -static inline bool swap_use_vma_readahead(void)
> > +static inline bool swap_use_vma_readahead(struct swap_info_struct *si)
> >  {
> > -       return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
> > +       return data_race(si->flags & SWP_SOLIDSTATE) && READ_ONCE(enable_vma_readahead);
>
> A very minor point:
> I notice you change the order enable_vma_readahead to the last.
> Normally if enable_vma_reachahead == 0, there is no need to check the si->flags.
> The si->flags check is more expensive than simple memory load.
> You might want to check enable_vma_readahead first then you can short
> cut the more expensive part.

Thanks, I'll improve this part.

diff mbox series

Patch

diff --git a/mm/swap_state.c b/mm/swap_state.c
index ff6756f2e8e4..fb78f7f18ed7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -321,9 +321,9 @@  static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_
 	return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
 }
 
-static inline bool swap_use_vma_readahead(void)
+static inline bool swap_use_vma_readahead(struct swap_info_struct *si)
 {
-	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
+	return data_race(si->flags & SWP_SOLIDSTATE) && READ_ONCE(enable_vma_readahead);
 }
 
 /*
@@ -341,7 +341,7 @@  struct folio *swap_cache_get_folio(swp_entry_t entry,
 
 	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
 	if (!IS_ERR(folio)) {
-		bool vma_ra = swap_use_vma_readahead();
+		bool vma_ra = swap_use_vma_readahead(swp_swap_info(entry));
 		bool readahead;
 
 		/*
@@ -920,16 +920,18 @@  static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 			      struct vm_fault *vmf, bool *swapcached)
 {
+	struct swap_info_struct *si;
 	struct mempolicy *mpol;
 	struct page *page;
 	pgoff_t ilx;
 	bool cached;
 
+	si = swp_swap_info(entry);
 	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
-	if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
+	if (swap_use_no_readahead(si, entry)) {
 		page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
 		cached = false;
-	} else if (swap_use_vma_readahead()) {
+	} else if (swap_use_vma_readahead(si)) {
 		page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
 		cached = true;
 	} else {