[-V2,2/5] swap, __read_swap_cache_async(): enlarge get/put_swap_device protection range
Commit Message
This makes the function a little easier to be understood because we
don't need to consider swapoff. And this makes it possible to remove
get/put_swap_device() calling in some functions called by
__read_swap_cache_async().
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
---
mm/swap_state.c | 33 ++++++++++++++++++++++-----------
1 file changed, 22 insertions(+), 11 deletions(-)
Comments
On 22.05.23 09:09, Huang Ying wrote:
> This makes the function a little easier to be understood because we
> don't need to consider swapoff. And this makes it possible to remove
> get/put_swap_device() calling in some functions called by
> __read_swap_cache_async().
>
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Tim Chen <tim.c.chen@linux.intel.com>
> Cc: Yang Shi <shy828301@gmail.com>
> Cc: Yu Zhao <yuzhao@google.com>
> ---
> mm/swap_state.c | 33 ++++++++++++++++++++++-----------
> 1 file changed, 22 insertions(+), 11 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b76a65ac28b3..a1028fe7214e 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -417,9 +417,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> {
> struct swap_info_struct *si;
> struct folio *folio;
> + struct page *page;
> void *shadow = NULL;
>
> *new_page_allocated = false;
> + si = get_swap_device(entry);
> + if (!si)
> + return NULL;
>
> for (;;) {
> int err;
> @@ -428,14 +432,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> * called after swap_cache_get_folio() failed, re-calling
> * that would confuse statistics.
> */
> - si = get_swap_device(entry);
> - if (!si)
> - return NULL;
> folio = filemap_get_folio(swap_address_space(entry),
> swp_offset(entry));
> - put_swap_device(si);
> - if (!IS_ERR(folio))
> - return folio_file_page(folio, swp_offset(entry));
> + if (!IS_ERR(folio)) {
> + page = folio_file_page(folio, swp_offset(entry));
> + goto got_page;
> + }
>
> /*
> * Just skip read ahead for unused swap slot.
> @@ -445,8 +447,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> * as SWAP_HAS_CACHE. That's done in later part of code or
> * else swap_off will be aborted if we return NULL.
> */
> - if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
> - return NULL;
> + if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
> + goto fail;
>
> /*
> * Get a new page to read into from swap. Allocate it now,
> @@ -455,7 +457,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> */
> folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
> if (!folio)
> - return NULL;
> + goto fail;
>
> /*
> * Swap entry may have been freed since our caller observed it.
> @@ -466,7 +468,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>
> folio_put(folio);
> if (err != -EEXIST)
> - return NULL;
> + goto fail;
>
> /*
> * We might race against __delete_from_swap_cache(), and
> @@ -500,12 +502,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> /* Caller will initiate read into locked folio */
> folio_add_lru(folio);
> *new_page_allocated = true;
> - return &folio->page;
> + page = &folio->page;
> +got_page:
> + put_swap_device(si);
> + return page;
>
> fail_unlock:
> put_swap_folio(folio, entry);
> folio_unlock(folio);
> folio_put(folio);
> +fail:
Maybe better "fail_put_swap".
We now hold the swap device ref longer than we used to, prevent swapoff
over the whole operation. I guess there is no way we can deadlock this way?
In general, looks good to me.
David Hildenbrand <david@redhat.com> writes:
> On 22.05.23 09:09, Huang Ying wrote:
>> This makes the function a little easier to be understood because we
>> don't need to consider swapoff. And this makes it possible to remove
>> get/put_swap_device() calling in some functions called by
>> __read_swap_cache_async().
>> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Tim Chen <tim.c.chen@linux.intel.com>
>> Cc: Yang Shi <shy828301@gmail.com>
>> Cc: Yu Zhao <yuzhao@google.com>
>> ---
>> mm/swap_state.c | 33 ++++++++++++++++++++++-----------
>> 1 file changed, 22 insertions(+), 11 deletions(-)
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index b76a65ac28b3..a1028fe7214e 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -417,9 +417,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>> {
>> struct swap_info_struct *si;
>> struct folio *folio;
>> + struct page *page;
>> void *shadow = NULL;
>> *new_page_allocated = false;
>> + si = get_swap_device(entry);
>> + if (!si)
>> + return NULL;
>> for (;;) {
>> int err;
>> @@ -428,14 +432,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>> * called after swap_cache_get_folio() failed, re-calling
>> * that would confuse statistics.
>> */
>> - si = get_swap_device(entry);
>> - if (!si)
>> - return NULL;
>> folio = filemap_get_folio(swap_address_space(entry),
>> swp_offset(entry));
>> - put_swap_device(si);
>> - if (!IS_ERR(folio))
>> - return folio_file_page(folio, swp_offset(entry));
>> + if (!IS_ERR(folio)) {
>> + page = folio_file_page(folio, swp_offset(entry));
>> + goto got_page;
>> + }
>> /*
>> * Just skip read ahead for unused swap slot.
>> @@ -445,8 +447,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>> * as SWAP_HAS_CACHE. That's done in later part of code or
>> * else swap_off will be aborted if we return NULL.
>> */
>> - if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
>> - return NULL;
>> + if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
>> + goto fail;
>> /*
>> * Get a new page to read into from swap. Allocate it now,
>> @@ -455,7 +457,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>> */
>> folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
>> if (!folio)
>> - return NULL;
>> + goto fail;
>> /*
>> * Swap entry may have been freed since our caller observed it.
>> @@ -466,7 +468,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>> folio_put(folio);
>> if (err != -EEXIST)
>> - return NULL;
>> + goto fail;
>> /*
>> * We might race against __delete_from_swap_cache(), and
>> @@ -500,12 +502,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>> /* Caller will initiate read into locked folio */
>> folio_add_lru(folio);
>> *new_page_allocated = true;
>> - return &folio->page;
>> + page = &folio->page;
>> +got_page:
>> + put_swap_device(si);
>> + return page;
>> fail_unlock:
>> put_swap_folio(folio, entry);
>> folio_unlock(folio);
>> folio_put(folio);
>> +fail:
>
> Maybe better "fail_put_swap".
>
> We now hold the swap device ref longer than we used to, prevent
> swapoff over the whole operation. I guess there is no way we can
> deadlock this way?
I think that we are safe. In swapoff() syscall, we call
percpu_ref_kill() after all pages are swapped in (via try_to_unuse()).
> In general, looks good to me.
Thanks!
Best Regards,
Huang, Ying
@@ -417,9 +417,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct folio *folio;
+ struct page *page;
void *shadow = NULL;
*new_page_allocated = false;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
for (;;) {
int err;
@@ -428,14 +432,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry),
swp_offset(entry));
- put_swap_device(si);
- if (!IS_ERR(folio))
- return folio_file_page(folio, swp_offset(entry));
+ if (!IS_ERR(folio)) {
+ page = folio_file_page(folio, swp_offset(entry));
+ goto got_page;
+ }
/*
* Just skip read ahead for unused swap slot.
@@ -445,8 +447,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* as SWAP_HAS_CACHE. That's done in later part of code or
* else swap_off will be aborted if we return NULL.
*/
- if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- return NULL;
+ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ goto fail;
/*
* Get a new page to read into from swap. Allocate it now,
@@ -455,7 +457,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
if (!folio)
- return NULL;
+ goto fail;
/*
* Swap entry may have been freed since our caller observed it.
@@ -466,7 +468,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
folio_put(folio);
if (err != -EEXIST)
- return NULL;
+ goto fail;
/*
* We might race against __delete_from_swap_cache(), and
@@ -500,12 +502,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* Caller will initiate read into locked folio */
folio_add_lru(folio);
*new_page_allocated = true;
- return &folio->page;
+ page = &folio->page;
+got_page:
+ put_swap_device(si);
+ return page;
fail_unlock:
put_swap_folio(folio, entry);
folio_unlock(folio);
folio_put(folio);
+fail:
+ put_swap_device(si);
return NULL;
}
@@ -514,6 +521,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
+ *
+ * get/put_swap_device() aren't needed to call this function, because
+ * __read_swap_cache_async() call them and swap_readpage() holds the
+ * swap cache folio lock.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,