[18/24] mm/swap: introduce a helper non fault swapin

Message ID 20231119194740.94101-19-ryncsn@gmail.com
State New
Headers
Series Swapin path refactor for optimization and bugfix |

Commit Message

Kairui Song Nov. 19, 2023, 7:47 p.m. UTC
  From: Kairui Song <kasong@tencent.com>

There are two places where swapin is not directly caused by a page fault:
shmem swapin is invoked through the shmem mapping, and swapoff causes swapin
by walking the page table. They used to construct a pseudo vmfault struct
for the swapin function.

Shmem has dropped the pseudo vmfault recently in commit ddc1a5cbc05d
("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). Swapoff
path is still using a pseudo vmfault.

Introduce a helper for them both. This helps save stack usage for the swapoff
path, and helps apply a unified swapin cache and readahead policy check.

Also prepare for follow up commits.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c      | 51 ++++++++++++++++---------------------------------
 mm/swap.h       | 11 +++++++++++
 mm/swap_state.c | 38 ++++++++++++++++++++++++++++++++++++
 mm/swapfile.c   | 23 +++++++++++-----------
 4 files changed, 76 insertions(+), 47 deletions(-)
  

Comments

kernel test robot Nov. 20, 2023, 1:07 a.m. UTC | #1
Hi Kairui,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.7-rc2 next-20231117]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/mm-swap-fix-a-potential-undefined-behavior-issue/20231120-035926
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20231119194740.94101-19-ryncsn%40gmail.com
patch subject: [PATCH 18/24] mm/swap: introduce a helper non fault swapin
config: i386-buildonly-randconfig-002-20231120 (https://download.01.org/0day-ci/archive/20231120/202311200850.FrQj7bMD-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231120/202311200850.FrQj7bMD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311200850.FrQj7bMD-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from mm/shmem.c:43:
   mm/swap.h:105:31: warning: declaration of 'enum swap_cache_result' will not be visible outside of this function [-Wvisibility]
                           struct vm_fault *vmf, enum swap_cache_result *result)
                                                      ^
   mm/swap.h:112:8: warning: declaration of 'enum swap_cache_result' will not be visible outside of this function [-Wvisibility]
                   enum swap_cache_result *result)
                        ^
>> mm/shmem.c:1841:25: error: variable has incomplete type 'enum swap_cache_result'
           enum swap_cache_result result;
                                  ^
   mm/shmem.c:1841:7: note: forward declaration of 'enum swap_cache_result'
           enum swap_cache_result result;
                ^
>> mm/shmem.c:1870:31: error: use of undeclared identifier 'SWAP_CACHE_HIT'
                   if (fault_type && result != SWAP_CACHE_HIT) {
                                               ^
>> mm/shmem.c:1879:17: error: use of undeclared identifier 'SWAP_CACHE_BYPASS'
           if ((result != SWAP_CACHE_BYPASS && !folio_test_swapcache(folio)) ||
                          ^
   2 warnings and 3 errors generated.


vim +1841 mm/shmem.c

  1827	
  1828	/*
  1829	 * Swap in the folio pointed to by *foliop.
  1830	 * Caller has to make sure that *foliop contains a valid swapped folio.
  1831	 * Returns 0 and the folio in foliop if success. On failure, returns the
  1832	 * error code and NULL in *foliop.
  1833	 */
  1834	static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  1835				     struct folio **foliop, enum sgp_type sgp,
  1836				     gfp_t gfp, struct mm_struct *fault_mm,
  1837				     vm_fault_t *fault_type)
  1838	{
  1839		struct address_space *mapping = inode->i_mapping;
  1840		struct shmem_inode_info *info = SHMEM_I(inode);
> 1841		enum swap_cache_result result;
  1842		struct folio *folio = NULL;
  1843		struct mempolicy *mpol;
  1844		struct page *page;
  1845		swp_entry_t swap;
  1846		pgoff_t ilx;
  1847		int error;
  1848	
  1849		VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
  1850		swap = radix_to_swp_entry(*foliop);
  1851		*foliop = NULL;
  1852	
  1853		if (is_poisoned_swp_entry(swap))
  1854			return -EIO;
  1855	
  1856		mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
  1857		page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);
  1858		mpol_cond_put(mpol);
  1859	
  1860		if (PTR_ERR(page) == -EBUSY) {
  1861			if (!shmem_confirm_swap(mapping, index, swap))
  1862				return -EEXIST;
  1863			else
  1864				return -EINVAL;
  1865		} else if (!page) {
  1866			error = -ENOMEM;
  1867			goto failed;
  1868		} else {
  1869			folio = page_folio(page);
> 1870			if (fault_type && result != SWAP_CACHE_HIT) {
  1871				*fault_type |= VM_FAULT_MAJOR;
  1872				count_vm_event(PGMAJFAULT);
  1873				count_memcg_event_mm(fault_mm, PGMAJFAULT);
  1874			}
  1875		}
  1876	
  1877		/* We have to do this with folio locked to prevent races */
  1878		folio_lock(folio);
> 1879		if ((result != SWAP_CACHE_BYPASS && !folio_test_swapcache(folio)) ||
  1880		    folio->swap.val != swap.val ||
  1881		    !shmem_confirm_swap(mapping, index, swap)) {
  1882			error = -EEXIST;
  1883			goto unlock;
  1884		}
  1885		if (!folio_test_uptodate(folio)) {
  1886			error = -EIO;
  1887			goto failed;
  1888		}
  1889		folio_wait_writeback(folio);
  1890	
  1891		/*
  1892		 * Some architectures may have to restore extra metadata to the
  1893		 * folio after reading from swap.
  1894		 */
  1895		arch_swap_restore(swap, folio);
  1896	
  1897		if (shmem_should_replace_folio(folio, gfp)) {
  1898			error = shmem_replace_folio(&folio, gfp, info, index);
  1899			if (error)
  1900				goto failed;
  1901		}
  1902	
  1903		error = shmem_add_to_page_cache(folio, mapping, index,
  1904						swp_to_radix_entry(swap), gfp);
  1905		if (error)
  1906			goto failed;
  1907	
  1908		shmem_recalc_inode(inode, 0, -1);
  1909	
  1910		if (sgp == SGP_WRITE)
  1911			folio_mark_accessed(folio);
  1912	
  1913		delete_from_swap_cache(folio);
  1914		folio_mark_dirty(folio);
  1915		swap_free(swap);
  1916	
  1917		*foliop = folio;
  1918		return 0;
  1919	failed:
  1920		if (!shmem_confirm_swap(mapping, index, swap))
  1921			error = -EEXIST;
  1922		if (error == -EIO)
  1923			shmem_set_folio_swapin_error(inode, index, folio, swap);
  1924	unlock:
  1925		if (folio) {
  1926			folio_unlock(folio);
  1927			folio_put(folio);
  1928		}
  1929	
  1930		return error;
  1931	}
  1932
  
Chris Li Nov. 22, 2023, 4:40 a.m. UTC | #2
On Sun, Nov 19, 2023 at 11:49 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> There are two places where swapin is not direct caused by page fault:
> shmem swapin is invoked through shmem mapping, swapoff cause swapin by
> walking the page table. They used to construct a pseudo vmfault struct
> for swapin function.
>
> Shmem has dropped the pseudo vmfault recently in commit ddc1a5cbc05d
> ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). Swapoff
> path is still using a pseudo vmfault.
>
> Introduce a helper for them both, this help save stack usage for swapoff
> path, and help apply a unified swapin cache and readahead policy check.
>
> Also prepare for follow up commits.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/shmem.c      | 51 ++++++++++++++++---------------------------------
>  mm/swap.h       | 11 +++++++++++
>  mm/swap_state.c | 38 ++++++++++++++++++++++++++++++++++++
>  mm/swapfile.c   | 23 +++++++++++-----------
>  4 files changed, 76 insertions(+), 47 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index f9ce4067c742..81d129aa66d1 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1565,22 +1565,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
>  static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
>                         pgoff_t index, unsigned int order, pgoff_t *ilx);
>
> -static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
> -                       struct shmem_inode_info *info, pgoff_t index)
> -{
> -       struct mempolicy *mpol;
> -       pgoff_t ilx;
> -       struct page *page;
> -
> -       mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> -       page = swap_cluster_readahead(swap, gfp, mpol, ilx);
> -       mpol_cond_put(mpol);
> -
> -       if (!page)
> -               return NULL;
> -       return page_folio(page);
> -}
> -

Nice. Thank you.

>  /*
>   * Make sure huge_gfp is always more limited than limit_gfp.
>   * Some of the flags set permissions, while others set limitations.
> @@ -1854,9 +1838,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  {
>         struct address_space *mapping = inode->i_mapping;
>         struct shmem_inode_info *info = SHMEM_I(inode);
> -       struct swap_info_struct *si;
> +       enum swap_cache_result result;
>         struct folio *folio = NULL;
> +       struct mempolicy *mpol;
> +       struct page *page;
>         swp_entry_t swap;
> +       pgoff_t ilx;
>         int error;
>
>         VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> @@ -1866,34 +1853,30 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>         if (is_poisoned_swp_entry(swap))
>                 return -EIO;
>
> -       si = get_swap_device(swap);
> -       if (!si) {
> +       mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> +       page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);

Notice this "result" CAN be outdated. e.g. after this call, the swap
cache can be changed by another thread generating the swap page fault
and installing the folio into the swap cache or removing it.

> +       mpol_cond_put(mpol);
> +
> +       if (PTR_ERR(page) == -EBUSY) {
>                 if (!shmem_confirm_swap(mapping, index, swap))
>                         return -EEXIST;
Not your fault . The if statement already returned.
>                 else
This is not needed, the next return -EINVAL can be one less indent level.
>                         return -EINVAL;
> -       }
> -
> -       /* Look it up and read it in.. */
> -       folio = swap_cache_get_folio(swap, NULL, NULL);
> -       if (!folio) {
> -               /* Or update major stats only when swapin succeeds?? */
> -               if (fault_type) {
> +       } else if (!page) {
Don't need the else here because previous if statement always return.

> +               error = -ENOMEM;
> +               goto failed;
> +       } else {

Don't need the else here. Previous goto terminate the flow.

> +               folio = page_folio(page);
> +               if (fault_type && result != SWAP_CACHE_HIT) {
>                         *fault_type |= VM_FAULT_MAJOR;
>                         count_vm_event(PGMAJFAULT);
>                         count_memcg_event_mm(fault_mm, PGMAJFAULT);
>                 }
> -               /* Here we actually start the io */
> -               folio = shmem_swapin_cluster(swap, gfp, info, index);
> -               if (!folio) {
> -                       error = -ENOMEM;
> -                       goto failed;
> -               }
>         }
>
>         /* We have to do this with folio locked to prevent races */
>         folio_lock(folio);
> -       if (!folio_test_swapcache(folio) ||
> +       if ((result != SWAP_CACHE_BYPASS && !folio_test_swapcache(folio)) ||

I think there is a possible racing bug here. Because the result can be outdated.

>             folio->swap.val != swap.val ||
>             !shmem_confirm_swap(mapping, index, swap)) {
>                 error = -EEXIST;
> @@ -1930,7 +1913,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>         delete_from_swap_cache(folio);
>         folio_mark_dirty(folio);
>         swap_free(swap);
> -       put_swap_device(si);
>
>         *foliop = folio;
>         return 0;
> @@ -1944,7 +1926,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>                 folio_unlock(folio);
>                 folio_put(folio);
>         }
> -       put_swap_device(si);
>
>         return error;
>  }
> diff --git a/mm/swap.h b/mm/swap.h
> index da9deb5ba37d..b073c29c9790 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -62,6 +62,10 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
>                                     struct mempolicy *mpol, pgoff_t ilx);
>  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
>                               struct vm_fault *vmf, enum swap_cache_result *result);
> +struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
> +                                  struct mempolicy *mpol, pgoff_t ilx,
> +                                  struct mm_struct *mm,
> +                                  enum swap_cache_result *result);
>
>  static inline unsigned int folio_swap_flags(struct folio *folio)
>  {
> @@ -103,6 +107,13 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
>         return NULL;
>  }
>
> +static inline struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
> +               struct mempolicy *mpol, pgoff_t ilx, struct mm_struct *mm,
> +               enum swap_cache_result *result)
> +{
> +       return NULL;
> +}
> +
>  static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
>  {
>         return 0;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index ff8a166603d0..eef66757c615 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -956,6 +956,44 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
>         return page;
>  }
>
> +struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
> +                                  struct mempolicy *mpol, pgoff_t ilx,
> +                                  struct mm_struct *mm, enum swap_cache_result *result)

Can you come up with a better function name? E.g. one with no negative
words. The function should be named after what it does, not who calls it.
The caller usage might change over time.
I saw that swapin_page_non_fault() and swapin_readahead() are doing
similar things and with similar structure. Can you unify these two
somehow?

Chris

> +{
> +       enum swap_cache_result cache_result;
> +       struct swap_info_struct *si;
> +       void *shadow = NULL;
> +       struct folio *folio;
> +       struct page *page;
> +
> +       /* Prevent swapoff from happening to us */
> +       si = get_swap_device(entry);
> +       if (unlikely(!si))
> +               return ERR_PTR(-EBUSY);
> +
> +       folio = swap_cache_get_folio(entry, NULL, &shadow);
> +       if (folio) {
> +               page = folio_file_page(folio, swp_offset(entry));
> +               cache_result = SWAP_CACHE_HIT;
> +               goto done;
> +       }
> +
> +       if (swap_use_no_readahead(si, swp_offset(entry))) {
> +               page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, mm);
> +               if (shadow)
> +                       workingset_refault(page_folio(page), shadow);
> +               cache_result = SWAP_CACHE_BYPASS;
> +       } else {
> +               page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> +               cache_result = SWAP_CACHE_MISS;
> +       }
> +done:
> +       put_swap_device(si);
> +       if (result)
> +               *result = cache_result;
> +       return page;
> +}
> +
>  #ifdef CONFIG_SYSFS
>  static ssize_t vma_ra_enabled_show(struct kobject *kobj,
>                                      struct kobj_attribute *attr, char *buf)
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 925ad92486a4..f8c5096fe0f0 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1822,20 +1822,15 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>
>         si = swap_info[type];
>         do {
> +               int ret;
> +               pte_t ptent;
> +               pgoff_t ilx;
> +               swp_entry_t entry;
>                 struct page *page;
>                 unsigned long offset;
> +               struct mempolicy *mpol;
>                 unsigned char swp_count;
>                 struct folio *folio = NULL;
> -               swp_entry_t entry;
> -               int ret;
> -               pte_t ptent;
> -
> -               struct vm_fault vmf = {
> -                       .vma = vma,
> -                       .address = addr,
> -                       .real_address = addr,
> -                       .pmd = pmd,
> -               };
>
>                 if (!pte++) {
>                         pte = pte_offset_map(pmd, addr);
> @@ -1855,8 +1850,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>                 offset = swp_offset(entry);
>                 pte_unmap(pte);
>                 pte = NULL;
> -               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> -                                       &vmf, NULL);
> +
> +               mpol = get_vma_policy(vma, addr, 0, &ilx);
> +               page = swapin_page_non_fault(entry, GFP_HIGHUSER_MOVABLE,
> +                                            mpol, ilx, vma->vm_mm, NULL);
> +               mpol_cond_put(mpol);
> +
>                 if (IS_ERR(page))
>                         return PTR_ERR(page);
>                 else if (page)
> --
> 2.42.0
>
>
  
Kairui Song Nov. 28, 2023, 11:22 a.m. UTC | #3
Chris Li <chrisl@kernel.org> 于2023年11月22日周三 12:41写道:
>
> On Sun, Nov 19, 2023 at 11:49 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > There are two places where swapin is not direct caused by page fault:
> > shmem swapin is invoked through shmem mapping, swapoff cause swapin by
> > walking the page table. They used to construct a pseudo vmfault struct
> > for swapin function.
> >
> > Shmem has dropped the pseudo vmfault recently in commit ddc1a5cbc05d
> > ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). Swapoff
> > path is still using a pseudo vmfault.
> >
> > Introduce a helper for them both, this help save stack usage for swapoff
> > path, and help apply a unified swapin cache and readahead policy check.
> >
> > Also prepare for follow up commits.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/shmem.c      | 51 ++++++++++++++++---------------------------------
> >  mm/swap.h       | 11 +++++++++++
> >  mm/swap_state.c | 38 ++++++++++++++++++++++++++++++++++++
> >  mm/swapfile.c   | 23 +++++++++++-----------
> >  4 files changed, 76 insertions(+), 47 deletions(-)
> >
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index f9ce4067c742..81d129aa66d1 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1565,22 +1565,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
> >  static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
> >                         pgoff_t index, unsigned int order, pgoff_t *ilx);
> >
> > -static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
> > -                       struct shmem_inode_info *info, pgoff_t index)
> > -{
> > -       struct mempolicy *mpol;
> > -       pgoff_t ilx;
> > -       struct page *page;
> > -
> > -       mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> > -       page = swap_cluster_readahead(swap, gfp, mpol, ilx);
> > -       mpol_cond_put(mpol);
> > -
> > -       if (!page)
> > -               return NULL;
> > -       return page_folio(page);
> > -}
> > -
>
> Nice. Thank you.
>
> >  /*
> >   * Make sure huge_gfp is always more limited than limit_gfp.
> >   * Some of the flags set permissions, while others set limitations.
> > @@ -1854,9 +1838,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> >  {
> >         struct address_space *mapping = inode->i_mapping;
> >         struct shmem_inode_info *info = SHMEM_I(inode);
> > -       struct swap_info_struct *si;
> > +       enum swap_cache_result result;
> >         struct folio *folio = NULL;
> > +       struct mempolicy *mpol;
> > +       struct page *page;
> >         swp_entry_t swap;
> > +       pgoff_t ilx;
> >         int error;
> >
> >         VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> > @@ -1866,34 +1853,30 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> >         if (is_poisoned_swp_entry(swap))
> >                 return -EIO;
> >
> > -       si = get_swap_device(swap);
> > -       if (!si) {
> > +       mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> > +       page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);

Hi Chris,

I've been trying to address these issues in V2. Most issues in the other
patches have a straightforward solution, and some could be discussed in a
separate series, but I have come up with some thoughts here:

>
> Notice this "result" CAN be outdated. e.g. after this call, the swap
> cache can be changed by another thread generating the swap page fault
> and installing the folio into the swap cache or removing it.

This is true, and it seems a potential race also exist before this
series for direct (no swapcache) swapin path (do_swap_page) if I
understand it correctly:

In the do_swap_page path, multiple processes could swap in the page at the
same time (a page mapped once can still be shared by sub-threads), and
they could get different folios. The later pte lock and pte_same check
are not enough, because while one process is not holding the pte lock,
another process could read in the page, swap_free the entry, then swap out
the page again using the same entry: an ABA problem. The race is not likely
to happen in reality, but it is possible in theory.

The same issue exists for shmem here: there are
shmem_confirm_swap/shmem_add_to_page_cache checks later to prevent
re-installing into the shmem mapping for direct swapin, but they are also
not enough. Another process could read in and re-swap-out using the same
entry, so the mapping entry seems unchanged during the time window. Still
very unlikely to happen in reality, but not impossible.

When swapcache is used there is no such issue, since swap lock and
swap_map are used to sync all readers, and while one reader is still
holding the folio, the entry is locked through swapcache, or if a
folio is removed from swapcache, folio_test_swapcache will fail, and
the reader could retry.

I'm trying to come up with a better locking for direct swap in, am I
missing anything here? Correct me if I get it wrong...
  
Chris Li Dec. 13, 2023, 2:22 a.m. UTC | #4
On Tue, Nov 28, 2023 at 3:22 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> > >  /*
> > >   * Make sure huge_gfp is always more limited than limit_gfp.
> > >   * Some of the flags set permissions, while others set limitations.
> > > @@ -1854,9 +1838,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> > >  {
> > >         struct address_space *mapping = inode->i_mapping;
> > >         struct shmem_inode_info *info = SHMEM_I(inode);
> > > -       struct swap_info_struct *si;
> > > +       enum swap_cache_result result;
> > >         struct folio *folio = NULL;
> > > +       struct mempolicy *mpol;
> > > +       struct page *page;
> > >         swp_entry_t swap;
> > > +       pgoff_t ilx;
> > >         int error;
> > >
> > >         VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> > > @@ -1866,34 +1853,30 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> > >         if (is_poisoned_swp_entry(swap))
> > >                 return -EIO;
> > >
> > > -       si = get_swap_device(swap);
> > > -       if (!si) {
> > > +       mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> > > +       page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);
>
> Hi Chris,
>
> I've been trying to address these issues in V2, most issue in other
> patches have a straight solution, some could be discuss in seperate
> series, but I come up with some thoughts here:
>
> >
> > Notice this "result" CAN be outdated. e.g. after this call, the swap
> > cache can be changed by another thread generating the swap page fault
> > and installing the folio into the swap cache or removing it.
>
> This is true, and it seems a potential race also exist before this
> series for direct (no swapcache) swapin path (do_swap_page) if I
> understand it correctly:

I just noticed I missed this email while I was cleaning up my email
archive. Sorry for the late reply. Traveling does not help either.

I am not aware of any swapin race bugs in the existing code. Races,
yes. If you discover a code path where a race causes a bug, please
report it.
>
> In do_swap_page path, multiple process could swapin the page at the
> same time (a mapped once page can still be shared by sub threads),
> they could get different folios. The later pte lock and pte_same check
> is not enough, because while one process is not holding the pte lock,
> another process could read-in, swap_free the entry, then swap-out the
> page again, using same entry, an ABA problem. The race is not likely
> to happen in reality but in theory possible.

Have you taken into account that if the page was locked, then it
wasn't able to change from the swapcache? I think the swap cache find
and get function will return the page locked. Then swapcache will not
be able to change the mapping as long as the page is still locked.

>
> Same issue for shmem here, there are
> shmem_confirm_swap/shmem_add_to_page_cache check later to prevent
> re-installing into shmem mapping for direct swap in, but also not
> enough. Other process could read-in and re-swapout using same entry so
> the mapping entry seems unchanged during the time window. Still very
> unlikely to happen in reality, but not impossible.

Please take a look again with the page lock information. Report back
if you still think there is a racing bug in the existing code. We can
take a closer look at the concurrent call stack to trigger the bug.

Chris

>
> When swapcache is used there is no such issue, since swap lock and
> swap_map are used to sync all readers, and while one reader is still
> holding the folio, the entry is locked through swapcache, or if a
> folio is removed from swapcache, folio_test_swapcache will fail, and
> the reader could retry.
>
> I'm trying to come up with a better locking for direct swap in, am I
> missing anything here? Correct me if I get it wrong...
>
  

Patch

diff --git a/mm/shmem.c b/mm/shmem.c
index f9ce4067c742..81d129aa66d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1565,22 +1565,6 @@  static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	struct mempolicy *mpol;
-	pgoff_t ilx;
-	struct page *page;
-
-	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
-	page = swap_cluster_readahead(swap, gfp, mpol, ilx);
-	mpol_cond_put(mpol);
-
-	if (!page)
-		return NULL;
-	return page_folio(page);
-}
-
 /*
  * Make sure huge_gfp is always more limited than limit_gfp.
  * Some of the flags set permissions, while others set limitations.
@@ -1854,9 +1838,12 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct swap_info_struct *si;
+	enum swap_cache_result result;
 	struct folio *folio = NULL;
+	struct mempolicy *mpol;
+	struct page *page;
 	swp_entry_t swap;
+	pgoff_t ilx;
 	int error;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -1866,34 +1853,30 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (is_poisoned_swp_entry(swap))
 		return -EIO;
 
-	si = get_swap_device(swap);
-	if (!si) {
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);
+	mpol_cond_put(mpol);
+
+	if (PTR_ERR(page) == -EBUSY) {
 		if (!shmem_confirm_swap(mapping, index, swap))
 			return -EEXIST;
 		else
 			return -EINVAL;
-	}
-
-	/* Look it up and read it in.. */
-	folio = swap_cache_get_folio(swap, NULL, NULL);
-	if (!folio) {
-		/* Or update major stats only when swapin succeeds?? */
-		if (fault_type) {
+	} else if (!page) {
+		error = -ENOMEM;
+		goto failed;
+	} else {
+		folio = page_folio(page);
+		if (fault_type && result != SWAP_CACHE_HIT) {
 			*fault_type |= VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
-		/* Here we actually start the io */
-		folio = shmem_swapin_cluster(swap, gfp, info, index);
-		if (!folio) {
-			error = -ENOMEM;
-			goto failed;
-		}
 	}
 
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
-	if (!folio_test_swapcache(folio) ||
+	if ((result != SWAP_CACHE_BYPASS && !folio_test_swapcache(folio)) ||
 	    folio->swap.val != swap.val ||
 	    !shmem_confirm_swap(mapping, index, swap)) {
 		error = -EEXIST;
@@ -1930,7 +1913,6 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	delete_from_swap_cache(folio);
 	folio_mark_dirty(folio);
 	swap_free(swap);
-	put_swap_device(si);
 
 	*foliop = folio;
 	return 0;
@@ -1944,7 +1926,6 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		folio_unlock(folio);
 		folio_put(folio);
 	}
-	put_swap_device(si);
 
 	return error;
 }
diff --git a/mm/swap.h b/mm/swap.h
index da9deb5ba37d..b073c29c9790 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -62,6 +62,10 @@  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
 			      struct vm_fault *vmf, enum swap_cache_result *result);
+struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+				   struct mempolicy *mpol, pgoff_t ilx,
+				   struct mm_struct *mm,
+				   enum swap_cache_result *result);
 
 static inline unsigned int folio_swap_flags(struct folio *folio)
 {
@@ -103,6 +107,13 @@  static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
+static inline struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+		struct mempolicy *mpol, pgoff_t ilx, struct mm_struct *mm,
+		enum swap_cache_result *result)
+{
+	return NULL;
+}
+
 static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 {
 	return 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ff8a166603d0..eef66757c615 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -956,6 +956,44 @@  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	return page;
 }
 
+struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+				   struct mempolicy *mpol, pgoff_t ilx,
+				   struct mm_struct *mm, enum swap_cache_result *result)
+{
+	enum swap_cache_result cache_result;
+	struct swap_info_struct *si;
+	void *shadow = NULL;
+	struct folio *folio;
+	struct page *page;
+
+	/* Prevent swapoff from happening to us */
+	si = get_swap_device(entry);
+	if (unlikely(!si))
+		return ERR_PTR(-EBUSY);
+
+	folio = swap_cache_get_folio(entry, NULL, &shadow);
+	if (folio) {
+		page = folio_file_page(folio, swp_offset(entry));
+		cache_result = SWAP_CACHE_HIT;
+		goto done;
+	}
+
+	if (swap_use_no_readahead(si, swp_offset(entry))) {
+		page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, mm);
+		if (shadow)
+			workingset_refault(page_folio(page), shadow);
+		cache_result = SWAP_CACHE_BYPASS;
+	} else {
+		page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+		cache_result = SWAP_CACHE_MISS;
+	}
+done:
+	put_swap_device(si);
+	if (result)
+		*result = cache_result;
+	return page;
+}
+
 #ifdef CONFIG_SYSFS
 static ssize_t vma_ra_enabled_show(struct kobject *kobj,
 				     struct kobj_attribute *attr, char *buf)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 925ad92486a4..f8c5096fe0f0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1822,20 +1822,15 @@  static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	si = swap_info[type];
 	do {
+		int ret;
+		pte_t ptent;
+		pgoff_t ilx;
+		swp_entry_t entry;
 		struct page *page;
 		unsigned long offset;
+		struct mempolicy *mpol;
 		unsigned char swp_count;
 		struct folio *folio = NULL;
-		swp_entry_t entry;
-		int ret;
-		pte_t ptent;
-
-		struct vm_fault vmf = {
-			.vma = vma,
-			.address = addr,
-			.real_address = addr,
-			.pmd = pmd,
-		};
 
 		if (!pte++) {
 			pte = pte_offset_map(pmd, addr);
@@ -1855,8 +1850,12 @@  static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		offset = swp_offset(entry);
 		pte_unmap(pte);
 		pte = NULL;
-		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-					&vmf, NULL);
+
+		mpol = get_vma_policy(vma, addr, 0, &ilx);
+		page = swapin_page_non_fault(entry, GFP_HIGHUSER_MOVABLE,
+					     mpol, ilx, vma->vm_mm, NULL);
+		mpol_cond_put(mpol);
+
 		if (IS_ERR(page))
 			return PTR_ERR(page);
 		else if (page)