[11/24] mm/swap: also handle swapcache lookup in swapin_readahead

Message ID 20231119194740.94101-12-ryncsn@gmail.com
State New
Series: Swapin path refactor for optimization and bugfix

Commit Message

Kairui Song Nov. 19, 2023, 7:47 p.m. UTC
  From: Kairui Song <kasong@tencent.com>

No feature change, just prepare for later commits.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/memory.c     | 61 +++++++++++++++++++++++--------------------------
 mm/swap.h       | 10 ++++++--
 mm/swap_state.c | 26 +++++++++++++--------
 mm/swapfile.c   | 30 +++++++++++-------------
 4 files changed, 66 insertions(+), 61 deletions(-)
  

Comments

kernel test robot Nov. 20, 2023, 12:47 a.m. UTC | #1
Hi Kairui,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.7-rc2 next-20231117]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/mm-swap-fix-a-potential-undefined-behavior-issue/20231120-035926
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20231119194740.94101-12-ryncsn%40gmail.com
patch subject: [PATCH 11/24] mm/swap: also handle swapcache lookup in swapin_readahead
config: x86_64-allnoconfig (https://download.01.org/0day-ci/archive/20231120/202311200813.x056cCRJ-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231120/202311200813.x056cCRJ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311200813.x056cCRJ-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   In file included from mm/filemap.c:62:
>> mm/swap.h:101:52: warning: 'enum swap_cache_result' declared inside parameter list will not be visible outside of this definition or declaration
     101 |                         struct vm_fault *vmf, enum swap_cache_result *result)
         |                                                    ^~~~~~~~~~~~~~~~~
--
   In file included from mm/memory.c:93:
>> mm/swap.h:101:52: warning: 'enum swap_cache_result' declared inside parameter list will not be visible outside of this definition or declaration
     101 |                         struct vm_fault *vmf, enum swap_cache_result *result)
         |                                                    ^~~~~~~~~~~~~~~~~
   mm/memory.c: In function 'do_swap_page':
>> mm/memory.c:3790:32: error: storage size of 'cache_result' isn't known
    3790 |         enum swap_cache_result cache_result;
         |                                ^~~~~~~~~~~~
>> mm/memory.c:3857:37: error: 'SWAP_CACHE_HIT' undeclared (first use in this function); did you mean 'DQST_CACHE_HITS'?
    3857 |                 if (cache_result != SWAP_CACHE_HIT) {
         |                                     ^~~~~~~~~~~~~~
         |                                     DQST_CACHE_HITS
   mm/memory.c:3857:37: note: each undeclared identifier is reported only once for each function it appears in
>> mm/memory.c:3863:37: error: 'SWAP_CACHE_BYPASS' undeclared (first use in this function); did you mean 'SMP_CACHE_BYTES'?
    3863 |                 if (cache_result != SWAP_CACHE_BYPASS)
         |                                     ^~~~~~~~~~~~~~~~~
         |                                     SMP_CACHE_BYTES
>> mm/memory.c:3790:32: warning: unused variable 'cache_result' [-Wunused-variable]
    3790 |         enum swap_cache_result cache_result;
         |                                ^~~~~~~~~~~~


vim +3790 mm/memory.c

  3777	
  3778	/*
  3779	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3780	 * but allow concurrent faults), and pte mapped but not yet locked.
  3781	 * We return with pte unmapped and unlocked.
  3782	 *
  3783	 * We return with the mmap_lock locked or unlocked in the same cases
  3784	 * as does filemap_fault().
  3785	 */
  3786	vm_fault_t do_swap_page(struct vm_fault *vmf)
  3787	{
  3788		struct vm_area_struct *vma = vmf->vma;
  3789		struct folio *swapcache = NULL, *folio = NULL;
> 3790		enum swap_cache_result cache_result;
  3791		struct page *page;
  3792		struct swap_info_struct *si = NULL;
  3793		rmap_t rmap_flags = RMAP_NONE;
  3794		bool exclusive = false;
  3795		swp_entry_t entry;
  3796		pte_t pte;
  3797		vm_fault_t ret = 0;
  3798	
  3799		if (!pte_unmap_same(vmf))
  3800			goto out;
  3801	
  3802		entry = pte_to_swp_entry(vmf->orig_pte);
  3803		if (unlikely(non_swap_entry(entry))) {
  3804			if (is_migration_entry(entry)) {
  3805				migration_entry_wait(vma->vm_mm, vmf->pmd,
  3806						     vmf->address);
  3807			} else if (is_device_exclusive_entry(entry)) {
  3808				vmf->page = pfn_swap_entry_to_page(entry);
  3809				ret = remove_device_exclusive_entry(vmf);
  3810			} else if (is_device_private_entry(entry)) {
  3811				if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
  3812					/*
  3813					 * migrate_to_ram is not yet ready to operate
  3814					 * under VMA lock.
  3815					 */
  3816					vma_end_read(vma);
  3817					ret = VM_FAULT_RETRY;
  3818					goto out;
  3819				}
  3820	
  3821				vmf->page = pfn_swap_entry_to_page(entry);
  3822				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3823						vmf->address, &vmf->ptl);
  3824				if (unlikely(!vmf->pte ||
  3825					     !pte_same(ptep_get(vmf->pte),
  3826								vmf->orig_pte)))
  3827					goto unlock;
  3828	
  3829				/*
  3830				 * Get a page reference while we know the page can't be
  3831				 * freed.
  3832				 */
  3833				get_page(vmf->page);
  3834				pte_unmap_unlock(vmf->pte, vmf->ptl);
  3835				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3836				put_page(vmf->page);
  3837			} else if (is_hwpoison_entry(entry)) {
  3838				ret = VM_FAULT_HWPOISON;
  3839			} else if (is_pte_marker_entry(entry)) {
  3840				ret = handle_pte_marker(vmf);
  3841			} else {
  3842				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3843				ret = VM_FAULT_SIGBUS;
  3844			}
  3845			goto out;
  3846		}
  3847	
  3848		/* Prevent swapoff from happening to us. */
  3849		si = get_swap_device(entry);
  3850		if (unlikely(!si))
  3851			goto out;
  3852	
  3853		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3854					vmf, &cache_result);
  3855		if (page) {
  3856			folio = page_folio(page);
> 3857			if (cache_result != SWAP_CACHE_HIT) {
  3858				/* Had to read the page from swap area: Major fault */
  3859				ret = VM_FAULT_MAJOR;
  3860				count_vm_event(PGMAJFAULT);
  3861				count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3862			}
> 3863			if (cache_result != SWAP_CACHE_BYPASS)
  3864				swapcache = folio;
  3865			if (PageHWPoison(page)) {
  3866				/*
  3867				 * hwpoisoned dirty swapcache pages are kept for killing
  3868				 * owner processes (which may be unknown at hwpoison time)
  3869				 */
  3870				ret = VM_FAULT_HWPOISON;
  3871				goto out_release;
  3872			}
  3873		} else {
  3874			/*
  3875			 * Back out if somebody else faulted in this pte
  3876			 * while we released the pte lock.
  3877			 */
  3878			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3879					vmf->address, &vmf->ptl);
  3880			if (likely(vmf->pte &&
  3881				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
  3882				ret = VM_FAULT_OOM;
  3883			goto unlock;
  3884		}
  3885	
  3886		ret |= folio_lock_or_retry(folio, vmf);
  3887		if (ret & VM_FAULT_RETRY)
  3888			goto out_release;
  3889	
  3890		if (swapcache) {
  3891			/*
  3892			 * Make sure folio_free_swap() or swapoff did not release the
  3893			 * swapcache from under us.  The page pin, and pte_same test
  3894			 * below, are not enough to exclude that.  Even if it is still
  3895			 * swapcache, we need to check that the page's swap has not
  3896			 * changed.
  3897			 */
  3898			if (unlikely(!folio_test_swapcache(folio) ||
  3899				     page_swap_entry(page).val != entry.val))
  3900				goto out_page;
  3901	
  3902			/*
  3903			 * KSM sometimes has to copy on read faults, for example, if
  3904			 * page->index of !PageKSM() pages would be nonlinear inside the
  3905			 * anon VMA -- PageKSM() is lost on actual swapout.
  3906			 */
  3907			page = ksm_might_need_to_copy(page, vma, vmf->address);
  3908			if (unlikely(!page)) {
  3909				ret = VM_FAULT_OOM;
  3910				goto out_page;
  3911			} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
  3912				ret = VM_FAULT_HWPOISON;
  3913				goto out_page;
  3914			}
  3915			folio = page_folio(page);
  3916	
  3917			/*
  3918			 * If we want to map a page that's in the swapcache writable, we
  3919			 * have to detect via the refcount if we're really the exclusive
  3920			 * owner. Try removing the extra reference from the local LRU
  3921			 * caches if required.
  3922			 */
  3923			if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
  3924			    !folio_test_ksm(folio) && !folio_test_lru(folio))
  3925				lru_add_drain();
  3926		}
  3927	
  3928		folio_throttle_swaprate(folio, GFP_KERNEL);
  3929	
  3930		/*
  3931		 * Back out if somebody else already faulted in this pte.
  3932		 */
  3933		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3934				&vmf->ptl);
  3935		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
  3936			goto out_nomap;
  3937	
  3938		if (unlikely(!folio_test_uptodate(folio))) {
  3939			ret = VM_FAULT_SIGBUS;
  3940			goto out_nomap;
  3941		}
  3942	
  3943		/*
  3944		 * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
  3945		 * must never point at an anonymous page in the swapcache that is
  3946		 * PG_anon_exclusive. Sanity check that this holds and especially, that
  3947		 * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
  3948		 * check after taking the PT lock and making sure that nobody
  3949		 * concurrently faulted in this page and set PG_anon_exclusive.
  3950		 */
  3951		BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
  3952		BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
  3953	
  3954		/*
  3955		 * Check under PT lock (to protect against concurrent fork() sharing
  3956		 * the swap entry concurrently) for certainly exclusive pages.
  3957		 */
  3958		if (!folio_test_ksm(folio)) {
  3959			exclusive = pte_swp_exclusive(vmf->orig_pte);
  3960			if (folio != swapcache) {
  3961				/*
  3962				 * We have a fresh page that is not exposed to the
  3963				 * swapcache -> certainly exclusive.
  3964				 */
  3965				exclusive = true;
  3966			} else if (exclusive && folio_test_writeback(folio) &&
  3967				  data_race(si->flags & SWP_STABLE_WRITES)) {
  3968				/*
  3969				 * This is tricky: not all swap backends support
  3970				 * concurrent page modifications while under writeback.
  3971				 *
  3972				 * So if we stumble over such a page in the swapcache
  3973				 * we must not set the page exclusive, otherwise we can
  3974				 * map it writable without further checks and modify it
  3975				 * while still under writeback.
  3976				 *
  3977				 * For these problematic swap backends, simply drop the
  3978				 * exclusive marker: this is perfectly fine as we start
  3979				 * writeback only if we fully unmapped the page and
  3980				 * there are no unexpected references on the page after
  3981				 * unmapping succeeded. After fully unmapped, no
  3982				 * further GUP references (FOLL_GET and FOLL_PIN) can
  3983				 * appear, so dropping the exclusive marker and mapping
  3984				 * it only R/O is fine.
  3985				 */
  3986				exclusive = false;
  3987			}
  3988		}
  3989	
  3990		/*
  3991		 * Some architectures may have to restore extra metadata to the page
  3992		 * when reading from swap. This metadata may be indexed by swap entry
  3993		 * so this must be called before swap_free().
  3994		 */
  3995		arch_swap_restore(entry, folio);
  3996	
  3997		/*
  3998		 * Remove the swap entry and conditionally try to free up the swapcache.
  3999		 * We're already holding a reference on the page but haven't mapped it
  4000		 * yet.
  4001		 */
  4002		swap_free(entry);
  4003		if (should_try_to_free_swap(folio, vma, vmf->flags))
  4004			folio_free_swap(folio);
  4005	
  4006		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
  4007		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  4008		pte = mk_pte(page, vma->vm_page_prot);
  4009	
  4010		/*
  4011		 * Same logic as in do_wp_page(); however, optimize for pages that are
  4012		 * certainly not shared either because we just allocated them without
  4013		 * exposing them to the swapcache or because the swap entry indicates
  4014		 * exclusivity.
  4015		 */
  4016		if (!folio_test_ksm(folio) &&
  4017		    (exclusive || folio_ref_count(folio) == 1)) {
  4018			if (vmf->flags & FAULT_FLAG_WRITE) {
  4019				pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  4020				vmf->flags &= ~FAULT_FLAG_WRITE;
  4021			}
  4022			rmap_flags |= RMAP_EXCLUSIVE;
  4023		}
  4024		flush_icache_page(vma, page);
  4025		if (pte_swp_soft_dirty(vmf->orig_pte))
  4026			pte = pte_mksoft_dirty(pte);
  4027		if (pte_swp_uffd_wp(vmf->orig_pte))
  4028			pte = pte_mkuffd_wp(pte);
  4029		vmf->orig_pte = pte;
  4030	
  4031		/* ksm created a completely new copy */
  4032		if (unlikely(folio != swapcache && swapcache)) {
  4033			page_add_new_anon_rmap(page, vma, vmf->address);
  4034			folio_add_lru_vma(folio, vma);
  4035		} else {
  4036			page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
  4037		}
  4038	
  4039		VM_BUG_ON(!folio_test_anon(folio) ||
  4040				(pte_write(pte) && !PageAnonExclusive(page)));
  4041		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  4042		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  4043	
  4044		folio_unlock(folio);
  4045		if (folio != swapcache && swapcache) {
  4046			/*
  4047			 * Hold the lock to avoid the swap entry to be reused
  4048			 * until we take the PT lock for the pte_same() check
  4049			 * (to avoid false positives from pte_same). For
  4050			 * further safety release the lock after the swap_free
  4051			 * so that the swap count won't change under a
  4052			 * parallel locked swapcache.
  4053			 */
  4054			folio_unlock(swapcache);
  4055			folio_put(swapcache);
  4056		}
  4057	
  4058		if (vmf->flags & FAULT_FLAG_WRITE) {
  4059			ret |= do_wp_page(vmf);
  4060			if (ret & VM_FAULT_ERROR)
  4061				ret &= VM_FAULT_ERROR;
  4062			goto out;
  4063		}
  4064	
  4065		/* No need to invalidate - it was non-present before */
  4066		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
  4067	unlock:
  4068		if (vmf->pte)
  4069			pte_unmap_unlock(vmf->pte, vmf->ptl);
  4070	out:
  4071		if (si)
  4072			put_swap_device(si);
  4073		return ret;
  4074	out_nomap:
  4075		if (vmf->pte)
  4076			pte_unmap_unlock(vmf->pte, vmf->ptl);
  4077	out_page:
  4078		folio_unlock(folio);
  4079	out_release:
  4080		folio_put(folio);
  4081		if (folio != swapcache && swapcache) {
  4082			folio_unlock(swapcache);
  4083			folio_put(swapcache);
  4084		}
  4085		if (si)
  4086			put_swap_device(si);
  4087		return ret;
  4088	}
  4089
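
The warnings above point at the likely cause (an assumption based on the
messages, not a verified fix): the new enum swap_cache_result is defined
inside the CONFIG_SWAP section of mm/swap.h, while the !CONFIG_SWAP stub of
swapin_readahead() at mm/swap.h:101 still takes an enum swap_cache_result *
parameter, so an allnoconfig build never sees the type. A minimal sketch of
one possible fix is to define the enum before the #ifdef:

	/*
	 * Sketch only: keep the enum visible to both the CONFIG_SWAP and
	 * !CONFIG_SWAP declarations of swapin_readahead().
	 */
	enum swap_cache_result {
		SWAP_CACHE_HIT,
		SWAP_CACHE_MISS,
		SWAP_CACHE_BYPASS,
	};

	#ifdef CONFIG_SWAP
	struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				      struct vm_fault *vmf,
				      enum swap_cache_result *result);
	#else
	static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
				struct vm_fault *vmf, enum swap_cache_result *result)
	{
		return NULL;
	}
	#endif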
  
Chris Li Nov. 21, 2023, 4:06 p.m. UTC | #2
On Sun, Nov 19, 2023 at 11:48 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> No feature change, just prepare for later commits.

You need to have a proper commit message explaining why this change needs to happen.
"Preparing" is too generic; it does not give any real information.
For example, it seems you want to remove one swap cache lookup because
swapin_readahead() already does it?

I am a bit puzzled by this patch. It shuffles a lot of sensitive code,
but I do not see the value it adds on its own.
It seems like this patch should be merged with the later patch that
depends on it, so they can be judged together.

>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/memory.c     | 61 +++++++++++++++++++++++--------------------------
>  mm/swap.h       | 10 ++++++--
>  mm/swap_state.c | 26 +++++++++++++--------
>  mm/swapfile.c   | 30 +++++++++++-------------
>  4 files changed, 66 insertions(+), 61 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index f4237a2e3b93..22af9f3e8c75 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3786,13 +3786,13 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>         struct vm_area_struct *vma = vmf->vma;
> -       struct folio *swapcache, *folio = NULL;
> +       struct folio *swapcache = NULL, *folio = NULL;
> +       enum swap_cache_result cache_result;
>         struct page *page;
>         struct swap_info_struct *si = NULL;
>         rmap_t rmap_flags = RMAP_NONE;
>         bool exclusive = false;
>         swp_entry_t entry;
> -       bool swapcached;
>         pte_t pte;
>         vm_fault_t ret = 0;
>
> @@ -3850,42 +3850,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         if (unlikely(!si))
>                 goto out;
>
> -       folio = swap_cache_get_folio(entry, vma, vmf->address);
> -       if (folio)
> -               page = folio_file_page(folio, swp_offset(entry));
> -       swapcache = folio;

Is the motivation that swapin_readahead() already does a swap cache lookup, so you
can remove the lookup here?


> -
> -       if (!folio) {
> -               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> -                                       vmf, &swapcached);
> -               if (page) {
> -                       folio = page_folio(page);
> -                       if (swapcached)
> -                               swapcache = folio;
> -               } else {
> +       page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> +                               vmf, &cache_result);
> +       if (page) {
> +               folio = page_folio(page);
> +               if (cache_result != SWAP_CACHE_HIT) {
> +                       /* Had to read the page from swap area: Major fault */
> +                       ret = VM_FAULT_MAJOR;
> +                       count_vm_event(PGMAJFAULT);
> +                       count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> +               }
> +               if (cache_result != SWAP_CACHE_BYPASS)
> +                       swapcache = folio;
> +               if (PageHWPoison(page)) {

There is a lot of code shuffling here. From the diff it is hard to tell
whether it is doing the same thing as before.

>                         /*
> -                        * Back out if somebody else faulted in this pte
> -                        * while we released the pte lock.
> +                        * hwpoisoned dirty swapcache pages are kept for killing
> +                        * owner processes (which may be unknown at hwpoison time)
>                          */
> -                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -                                       vmf->address, &vmf->ptl);
> -                       if (likely(vmf->pte &&
> -                                  pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> -                               ret = VM_FAULT_OOM;
> -                       goto unlock;
> +                       ret = VM_FAULT_HWPOISON;
> +                       goto out_release;
>                 }
> -
> -               /* Had to read the page from swap area: Major fault */
> -               ret = VM_FAULT_MAJOR;
> -               count_vm_event(PGMAJFAULT);
> -               count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> -       } else if (PageHWPoison(page)) {
> +       } else {
>                 /*
> -                * hwpoisoned dirty swapcache pages are kept for killing
> -                * owner processes (which may be unknown at hwpoison time)
> +                * Back out if somebody else faulted in this pte
> +                * while we released the pte lock.
>                  */
> -               ret = VM_FAULT_HWPOISON;
> -               goto out_release;
> +               vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> +                               vmf->address, &vmf->ptl);
> +               if (likely(vmf->pte &&
> +                          pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> +                       ret = VM_FAULT_OOM;
> +               goto unlock;
>         }
>
>         ret |= folio_lock_or_retry(folio, vmf);
> diff --git a/mm/swap.h b/mm/swap.h
> index a9a654af791e..ac9136eee690 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -30,6 +30,12 @@ extern struct address_space *swapper_spaces[];
>         (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
>                 >> SWAP_ADDRESS_SPACE_SHIFT])
>
> +enum swap_cache_result {
> +       SWAP_CACHE_HIT,
> +       SWAP_CACHE_MISS,
> +       SWAP_CACHE_BYPASS,
> +};

Does any function later care about CACHE_BYPASS?

Again, it is better to introduce it together with the function that uses it. Don't
introduce it "just in case I might use it later".

> +
>  void show_swap_cache_info(void);
>  bool add_to_swap(struct folio *folio);
>  void *get_shadow_from_swap_cache(swp_entry_t entry);
> @@ -55,7 +61,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
>                                     struct mempolicy *mpol, pgoff_t ilx);
>  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> -                             struct vm_fault *vmf, bool *swapcached);
> +                             struct vm_fault *vmf, enum swap_cache_result *result);
>
>  static inline unsigned int folio_swap_flags(struct folio *folio)
>  {
> @@ -92,7 +98,7 @@ static inline struct page *swap_cluster_readahead(swp_entry_t entry,
>  }
>
>  static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
> -                       struct vm_fault *vmf, bool *swapcached)
> +                       struct vm_fault *vmf, enum swap_cache_result *result)
>  {
>         return NULL;
>  }
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index d87c20f9f7ec..e96d63bf8a22 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -908,8 +908,7 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
>   * @entry: swap entry of this memory
>   * @gfp_mask: memory allocation flags
>   * @vmf: fault information
> - * @swapcached: pointer to a bool used as indicator if the
> - *              page is swapped in through swapcache.
> + * @result: a return value to indicate swap cache usage.
>   *
>   * Returns the struct page for entry and addr, after queueing swapin.
>   *
> @@ -918,30 +917,39 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
>   * or vma-based(ie, virtual address based on faulty address) readahead.
>   */
>  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
> -                             struct vm_fault *vmf, bool *swapcached)
> +                             struct vm_fault *vmf, enum swap_cache_result *result)
>  {
> +       enum swap_cache_result cache_result;
>         struct swap_info_struct *si;
>         struct mempolicy *mpol;
> +       struct folio *folio;
>         struct page *page;
>         pgoff_t ilx;
> -       bool cached;
> +
> +       folio = swap_cache_get_folio(entry, vmf->vma, vmf->address);
> +       if (folio) {
> +               page = folio_file_page(folio, swp_offset(entry));
> +               cache_result = SWAP_CACHE_HIT;
> +               goto done;
> +       }
>
>         si = swp_swap_info(entry);
>         mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
>         if (swap_use_no_readahead(si, swp_offset(entry))) {
>                 page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
> -               cached = false;
> +               cache_result = SWAP_CACHE_BYPASS;
>         } else if (swap_use_vma_readahead(si)) {
>                 page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
> -               cached = true;
> +               cache_result = SWAP_CACHE_MISS;
>         } else {
>                 page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> -               cached = true;
> +               cache_result = SWAP_CACHE_MISS;
>         }
>         mpol_cond_put(mpol);
>
> -       if (swapcached)
> -               *swapcached = cached;
> +done:
> +       if (result)
> +               *result = cache_result;
>
>         return page;
>  }
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 01c3f53b6521..b6d57fff5e21 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1822,13 +1822,21 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>
>         si = swap_info[type];
>         do {
> -               struct folio *folio;
> +               struct page *page;
>                 unsigned long offset;
>                 unsigned char swp_count;
> +               struct folio *folio = NULL;
>                 swp_entry_t entry;
>                 int ret;
>                 pte_t ptent;
>
> +               struct vm_fault vmf = {
> +                       .vma = vma,
> +                       .address = addr,
> +                       .real_address = addr,
> +                       .pmd = pmd,
> +               };

Is this code move caused by skipping the swap cache lookup here?

This is very sensitive code related to swap cache racing. It needs
very careful reviewing. Better not to shuffle it without a good reason.

Chris

> +
>                 if (!pte++) {
>                         pte = pte_offset_map(pmd, addr);
>                         if (!pte)
> @@ -1847,22 +1855,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>                 offset = swp_offset(entry);
>                 pte_unmap(pte);
>                 pte = NULL;
> -
> -               folio = swap_cache_get_folio(entry, vma, addr);
> -               if (!folio) {
> -                       struct page *page;
> -                       struct vm_fault vmf = {
> -                               .vma = vma,
> -                               .address = addr,
> -                               .real_address = addr,
> -                               .pmd = pmd,
> -                       };
> -
> -                       page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> -                                               &vmf, NULL);
> -                       if (page)
> -                               folio = page_folio(page);
> -               }
> +               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> +                                       &vmf, NULL);
> +               if (page)
> +                       folio = page_folio(page);
>                 if (!folio) {
>                         /*
>                          * The entry could have been freed, and will not
> --
> 2.42.0
>
>
  
Kairui Song Nov. 24, 2023, 8:42 a.m. UTC | #3
Chris Li <chrisl@kernel.org> wrote on Wed, Nov 22, 2023 at 00:07:


>
> On Sun, Nov 19, 2023 at 11:48 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > No feature change, just prepare for later commits.
>
> You need to have a proper commit message why this change needs to happen.
> Preparing is too generic, it does not give any real information.
> For example, it seems you want to reduce one swap cache lookup because
> swap_readahead already has it?
>
> I am a bit puzzled at this patch. It shuffles a lot of sensitive code.
> However I do not get  the value.
> It seems like this patch should be merged with the later patch that
> depends on it to be judged together.
>
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/memory.c     | 61 +++++++++++++++++++++++--------------------------
> >  mm/swap.h       | 10 ++++++--
> >  mm/swap_state.c | 26 +++++++++++++--------
> >  mm/swapfile.c   | 30 +++++++++++-------------
> >  4 files changed, 66 insertions(+), 61 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index f4237a2e3b93..22af9f3e8c75 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3786,13 +3786,13 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >  vm_fault_t do_swap_page(struct vm_fault *vmf)
> >  {
> >         struct vm_area_struct *vma = vmf->vma;
> > -       struct folio *swapcache, *folio = NULL;
> > +       struct folio *swapcache = NULL, *folio = NULL;
> > +       enum swap_cache_result cache_result;
> >         struct page *page;
> >         struct swap_info_struct *si = NULL;
> >         rmap_t rmap_flags = RMAP_NONE;
> >         bool exclusive = false;
> >         swp_entry_t entry;
> > -       bool swapcached;
> >         pte_t pte;
> >         vm_fault_t ret = 0;
> >
> > @@ -3850,42 +3850,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >         if (unlikely(!si))
> >                 goto out;
> >
> > -       folio = swap_cache_get_folio(entry, vma, vmf->address);
> > -       if (folio)
> > -               page = folio_file_page(folio, swp_offset(entry));
> > -       swapcache = folio;
>
> Is the motivation that swap_readahead() already has a swap cache look up so you
> remove this look up here?

Yes, the cache lookup is moved into and shared by swapin_readahead,
and this also makes it possible to use that lookup to return a shadow
when the entry is not cached, so another shadow lookup can be saved for
the sync (ZRAM) swapin path. This helps improve ZRAM performance by
~4% for a 10G ZRAM, and should improve it more as the swap cache tree
grows larger.
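
As a rough sketch of the idea (the consolidated lookup below follows this
patch; the shadow-returning part belongs to a later patch in the series, so
that comment describes an assumption rather than code that exists here):

	/*
	 * The swap cache walk now happens once, inside swapin_readahead().
	 */
	folio = swap_cache_get_folio(entry, vmf->vma, vmf->address);
	if (folio) {
		page = folio_file_page(folio, swp_offset(entry));
		cache_result = SWAP_CACHE_HIT;
		goto done;
	}
	/*
	 * On a miss, the same walk could later also hand back the shadow it
	 * found (e.g. via an extra output parameter), so the no-readahead
	 * (ZRAM) path would not need a second lookup for refault detection.
	 */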

>
> > -
> > -       if (!folio) {
> > -               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> > -                                       vmf, &swapcached);
> > -               if (page) {
> > -                       folio = page_folio(page);
> > -                       if (swapcached)
> > -                               swapcache = folio;
> > -               } else {
> > +       page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> > +                               vmf, &cache_result);
> > +       if (page) {
> > +               folio = page_folio(page);
> > +               if (cache_result != SWAP_CACHE_HIT) {
> > +                       /* Had to read the page from swap area: Major fault */
> > +                       ret = VM_FAULT_MAJOR;
> > +                       count_vm_event(PGMAJFAULT);
> > +                       count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > +               }
> > +               if (cache_result != SWAP_CACHE_BYPASS)
> > +                       swapcache = folio;
> > +               if (PageHWPoison(page)) {
>
> There is a lot of code shuffle here. From the diff it is hard to tell
> if they are doing the same thing as before.
>
> >                         /*
> > -                        * Back out if somebody else faulted in this pte
> > -                        * while we released the pte lock.
> > +                        * hwpoisoned dirty swapcache pages are kept for killing
> > +                        * owner processes (which may be unknown at hwpoison time)
> >                          */
> > -                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > -                                       vmf->address, &vmf->ptl);
> > -                       if (likely(vmf->pte &&
> > -                                  pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> > -                               ret = VM_FAULT_OOM;
> > -                       goto unlock;
> > +                       ret = VM_FAULT_HWPOISON;
> > +                       goto out_release;
> >                 }
> > -
> > -               /* Had to read the page from swap area: Major fault */
> > -               ret = VM_FAULT_MAJOR;
> > -               count_vm_event(PGMAJFAULT);
> > -               count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > -       } else if (PageHWPoison(page)) {
> > +       } else {
> >                 /*
> > -                * hwpoisoned dirty swapcache pages are kept for killing
> > -                * owner processes (which may be unknown at hwpoison time)
> > +                * Back out if somebody else faulted in this pte
> > +                * while we released the pte lock.
> >                  */
> > -               ret = VM_FAULT_HWPOISON;
> > -               goto out_release;
> > +               vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > +                               vmf->address, &vmf->ptl);
> > +               if (likely(vmf->pte &&
> > +                          pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> > +                       ret = VM_FAULT_OOM;
> > +               goto unlock;
> >         }
> >
> >         ret |= folio_lock_or_retry(folio, vmf);
> > diff --git a/mm/swap.h b/mm/swap.h
> > index a9a654af791e..ac9136eee690 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -30,6 +30,12 @@ extern struct address_space *swapper_spaces[];
> >         (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
> >                 >> SWAP_ADDRESS_SPACE_SHIFT])
> >
> > +enum swap_cache_result {
> > +       SWAP_CACHE_HIT,
> > +       SWAP_CACHE_MISS,
> > +       SWAP_CACHE_BYPASS,
> > +};
>
> Does any function later care about CACHE_BYPASS?
>
> Again, better introduce it with the function that uses it. Don't
> introduce it for "just in case I might use it later".

Yes, callers in shmem will also need to know whether the page came from
the swap cache, and need a value to indicate the bypass case. I can add
some comments here to document the usage.
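
As a rough, hypothetical sketch of how a caller that needs to distinguish
these cases could branch on the result (illustration only, not code from
this series):

	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf, &result);
	if (page) {
		switch (result) {
		case SWAP_CACHE_HIT:
			/* Already in the swap cache: no readahead was done. */
			break;
		case SWAP_CACHE_MISS:
			/* Read in through the swap cache (readahead paths). */
			break;
		case SWAP_CACHE_BYPASS:
			/*
			 * Never entered the swap cache (no-readahead path):
			 * there is no swap cache reference to drop later.
			 */
			break;
		}
	}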

>
> > +
> >  void show_swap_cache_info(void);
> >  bool add_to_swap(struct folio *folio);
> >  void *get_shadow_from_swap_cache(swp_entry_t entry);
> > @@ -55,7 +61,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
> >                                     struct mempolicy *mpol, pgoff_t ilx);
> >  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> > -                             struct vm_fault *vmf, bool *swapcached);
> > +                             struct vm_fault *vmf, enum swap_cache_result *result);
> >
> >  static inline unsigned int folio_swap_flags(struct folio *folio)
> >  {
> > @@ -92,7 +98,7 @@ static inline struct page *swap_cluster_readahead(swp_entry_t entry,
> >  }
> >
> >  static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
> > -                       struct vm_fault *vmf, bool *swapcached)
> > +                       struct vm_fault *vmf, enum swap_cache_result *result)
> >  {
> >         return NULL;
> >  }
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index d87c20f9f7ec..e96d63bf8a22 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -908,8 +908,7 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >   * @entry: swap entry of this memory
> >   * @gfp_mask: memory allocation flags
> >   * @vmf: fault information
> > - * @swapcached: pointer to a bool used as indicator if the
> > - *              page is swapped in through swapcache.
> > + * @result: a return value to indicate swap cache usage.
> >   *
> >   * Returns the struct page for entry and addr, after queueing swapin.
> >   *
> > @@ -918,30 +917,39 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >   * or vma-based(ie, virtual address based on faulty address) readahead.
> >   */
> >  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > -                             struct vm_fault *vmf, bool *swapcached)
> > +                             struct vm_fault *vmf, enum swap_cache_result *result)
> >  {
> > +       enum swap_cache_result cache_result;
> >         struct swap_info_struct *si;
> >         struct mempolicy *mpol;
> > +       struct folio *folio;
> >         struct page *page;
> >         pgoff_t ilx;
> > -       bool cached;
> > +
> > +       folio = swap_cache_get_folio(entry, vmf->vma, vmf->address);
> > +       if (folio) {
> > +               page = folio_file_page(folio, swp_offset(entry));
> > +               cache_result = SWAP_CACHE_HIT;
> > +               goto done;
> > +       }
> >
> >         si = swp_swap_info(entry);
> >         mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
> >         if (swap_use_no_readahead(si, swp_offset(entry))) {
> >                 page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
> > -               cached = false;
> > +               cache_result = SWAP_CACHE_BYPASS;
> >         } else if (swap_use_vma_readahead(si)) {
> >                 page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
> > -               cached = true;
> > +               cache_result = SWAP_CACHE_MISS;
> >         } else {
> >                 page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> > -               cached = true;
> > +               cache_result = SWAP_CACHE_MISS;
> >         }
> >         mpol_cond_put(mpol);
> >
> > -       if (swapcached)
> > -               *swapcached = cached;
> > +done:
> > +       if (result)
> > +               *result = cache_result;
> >
> >         return page;
> >  }
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 01c3f53b6521..b6d57fff5e21 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -1822,13 +1822,21 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> >
> >         si = swap_info[type];
> >         do {
> > -               struct folio *folio;
> > +               struct page *page;
> >                 unsigned long offset;
> >                 unsigned char swp_count;
> > +               struct folio *folio = NULL;
> >                 swp_entry_t entry;
> >                 int ret;
> >                 pte_t ptent;
> >
> > +               struct vm_fault vmf = {
> > +                       .vma = vma,
> > +                       .address = addr,
> > +                       .real_address = addr,
> > +                       .pmd = pmd,
> > +               };
>
> Is this code move caused by skipping the swap cache look up here?

Yes.

>
> This is very sensitive code related to swap cache racing. It needs
> very careful reviewing. Better not shuffle it for no good reason.

Thanks for the suggestion, I'll try to avoid this shuffling, but the
cache lookup is moved into swapin_readahead so some changes in the
original callers are unavoidable...
  
Chris Li Nov. 24, 2023, 9:10 a.m. UTC | #4
On Fri, Nov 24, 2023 at 12:42 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> Chris Li <chrisl@kernel.org> wrote on Wed, Nov 22, 2023 at 00:07:
>
>
> >
> > On Sun, Nov 19, 2023 at 11:48 AM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > No feature change, just prepare for later commits.
> >
> > You need to have a proper commit message why this change needs to happen.
> > Preparing is too generic, it does not give any real information.
> > For example, it seems you want to reduce one swap cache lookup because
> > swap_readahead already has it?
> >
> > I am a bit puzzled at this patch. It shuffles a lot of sensitive code.
> > However I do not get  the value.
> > It seems like this patch should be merged with the later patch that
> > depends on it to be judged together.
> >
> > >
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > ---
> > >  mm/memory.c     | 61 +++++++++++++++++++++++--------------------------
> > >  mm/swap.h       | 10 ++++++--
> > >  mm/swap_state.c | 26 +++++++++++++--------
> > >  mm/swapfile.c   | 30 +++++++++++-------------
> > >  4 files changed, 66 insertions(+), 61 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index f4237a2e3b93..22af9f3e8c75 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -3786,13 +3786,13 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> > >  vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >  {
> > >         struct vm_area_struct *vma = vmf->vma;
> > > -       struct folio *swapcache, *folio = NULL;
> > > +       struct folio *swapcache = NULL, *folio = NULL;
> > > +       enum swap_cache_result cache_result;
> > >         struct page *page;
> > >         struct swap_info_struct *si = NULL;
> > >         rmap_t rmap_flags = RMAP_NONE;
> > >         bool exclusive = false;
> > >         swp_entry_t entry;
> > > -       bool swapcached;
> > >         pte_t pte;
> > >         vm_fault_t ret = 0;
> > >
> > > @@ -3850,42 +3850,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >         if (unlikely(!si))
> > >                 goto out;
> > >
> > > -       folio = swap_cache_get_folio(entry, vma, vmf->address);
> > > -       if (folio)
> > > -               page = folio_file_page(folio, swp_offset(entry));
> > > -       swapcache = folio;
> >
> > Is the motivation that swap_readahead() already has a swap cache look up so you
> > remove this look up here?
>
> Yes, the cache look up can is moved and shared in swapin_readahead,
> and this also make it possible to use that look up to return a shadow
> when entry is not a page, so another shadow look up can be saved for
> sync (ZRAM) swapin path. This can help improve ZRAM performance for
> ~4% for a 10G ZRAM, and should improves more when the cache tree grows
> large.
>
> >
> > > -
> > > -       if (!folio) {
> > > -               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> > > -                                       vmf, &swapcached);
> > > -               if (page) {
> > > -                       folio = page_folio(page);
> > > -                       if (swapcached)
> > > -                               swapcache = folio;
> > > -               } else {
> > > +       page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
> > > +                               vmf, &cache_result);
> > > +       if (page) {
> > > +               folio = page_folio(page);
> > > +               if (cache_result != SWAP_CACHE_HIT) {
> > > +                       /* Had to read the page from swap area: Major fault */
> > > +                       ret = VM_FAULT_MAJOR;
> > > +                       count_vm_event(PGMAJFAULT);
> > > +                       count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > > +               }
> > > +               if (cache_result != SWAP_CACHE_BYPASS)
> > > +                       swapcache = folio;
> > > +               if (PageHWPoison(page)) {
> >
> > There is a lot of code shuffle here. From the diff it is hard to tell
> > if they are doing the same thing as before.
> >
> > >                         /*
> > > -                        * Back out if somebody else faulted in this pte
> > > -                        * while we released the pte lock.
> > > +                        * hwpoisoned dirty swapcache pages are kept for killing
> > > +                        * owner processes (which may be unknown at hwpoison time)
> > >                          */
> > > -                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > > -                                       vmf->address, &vmf->ptl);
> > > -                       if (likely(vmf->pte &&
> > > -                                  pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> > > -                               ret = VM_FAULT_OOM;
> > > -                       goto unlock;
> > > +                       ret = VM_FAULT_HWPOISON;
> > > +                       goto out_release;
> > >                 }
> > > -
> > > -               /* Had to read the page from swap area: Major fault */
> > > -               ret = VM_FAULT_MAJOR;
> > > -               count_vm_event(PGMAJFAULT);
> > > -               count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > > -       } else if (PageHWPoison(page)) {
> > > +       } else {
> > >                 /*
> > > -                * hwpoisoned dirty swapcache pages are kept for killing
> > > -                * owner processes (which may be unknown at hwpoison time)
> > > +                * Back out if somebody else faulted in this pte
> > > +                * while we released the pte lock.
> > >                  */
> > > -               ret = VM_FAULT_HWPOISON;
> > > -               goto out_release;
> > > +               vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > > +                               vmf->address, &vmf->ptl);
> > > +               if (likely(vmf->pte &&
> > > +                          pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> > > +                       ret = VM_FAULT_OOM;
> > > +               goto unlock;
> > >         }
> > >
> > >         ret |= folio_lock_or_retry(folio, vmf);
> > > diff --git a/mm/swap.h b/mm/swap.h
> > > index a9a654af791e..ac9136eee690 100644
> > > --- a/mm/swap.h
> > > +++ b/mm/swap.h
> > > @@ -30,6 +30,12 @@ extern struct address_space *swapper_spaces[];
> > >         (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
> > >                 >> SWAP_ADDRESS_SPACE_SHIFT])
> > >
> > > +enum swap_cache_result {
> > > +       SWAP_CACHE_HIT,
> > > +       SWAP_CACHE_MISS,
> > > +       SWAP_CACHE_BYPASS,
> > > +};
> >
> > Does any function later care about CACHE_BYPASS?
> >
> > Again, better introduce it with the function that uses it. Don't
> > introduce it for "just in case I might use it later".
>
> Yes,  callers in shmem will also need to know if the page is cached in
> swap, and need a value to indicate the bypass case. I can add some
> comments here to indicate the usage.

I also commented on the later patch. Because you do the lookup without
the folio locked, the swap cache can change by the time you use
"*result". I suspect some of the swap cache lookups will need to be
added back due to that race before locking. It is better to introduce
this field together with the user side, to make it easier to reason
about the usage in one patch.
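
For reference, the pattern that guards against this in do_swap_page() is to
re-validate under the folio lock; a minimal sketch taken from the code
earlier in this thread:

	ret |= folio_lock_or_retry(folio, vmf);
	if (ret & VM_FAULT_RETRY)
		goto out_release;

	if (swapcache) {
		/*
		 * The unlocked lookup result may already be stale: make sure
		 * folio_free_swap() or swapoff did not release the swapcache
		 * from under us, and that the folio still maps this entry.
		 */
		if (unlikely(!folio_test_swapcache(folio) ||
			     page_swap_entry(page).val != entry.val))
			goto out_page;
	}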

BTW, one way to flatten the development history of the patch series is
to flatten the branch into one big patch, then copy/paste from the big
patch to introduce each sub-patch step by step. That way each sub-patch
is closer to the latest version of the code. Just something for you to
consider.

>
> >
> > > +
> > >  void show_swap_cache_info(void);
> > >  bool add_to_swap(struct folio *folio);
> > >  void *get_shadow_from_swap_cache(swp_entry_t entry);
> > > @@ -55,7 +61,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> > >  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
> > >                                     struct mempolicy *mpol, pgoff_t ilx);
> > >  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> > > -                             struct vm_fault *vmf, bool *swapcached);
> > > +                             struct vm_fault *vmf, enum swap_cache_result *result);
> > >
> > >  static inline unsigned int folio_swap_flags(struct folio *folio)
> > >  {
> > > @@ -92,7 +98,7 @@ static inline struct page *swap_cluster_readahead(swp_entry_t entry,
> > >  }
> > >
> > >  static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
> > > -                       struct vm_fault *vmf, bool *swapcached)
> > > +                       struct vm_fault *vmf, enum swap_cache_result *result)
> > >  {
> > >         return NULL;
> > >  }
> > > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > > index d87c20f9f7ec..e96d63bf8a22 100644
> > > --- a/mm/swap_state.c
> > > +++ b/mm/swap_state.c
> > > @@ -908,8 +908,7 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > >   * @entry: swap entry of this memory
> > >   * @gfp_mask: memory allocation flags
> > >   * @vmf: fault information
> > > - * @swapcached: pointer to a bool used as indicator if the
> > > - *              page is swapped in through swapcache.
> > > + * @result: a return value to indicate swap cache usage.
> > >   *
> > >   * Returns the struct page for entry and addr, after queueing swapin.
> > >   *
> > > @@ -918,30 +917,39 @@ static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > >   * or vma-based(ie, virtual address based on faulty address) readahead.
> > >   */
> > >  struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > > -                             struct vm_fault *vmf, bool *swapcached)
> > > +                             struct vm_fault *vmf, enum swap_cache_result *result)
> > >  {
> > > +       enum swap_cache_result cache_result;
> > >         struct swap_info_struct *si;
> > >         struct mempolicy *mpol;
> > > +       struct folio *folio;
> > >         struct page *page;
> > >         pgoff_t ilx;
> > > -       bool cached;
> > > +
> > > +       folio = swap_cache_get_folio(entry, vmf->vma, vmf->address);
> > > +       if (folio) {
> > > +               page = folio_file_page(folio, swp_offset(entry));
> > > +               cache_result = SWAP_CACHE_HIT;
> > > +               goto done;
> > > +       }
> > >
> > >         si = swp_swap_info(entry);
> > >         mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
> > >         if (swap_use_no_readahead(si, swp_offset(entry))) {
> > >                 page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
> > > -               cached = false;
> > > +               cache_result = SWAP_CACHE_BYPASS;
> > >         } else if (swap_use_vma_readahead(si)) {
> > >                 page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
> > > -               cached = true;
> > > +               cache_result = SWAP_CACHE_MISS;
> > >         } else {
> > >                 page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> > > -               cached = true;
> > > +               cache_result = SWAP_CACHE_MISS;
> > >         }
> > >         mpol_cond_put(mpol);
> > >
> > > -       if (swapcached)
> > > -               *swapcached = cached;
> > > +done:
> > > +       if (result)
> > > +               *result = cache_result;
> > >
> > >         return page;
> > >  }
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index 01c3f53b6521..b6d57fff5e21 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -1822,13 +1822,21 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> > >
> > >         si = swap_info[type];
> > >         do {
> > > -               struct folio *folio;
> > > +               struct page *page;
> > >                 unsigned long offset;
> > >                 unsigned char swp_count;
> > > +               struct folio *folio = NULL;
> > >                 swp_entry_t entry;
> > >                 int ret;
> > >                 pte_t ptent;
> > >
> > > +               struct vm_fault vmf = {
> > > +                       .vma = vma,
> > > +                       .address = addr,
> > > +                       .real_address = addr,
> > > +                       .pmd = pmd,
> > > +               };
> >
> > Is this code move caused by skipping the swap cache look up here?
>
> Yes.
>
> >
> > This is very sensitive code related to swap cache racing. It needs
> > very careful reviewing. Better not shuffle it for no good reason.
>
> Thanks for the suggestion, I'll try to avoid these shuffling, but
> cache lookup is moved into swappin_readahead so some changes in the
> original caller are not avoidable...
Yes, I agree sometimes it is unavoidable.

Chris
  

Patch

diff --git a/mm/memory.c b/mm/memory.c
index f4237a2e3b93..22af9f3e8c75 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3786,13 +3786,13 @@  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct folio *swapcache, *folio = NULL;
+	struct folio *swapcache = NULL, *folio = NULL;
+	enum swap_cache_result cache_result;
 	struct page *page;
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
 	bool exclusive = false;
 	swp_entry_t entry;
-	bool swapcached;
 	pte_t pte;
 	vm_fault_t ret = 0;
 
@@ -3850,42 +3850,37 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (unlikely(!si))
 		goto out;
 
-	folio = swap_cache_get_folio(entry, vma, vmf->address);
-	if (folio)
-		page = folio_file_page(folio, swp_offset(entry));
-	swapcache = folio;
-
-	if (!folio) {
-		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-					vmf, &swapcached);
-		if (page) {
-			folio = page_folio(page);
-			if (swapcached)
-				swapcache = folio;
-		} else {
+	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+				vmf, &cache_result);
+	if (page) {
+		folio = page_folio(page);
+		if (cache_result != SWAP_CACHE_HIT) {
+			/* Had to read the page from swap area: Major fault */
+			ret = VM_FAULT_MAJOR;
+			count_vm_event(PGMAJFAULT);
+			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+		}
+		if (cache_result != SWAP_CACHE_BYPASS)
+			swapcache = folio;
+		if (PageHWPoison(page)) {
 			/*
-			 * Back out if somebody else faulted in this pte
-			 * while we released the pte lock.
+			 * hwpoisoned dirty swapcache pages are kept for killing
+			 * owner processes (which may be unknown at hwpoison time)
 			 */
-			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-					vmf->address, &vmf->ptl);
-			if (likely(vmf->pte &&
-				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
-				ret = VM_FAULT_OOM;
-			goto unlock;
+			ret = VM_FAULT_HWPOISON;
+			goto out_release;
 		}
-
-		/* Had to read the page from swap area: Major fault */
-		ret = VM_FAULT_MAJOR;
-		count_vm_event(PGMAJFAULT);
-		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
-	} else if (PageHWPoison(page)) {
+	} else {
 		/*
-		 * hwpoisoned dirty swapcache pages are kept for killing
-		 * owner processes (which may be unknown at hwpoison time)
+		 * Back out if somebody else faulted in this pte
+		 * while we released the pte lock.
 		 */
-		ret = VM_FAULT_HWPOISON;
-		goto out_release;
+		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				vmf->address, &vmf->ptl);
+		if (likely(vmf->pte &&
+			   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+			ret = VM_FAULT_OOM;
+		goto unlock;
 	}
 
 	ret |= folio_lock_or_retry(folio, vmf);
diff --git a/mm/swap.h b/mm/swap.h
index a9a654af791e..ac9136eee690 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -30,6 +30,12 @@  extern struct address_space *swapper_spaces[];
 	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
 		>> SWAP_ADDRESS_SPACE_SHIFT])
 
+enum swap_cache_result {
+	SWAP_CACHE_HIT,
+	SWAP_CACHE_MISS,
+	SWAP_CACHE_BYPASS,
+};
+
 void show_swap_cache_info(void);
 bool add_to_swap(struct folio *folio);
 void *get_shadow_from_swap_cache(swp_entry_t entry);
@@ -55,7 +61,7 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
-			      struct vm_fault *vmf, bool *swapcached);
+			      struct vm_fault *vmf, enum swap_cache_result *result);
 
 static inline unsigned int folio_swap_flags(struct folio *folio)
 {
@@ -92,7 +98,7 @@  static inline struct page *swap_cluster_readahead(swp_entry_t entry,
 }
 
 static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
-			struct vm_fault *vmf, bool *swapcached)
+			struct vm_fault *vmf, enum swap_cache_result *result)
 {
 	return NULL;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d87c20f9f7ec..e96d63bf8a22 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -908,8 +908,7 @@  static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
  * @vmf: fault information
- * @swapcached: pointer to a bool used as indicator if the
- *              page is swapped in through swapcache.
+ * @result: a return value to indicate swap cache usage.
  *
  * Returns the struct page for entry and addr, after queueing swapin.
  *
@@ -918,30 +917,39 @@  static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
  * or vma-based(ie, virtual address based on faulty address) readahead.
  */
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
-			      struct vm_fault *vmf, bool *swapcached)
+			      struct vm_fault *vmf, enum swap_cache_result *result)
 {
+	enum swap_cache_result cache_result;
 	struct swap_info_struct *si;
 	struct mempolicy *mpol;
+	struct folio *folio;
 	struct page *page;
 	pgoff_t ilx;
-	bool cached;
+
+	folio = swap_cache_get_folio(entry, vmf->vma, vmf->address);
+	if (folio) {
+		page = folio_file_page(folio, swp_offset(entry));
+		cache_result = SWAP_CACHE_HIT;
+		goto done;
+	}
 
 	si = swp_swap_info(entry);
 	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
 	if (swap_use_no_readahead(si, swp_offset(entry))) {
 		page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, vmf->vma->vm_mm);
-		cached = false;
+		cache_result = SWAP_CACHE_BYPASS;
 	} else if (swap_use_vma_readahead(si)) {
 		page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
-		cached = true;
+		cache_result = SWAP_CACHE_MISS;
 	} else {
 		page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
-		cached = true;
+		cache_result = SWAP_CACHE_MISS;
 	}
 	mpol_cond_put(mpol);
 
-	if (swapcached)
-		*swapcached = cached;
+done:
+	if (result)
+		*result = cache_result;
 
 	return page;
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 01c3f53b6521..b6d57fff5e21 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1822,13 +1822,21 @@  static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	si = swap_info[type];
 	do {
-		struct folio *folio;
+		struct page *page;
 		unsigned long offset;
 		unsigned char swp_count;
+		struct folio *folio = NULL;
 		swp_entry_t entry;
 		int ret;
 		pte_t ptent;
 
+		struct vm_fault vmf = {
+			.vma = vma,
+			.address = addr,
+			.real_address = addr,
+			.pmd = pmd,
+		};
+
 		if (!pte++) {
 			pte = pte_offset_map(pmd, addr);
 			if (!pte)
@@ -1847,22 +1855,10 @@  static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		offset = swp_offset(entry);
 		pte_unmap(pte);
 		pte = NULL;
-
-		folio = swap_cache_get_folio(entry, vma, addr);
-		if (!folio) {
-			struct page *page;
-			struct vm_fault vmf = {
-				.vma = vma,
-				.address = addr,
-				.real_address = addr,
-				.pmd = pmd,
-			};
-
-			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-						&vmf, NULL);
-			if (page)
-				folio = page_folio(page);
-		}
+		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+					&vmf, NULL);
+		if (page)
+			folio = page_folio(page);
 		if (!folio) {
 			/*
 			 * The entry could have been freed, and will not