[-next,resend] mm: hwpoison: support recovery from ksm_might_need_to_copy()

Message ID 20221209021525.196276-1-wangkefeng.wang@huawei.com
State New
Series [-next,resend] mm: hwpoison: support recovery from ksm_might_need_to_copy()

Commit Message

Kefeng Wang Dec. 9, 2022, 2:15 a.m. UTC
When the kernel copies a page in ksm_might_need_to_copy() but runs into
an uncorrectable error, it will crash, since the poisoned page is
consumed by the kernel. This is similar to copy-on-write poison
recovery: when an error is detected during the page copy, return
VM_FAULT_HWPOISON, which helps us avoid a system crash. Note that memory
failure on a KSM page is skipped, but memory_failure_queue() is still
called to stay consistent with the general memory-failure handling.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/ksm.c      | 8 ++++++--
 mm/memory.c   | 3 +++
 mm/swapfile.c | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)
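
For context, the fix relies on the kernel's error-pointer convention: after this
patch, ksm_might_need_to_copy() still returns NULL on allocation failure, but
encodes -EHWPOISON in the returned pointer when the machine-check-safe copy
fails, so callers can report VM_FAULT_HWPOISON (or, in unuse_pte(), bail out)
instead of touching the poisoned data. The following self-contained userspace
sketch illustrates only that convention; might_need_to_copy(), fake_page and
the simulate parameter are made up for illustration, and ERR_PTR()/PTR_ERR()/
IS_ERR() are simplified re-implementations of the <linux/err.h> helpers.

    /*
     * Userspace sketch (not kernel code): mimics the error-pointer
     * convention used by the patch.  NULL still means "allocation
     * failed", while an ERR_PTR-encoded -EHWPOISON means "the
     * machine-check-safe copy hit a poisoned page".
     */
    #include <errno.h>   /* EHWPOISON (Linux-specific) */
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long error)     { return (void *)error; }
    static long PTR_ERR(const void *ptr) { return (long)ptr; }
    static int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static char fake_page[4096];

    /* Hypothetical stand-in for ksm_might_need_to_copy() after the patch. */
    static void *might_need_to_copy(int simulate)
    {
            if (simulate == 1)
                    return NULL;                  /* new page allocation failed */
            if (simulate == 2)
                    return ERR_PTR(-EHWPOISON);   /* copy hit a poisoned page */
            return fake_page;                     /* copy succeeded */
    }

    int main(void)
    {
            for (int s = 0; s <= 2; s++) {
                    void *page = might_need_to_copy(s);

                    if (!page)
                            printf("case %d: NULL       -> VM_FAULT_OOM\n", s);
                    else if (IS_ERR(page) && PTR_ERR(page) == -EHWPOISON)
                            printf("case %d: -EHWPOISON -> VM_FAULT_HWPOISON\n", s);
                    else
                            printf("case %d: page %p -> continue the fault\n", s, page);
            }
            return 0;
    }

Each case prints the fault result the caller in do_swap_page() would pick for
that return value.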
  

Comments

kernel test robot Dec. 9, 2022, 6:54 a.m. UTC | #1
Hi Kefeng,

Thank you for the patch! There is still something to improve:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Kefeng-Wang/mm-hwposion-support-recovery-from-ksm_might_need_to_copy/20221209-095943
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20221209021525.196276-1-wangkefeng.wang%40huawei.com
patch subject: [PATCH -next resend] mm: hwposion: support recovery from ksm_might_need_to_copy()
config: hexagon-randconfig-r041-20221207
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 6e4cea55f0d1104408b26ac574566a0e4de48036)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/503dfb85dafae7b3b09b7d6b29bb675fc7673a32
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Kefeng-Wang/mm-hwposion-support-recovery-from-ksm_might_need_to_copy/20221209-095943
        git checkout 503dfb85dafae7b3b09b7d6b29bb675fc7673a32
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash

If you fix the issue, kindly add the following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from mm/memory.c:42:
   In file included from include/linux/kernel_stat.h:9:
   In file included from include/linux/interrupt.h:11:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __raw_readb(PCI_IOBASE + addr);
                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
   #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
                                                     ^
   In file included from mm/memory.c:42:
   In file included from include/linux/kernel_stat.h:9:
   In file included from include/linux/interrupt.h:11:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
                                                     ^
   In file included from mm/memory.c:42:
   In file included from include/linux/kernel_stat.h:9:
   In file included from include/linux/interrupt.h:11:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writeb(value, PCI_IOBASE + addr);
                               ~~~~~~~~~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
>> mm/memory.c:3843:5: error: use of undeclared identifier 'els'; did you mean 'fls'?
                   } els if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
                     ^~~
                     fls
   arch/hexagon/include/asm/bitops.h:220:19: note: 'fls' declared here
   static inline int fls(unsigned int x)
                     ^
>> mm/memory.c:3843:8: error: expected ';' after expression
                   } els if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
                        ^
                        ;
   mm/memory.c:3843:5: warning: expression result unused [-Wunused-value]
                   } els if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
                     ^~~
   7 warnings and 2 errors generated.


vim +3843 mm/memory.c

  3680	
  3681	/*
  3682	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3683	 * but allow concurrent faults), and pte mapped but not yet locked.
  3684	 * We return with pte unmapped and unlocked.
  3685	 *
  3686	 * We return with the mmap_lock locked or unlocked in the same cases
  3687	 * as does filemap_fault().
  3688	 */
  3689	vm_fault_t do_swap_page(struct vm_fault *vmf)
  3690	{
  3691		struct vm_area_struct *vma = vmf->vma;
  3692		struct folio *swapcache, *folio = NULL;
  3693		struct page *page;
  3694		struct swap_info_struct *si = NULL;
  3695		rmap_t rmap_flags = RMAP_NONE;
  3696		bool exclusive = false;
  3697		swp_entry_t entry;
  3698		pte_t pte;
  3699		int locked;
  3700		vm_fault_t ret = 0;
  3701		void *shadow = NULL;
  3702	
  3703		if (!pte_unmap_same(vmf))
  3704			goto out;
  3705	
  3706		entry = pte_to_swp_entry(vmf->orig_pte);
  3707		if (unlikely(non_swap_entry(entry))) {
  3708			if (is_migration_entry(entry)) {
  3709				migration_entry_wait(vma->vm_mm, vmf->pmd,
  3710						     vmf->address);
  3711			} else if (is_device_exclusive_entry(entry)) {
  3712				vmf->page = pfn_swap_entry_to_page(entry);
  3713				ret = remove_device_exclusive_entry(vmf);
  3714			} else if (is_device_private_entry(entry)) {
  3715				vmf->page = pfn_swap_entry_to_page(entry);
  3716				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3717						vmf->address, &vmf->ptl);
  3718				if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
  3719					spin_unlock(vmf->ptl);
  3720					goto out;
  3721				}
  3722	
  3723				/*
  3724				 * Get a page reference while we know the page can't be
  3725				 * freed.
  3726				 */
  3727				get_page(vmf->page);
  3728				pte_unmap_unlock(vmf->pte, vmf->ptl);
  3729				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3730				put_page(vmf->page);
  3731			} else if (is_hwpoison_entry(entry)) {
  3732				ret = VM_FAULT_HWPOISON;
  3733			} else if (is_pte_marker_entry(entry)) {
  3734				ret = handle_pte_marker(vmf);
  3735			} else {
  3736				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3737				ret = VM_FAULT_SIGBUS;
  3738			}
  3739			goto out;
  3740		}
  3741	
  3742		/* Prevent swapoff from happening to us. */
  3743		si = get_swap_device(entry);
  3744		if (unlikely(!si))
  3745			goto out;
  3746	
  3747		folio = swap_cache_get_folio(entry, vma, vmf->address);
  3748		if (folio)
  3749			page = folio_file_page(folio, swp_offset(entry));
  3750		swapcache = folio;
  3751	
  3752		if (!folio) {
  3753			if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
  3754			    __swap_count(entry) == 1) {
  3755				/* skip swapcache */
  3756				folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
  3757							vma, vmf->address, false);
  3758				page = &folio->page;
  3759				if (folio) {
  3760					__folio_set_locked(folio);
  3761					__folio_set_swapbacked(folio);
  3762	
  3763					if (mem_cgroup_swapin_charge_folio(folio,
  3764								vma->vm_mm, GFP_KERNEL,
  3765								entry)) {
  3766						ret = VM_FAULT_OOM;
  3767						goto out_page;
  3768					}
  3769					mem_cgroup_swapin_uncharge_swap(entry);
  3770	
  3771					shadow = get_shadow_from_swap_cache(entry);
  3772					if (shadow)
  3773						workingset_refault(folio, shadow);
  3774	
  3775					folio_add_lru(folio);
  3776	
  3777					/* To provide entry to swap_readpage() */
  3778					folio_set_swap_entry(folio, entry);
  3779					swap_readpage(page, true, NULL);
  3780					folio->private = NULL;
  3781				}
  3782			} else {
  3783				page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3784							vmf);
  3785				if (page)
  3786					folio = page_folio(page);
  3787				swapcache = folio;
  3788			}
  3789	
  3790			if (!folio) {
  3791				/*
  3792				 * Back out if somebody else faulted in this pte
  3793				 * while we released the pte lock.
  3794				 */
  3795				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3796						vmf->address, &vmf->ptl);
  3797				if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
  3798					ret = VM_FAULT_OOM;
  3799				goto unlock;
  3800			}
  3801	
  3802			/* Had to read the page from swap area: Major fault */
  3803			ret = VM_FAULT_MAJOR;
  3804			count_vm_event(PGMAJFAULT);
  3805			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3806		} else if (PageHWPoison(page)) {
  3807			/*
  3808			 * hwpoisoned dirty swapcache pages are kept for killing
  3809			 * owner processes (which may be unknown at hwpoison time)
  3810			 */
  3811			ret = VM_FAULT_HWPOISON;
  3812			goto out_release;
  3813		}
  3814	
  3815		locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
  3816	
  3817		if (!locked) {
  3818			ret |= VM_FAULT_RETRY;
  3819			goto out_release;
  3820		}
  3821	
  3822		if (swapcache) {
  3823			/*
  3824			 * Make sure folio_free_swap() or swapoff did not release the
  3825			 * swapcache from under us.  The page pin, and pte_same test
  3826			 * below, are not enough to exclude that.  Even if it is still
  3827			 * swapcache, we need to check that the page's swap has not
  3828			 * changed.
  3829			 */
  3830			if (unlikely(!folio_test_swapcache(folio) ||
  3831				     page_private(page) != entry.val))
  3832				goto out_page;
  3833	
  3834			/*
  3835			 * KSM sometimes has to copy on read faults, for example, if
  3836			 * page->index of !PageKSM() pages would be nonlinear inside the
  3837			 * anon VMA -- PageKSM() is lost on actual swapout.
  3838			 */
  3839			page = ksm_might_need_to_copy(page, vma, vmf->address);
  3840			if (unlikely(!page)) {
  3841				ret = VM_FAULT_OOM;
  3842				goto out_page;
> 3843			} els if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
  3844				ret = VM_FAULT_HWPOISON;
  3845				goto out_page;
  3846			}
  3847			folio = page_folio(page);
  3848	
  3849			/*
  3850			 * If we want to map a page that's in the swapcache writable, we
  3851			 * have to detect via the refcount if we're really the exclusive
  3852			 * owner. Try removing the extra reference from the local LRU
  3853			 * pagevecs if required.
  3854			 */
  3855			if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
  3856			    !folio_test_ksm(folio) && !folio_test_lru(folio))
  3857				lru_add_drain();
  3858		}
  3859	
  3860		cgroup_throttle_swaprate(page, GFP_KERNEL);
  3861	
  3862		/*
  3863		 * Back out if somebody else already faulted in this pte.
  3864		 */
  3865		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3866				&vmf->ptl);
  3867		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
  3868			goto out_nomap;
  3869	
  3870		if (unlikely(!folio_test_uptodate(folio))) {
  3871			ret = VM_FAULT_SIGBUS;
  3872			goto out_nomap;
  3873		}
  3874	
  3875		/*
  3876		 * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
  3877		 * must never point at an anonymous page in the swapcache that is
  3878		 * PG_anon_exclusive. Sanity check that this holds and especially, that
  3879		 * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
  3880		 * check after taking the PT lock and making sure that nobody
  3881		 * concurrently faulted in this page and set PG_anon_exclusive.
  3882		 */
  3883		BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
  3884		BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
  3885	
  3886		/*
  3887		 * Check under PT lock (to protect against concurrent fork() sharing
  3888		 * the swap entry concurrently) for certainly exclusive pages.
  3889		 */
  3890		if (!folio_test_ksm(folio)) {
  3891			/*
  3892			 * Note that pte_swp_exclusive() == false for architectures
  3893			 * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
  3894			 */
  3895			exclusive = pte_swp_exclusive(vmf->orig_pte);
  3896			if (folio != swapcache) {
  3897				/*
  3898				 * We have a fresh page that is not exposed to the
  3899				 * swapcache -> certainly exclusive.
  3900				 */
  3901				exclusive = true;
  3902			} else if (exclusive && folio_test_writeback(folio) &&
  3903				  data_race(si->flags & SWP_STABLE_WRITES)) {
  3904				/*
  3905				 * This is tricky: not all swap backends support
  3906				 * concurrent page modifications while under writeback.
  3907				 *
  3908				 * So if we stumble over such a page in the swapcache
  3909				 * we must not set the page exclusive, otherwise we can
  3910				 * map it writable without further checks and modify it
  3911				 * while still under writeback.
  3912				 *
  3913				 * For these problematic swap backends, simply drop the
  3914				 * exclusive marker: this is perfectly fine as we start
  3915				 * writeback only if we fully unmapped the page and
  3916				 * there are no unexpected references on the page after
  3917				 * unmapping succeeded. After fully unmapped, no
  3918				 * further GUP references (FOLL_GET and FOLL_PIN) can
  3919				 * appear, so dropping the exclusive marker and mapping
  3920				 * it only R/O is fine.
  3921				 */
  3922				exclusive = false;
  3923			}
  3924		}
  3925	
  3926		/*
  3927		 * Remove the swap entry and conditionally try to free up the swapcache.
  3928		 * We're already holding a reference on the page but haven't mapped it
  3929		 * yet.
  3930		 */
  3931		swap_free(entry);
  3932		if (should_try_to_free_swap(folio, vma, vmf->flags))
  3933			folio_free_swap(folio);
  3934	
  3935		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
  3936		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  3937		pte = mk_pte(page, vma->vm_page_prot);
  3938	
  3939		/*
  3940		 * Same logic as in do_wp_page(); however, optimize for pages that are
  3941		 * certainly not shared either because we just allocated them without
  3942		 * exposing them to the swapcache or because the swap entry indicates
  3943		 * exclusivity.
  3944		 */
  3945		if (!folio_test_ksm(folio) &&
  3946		    (exclusive || folio_ref_count(folio) == 1)) {
  3947			if (vmf->flags & FAULT_FLAG_WRITE) {
  3948				pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  3949				vmf->flags &= ~FAULT_FLAG_WRITE;
  3950			}
  3951			rmap_flags |= RMAP_EXCLUSIVE;
  3952		}
  3953		flush_icache_page(vma, page);
  3954		if (pte_swp_soft_dirty(vmf->orig_pte))
  3955			pte = pte_mksoft_dirty(pte);
  3956		if (pte_swp_uffd_wp(vmf->orig_pte)) {
  3957			pte = pte_mkuffd_wp(pte);
  3958			pte = pte_wrprotect(pte);
  3959		}
  3960		vmf->orig_pte = pte;
  3961	
  3962		/* ksm created a completely new copy */
  3963		if (unlikely(folio != swapcache && swapcache)) {
  3964			page_add_new_anon_rmap(page, vma, vmf->address);
  3965			folio_add_lru_vma(folio, vma);
  3966		} else {
  3967			page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
  3968		}
  3969	
  3970		VM_BUG_ON(!folio_test_anon(folio) ||
  3971				(pte_write(pte) && !PageAnonExclusive(page)));
  3972		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  3973		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  3974	
  3975		folio_unlock(folio);
  3976		if (folio != swapcache && swapcache) {
  3977			/*
  3978			 * Hold the lock to avoid the swap entry to be reused
  3979			 * until we take the PT lock for the pte_same() check
  3980			 * (to avoid false positives from pte_same). For
  3981			 * further safety release the lock after the swap_free
  3982			 * so that the swap count won't change under a
  3983			 * parallel locked swapcache.
  3984			 */
  3985			folio_unlock(swapcache);
  3986			folio_put(swapcache);
  3987		}
  3988	
  3989		if (vmf->flags & FAULT_FLAG_WRITE) {
  3990			ret |= do_wp_page(vmf);
  3991			if (ret & VM_FAULT_ERROR)
  3992				ret &= VM_FAULT_ERROR;
  3993			goto out;
  3994		}
  3995	
  3996		/* No need to invalidate - it was non-present before */
  3997		update_mmu_cache(vma, vmf->address, vmf->pte);
  3998	unlock:
  3999		pte_unmap_unlock(vmf->pte, vmf->ptl);
  4000	out:
  4001		if (si)
  4002			put_swap_device(si);
  4003		return ret;
  4004	out_nomap:
  4005		pte_unmap_unlock(vmf->pte, vmf->ptl);
  4006	out_page:
  4007		folio_unlock(folio);
  4008	out_release:
  4009		folio_put(folio);
  4010		if (folio != swapcache && swapcache) {
  4011			folio_unlock(swapcache);
  4012			folio_put(swapcache);
  4013		}
  4014		if (si)
  4015			put_swap_device(si);
  4016		return ret;
  4017	}
  4018
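
Going by the compiler's own suggestion, both errors (and the unused-value
warning) boil down to a one-word typo in the newly added branch: "els" should
be "else". The branch at mm/memory.c:3843 is presumably meant to read:

        } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
                ret = VM_FAULT_HWPOISON;
                goto out_page;
        }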
  

Patch

diff --git a/mm/ksm.c b/mm/ksm.c
index dd02780c387f..83e2f74ae7da 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2629,8 +2629,12 @@  struct page *ksm_might_need_to_copy(struct page *page,
 		new_page = NULL;
 	}
 	if (new_page) {
-		copy_user_highpage(new_page, page, address, vma);
-
+		if (copy_mc_user_highpage(new_page, page, address, vma)) {
+			put_page(new_page);
+			new_page = ERR_PTR(-EHWPOISON);
+			memory_failure_queue(page_to_pfn(page), 0);
+			return new_page;
+		}
 		SetPageDirty(new_page);
 		__SetPageUptodate(new_page);
 		__SetPageLocked(new_page);
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..8711488f5305 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3840,6 +3840,9 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (unlikely(!page)) {
 			ret = VM_FAULT_OOM;
 			goto out_page;
+		} els if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
+			ret = VM_FAULT_HWPOISON;
+			goto out_page;
 		}
 		folio = page_folio(page);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 908a529bca12..d479811bc311 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1767,7 +1767,7 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 
 	swapcache = page;
 	page = ksm_might_need_to_copy(page, vma, addr);
-	if (unlikely(!page))
+	if (IS_ERR_OR_NULL(page))
 		return -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);