[-next,resend,v3] mm: hwpoison: support recovery from ksm_might_need_to_copy()

Message ID 20221213120523.141588-1-wangkefeng.wang@huawei.com
State New
Series [-next,resend,v3] mm: hwpoison: support recovery from ksm_might_need_to_copy()

Commit Message

Kefeng Wang Dec. 13, 2022, 12:05 p.m. UTC
  When the kernel copies a page from ksm_might_need_to_copy() but runs
into an uncorrectable error, it will crash since the poisoned page is
consumed by the kernel; this is similar to the issue recently fixed by
Copy-on-write poison recovery. When an error is detected during the
page copy, return VM_FAULT_HWPOISON in do_swap_page(), and install a
hwpoison entry in unuse_pte() when swapoff, which helps us to avoid a
system crash. Note that memory failure on a KSM page will be skipped,
but memory_failure_queue() is still called to be consistent with the
general memory failure process.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
v3 resend: 
- enhance unuse_pte() if ksm_might_need_to_copy() returns -EHWPOISON
- fix an issue found by lkp

 mm/ksm.c      |  8 ++++++--
 mm/memory.c   |  3 +++
 mm/swapfile.c | 20 ++++++++++++++------
 3 files changed, 23 insertions(+), 8 deletions(-)
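
A condensed sketch of the calling convention the patch introduces (drawn
from the diffs below, identifier spelling normalized): ksm_might_need_to_copy()
keeps returning NULL on allocation failure, and now additionally returns
ERR_PTR(-EHWPOISON) when the machine-check-safe copy hits a poisoned
source page, so callers distinguish the two error cases:

	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;		/* allocation failed */
	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
		hwpoisoned = true;	/* install a hwpoison entry instead */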
  

Comments

HORIGUCHI NAOYA(堀口 直也) Dec. 16, 2022, 1:47 a.m. UTC | #1
On Tue, Dec 13, 2022 at 08:05:23PM +0800, Kefeng Wang wrote:
> When the kernel copy a page from ksm_might_need_to_copy(), but runs
> into an uncorrectable error, it will crash since poisoned page is
> consumed by kernel, this is similar to Copy-on-write poison recovery,

Maybe you mean "this is similar to the issue recently fixed by
Copy-on-write poison recovery."?  And if this sentence ends here,
please put "." instead of ",".

> When an error is detected during the page copy, return VM_FAULT_HWPOISON
> in do_swap_page(), and install a hwpoison entry in unuse_pte() when
> swapoff, which help us to avoid system crash.  Note, memory failure on
> a KSM page will be skipped, but still call memory_failure_queue() to
> be consistent with general memory failure process.

Thank you for the work.  I have a few comments below ...

> 
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
> v3 resend: 
> - enhance unuse_pte() if ksm_might_need_to_copy() return -EHWPOISON
> - fix issue found by lkp 
> 
>  mm/ksm.c      |  8 ++++++--
>  mm/memory.c   |  3 +++
>  mm/swapfile.c | 20 ++++++++++++++------
>  3 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/ksm.c b/mm/ksm.c
> index dd02780c387f..83e2f74ae7da 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2629,8 +2629,12 @@ struct page *ksm_might_need_to_copy(struct page *page,
>  		new_page = NULL;
>  	}
>  	if (new_page) {
> -		copy_user_highpage(new_page, page, address, vma);
> -
> +		if (copy_mc_user_highpage(new_page, page, address, vma)) {
> +			put_page(new_page);
> +			new_page = ERR_PTR(-EHWPOISON);
> +			memory_failure_queue(page_to_pfn(page), 0);
> +			return new_page;

Simply return ERR_PTR(-EHWPOISON)?
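
i.e. something like this (a sketch of the suggested simplification, not
part of the posted patch):

	if (copy_mc_user_highpage(new_page, page, address, vma)) {
		put_page(new_page);
		memory_failure_queue(page_to_pfn(page), 0);
		return ERR_PTR(-EHWPOISON);
	}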

> +		}
>  		SetPageDirty(new_page);
>  		__SetPageUptodate(new_page);
>  		__SetPageLocked(new_page);
> diff --git a/mm/memory.c b/mm/memory.c
> index aad226daf41b..5b2c137dfb2a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3840,6 +3840,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		if (unlikely(!page)) {
>  			ret = VM_FAULT_OOM;
>  			goto out_page;
> +		} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
> +			ret = VM_FAULT_HWPOISON;
> +			goto out_page;
>  		}
>  		folio = page_folio(page);
>  
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 908a529bca12..0efb1c2c2415 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1763,12 +1763,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>  	struct page *swapcache;
>  	spinlock_t *ptl;
>  	pte_t *pte, new_pte;
> +	bool hwposioned = false;
>  	int ret = 1;
>  
>  	swapcache = page;
>  	page = ksm_might_need_to_copy(page, vma, addr);
>  	if (unlikely(!page))
>  		return -ENOMEM;
> +	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
> +		hwposioned = true;
>  
>  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
> @@ -1776,15 +1779,19 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto out;
>  	}
>  
> -	if (unlikely(!PageUptodate(page))) {
> -		pte_t pteval;
> +	if (hwposioned || !PageUptodate(page)) {
> +		swp_entry_t swp_entry;
>  
>  		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> -		pteval = swp_entry_to_pte(make_swapin_error_entry());
> -		set_pte_at(vma->vm_mm, addr, pte, pteval);
> -		swap_free(entry);
> +		if (hwposioned) {
> +			swp_entry = make_hwpoison_entry(swapcache);
> +			page = swapcache;

This might work for the process accessing the broken page, but ksm
pages are likely to be shared by multiple processes, so it would be
much nicer if you could convert all mapping entries for the broken ksm
page into hwpoisoned ones.  Maybe in this thorough approach,
hwpoison_user_mappings() is updated to call try_to_unmap() for ksm pages.
But it's not necessary to do this together with applying mcsafe-memcpy,
because the recovery action and mcsafe-memcpy can be done independently.
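
Roughly (a hypothetical sketch of that thorough approach, not an existing
implementation; hwpoison_user_mappings() currently bails out on KSM pages,
and real support would likely need rmap changes beyond this):

	/* in hwpoison_user_mappings(), mm/memory-failure.c (hypothetical) */
	if (PageKsm(p))
		/* instead of failing with "can't handle KSM pages",
		 * unmap the page from every process sharing it */
		try_to_unmap(folio, ttu);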

Thanks,
Naoya Horiguchi

> +		} else {
> +			swp_entry = make_swapin_error_entry();
> +		}
> +		new_pte = swp_entry_to_pte(swp_entry);
>  		ret = 0;
> -		goto out;
> +		goto setpte;
>  	}
>  
>  	/* See do_swap_page() */
> @@ -1816,6 +1823,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>  		new_pte = pte_mksoft_dirty(new_pte);
>  	if (pte_swp_uffd_wp(*pte))
>  		new_pte = pte_mkuffd_wp(new_pte);
> +setpte:
>  	set_pte_at(vma->vm_mm, addr, pte, new_pte);
>  	swap_free(entry);
>  out:
> -- 
> 2.35.3
  
Kefeng Wang Dec. 16, 2022, 8:42 a.m. UTC | #2
On 2022/12/16 9:47, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Dec 13, 2022 at 08:05:23PM +0800, Kefeng Wang wrote:
>> When the kernel copy a page from ksm_might_need_to_copy(), but runs
>> into an uncorrectable error, it will crash since poisoned page is
>> consumed by kernel, this is similar to Copy-on-write poison recovery,
> 
> Maybe you mean "this is similar to the issue recently fixed by
> Copy-on-write poison recovery."?  And if this sentence ends here,
> please put "." instead of ",".

That's what I mean; I will update the changelog.
> 
>> When an error is detected during the page copy, return VM_FAULT_HWPOISON
>> in do_swap_page(), and install a hwpoison entry in unuse_pte() when
>> swapoff, which help us to avoid system crash.  Note, memory failure on
>> a KSM page will be skipped, but still call memory_failure_queue() to
>> be consistent with general memory failure process.
> 
> Thank you for the work.  I have a few comment below ...
> 
>>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> ---
>> v3 resend:
>> - enhance unuse_pte() if ksm_might_need_to_copy() return -EHWPOISON
>> - fix issue found by lkp
>>
>>   mm/ksm.c      |  8 ++++++--
>>   mm/memory.c   |  3 +++
>>   mm/swapfile.c | 20 ++++++++++++++------
>>   3 files changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index dd02780c387f..83e2f74ae7da 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -2629,8 +2629,12 @@ struct page *ksm_might_need_to_copy(struct page *page,
>>   		new_page = NULL;
>>   	}
>>   	if (new_page) {
>> -		copy_user_highpage(new_page, page, address, vma);
>> -
>> +		if (copy_mc_user_highpage(new_page, page, address, vma)) {
>> +			put_page(new_page);
>> +			new_page = ERR_PTR(-EHWPOISON);
>> +			memory_failure_queue(page_to_pfn(page), 0);
>> +			return new_page;
> 
> Simply return ERR_PTR(-EHWPOISON)?

OK.

> 
>> +		}
>>   		SetPageDirty(new_page);
>>   		__SetPageUptodate(new_page);
>>   		__SetPageLocked(new_page);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index aad226daf41b..5b2c137dfb2a 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3840,6 +3840,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>   		if (unlikely(!page)) {
>>   			ret = VM_FAULT_OOM;
>>   			goto out_page;
>> +		} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
>> +			ret = VM_FAULT_HWPOISON;
>> +			goto out_page;
>>   		}
>>   		folio = page_folio(page);
>>   
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 908a529bca12..0efb1c2c2415 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1763,12 +1763,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>>   	struct page *swapcache;
>>   	spinlock_t *ptl;
>>   	pte_t *pte, new_pte;
>> +	bool hwposioned = false;
>>   	int ret = 1;
>>   
>>   	swapcache = page;
>>   	page = ksm_might_need_to_copy(page, vma, addr);
>>   	if (unlikely(!page))
>>   		return -ENOMEM;
>> +	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
>> +		hwposioned = true;
>>   
>>   	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>>   	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
>> @@ -1776,15 +1779,19 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>>   		goto out;
>>   	}
>>   
>> -	if (unlikely(!PageUptodate(page))) {
>> -		pte_t pteval;
>> +	if (hwposioned || !PageUptodate(page)) {
>> +		swp_entry_t swp_entry;
>>   
>>   		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>> -		pteval = swp_entry_to_pte(make_swapin_error_entry());
>> -		set_pte_at(vma->vm_mm, addr, pte, pteval);
>> -		swap_free(entry);
>> +		if (hwposioned) {
>> +			swp_entry = make_hwpoison_entry(swapcache);
>> +			page = swapcache;
> 
> This might work for the process accessing to the broken page, but ksm
> pages are likely to be shared by multiple processes, so it would be
> much nicer if you can convert all mapping entries for the error ksm page
> into hwpoisoned ones.  Maybe in this thorough approach,
> hwpoison_user_mappings() is updated to call try_to_unmap() for ksm pages.
> But it's not necessary to do this together with applying mcsafe-memcpy,
> because recovery action and mcsafe-memcpy can be done independently.

Yes, memory failure handling currently skips KSM pages (commit 
01e00f880ca7 "HWPOISON: fix oops on ksm pages"); we could support that later.

> 
> Thanks,
> Naoya Horiguchi
> 
>> +		} else {
>> +			swp_entry = make_swapin_error_entry();
>> +		}
>> +		new_pte = swp_entry_to_pte(swp_entry);
>>   		ret = 0;
>> -		goto out;
>> +		goto setpte;
>>   	}
>>   
>>   	/* See do_swap_page() */
>> @@ -1816,6 +1823,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>>   		new_pte = pte_mksoft_dirty(new_pte);
>>   	if (pte_swp_uffd_wp(*pte))
>>   		new_pte = pte_mkuffd_wp(new_pte);
>> +setpte:
>>   	set_pte_at(vma->vm_mm, addr, pte, new_pte);
>>   	swap_free(entry);
>>   out:
>> -- 
>> 2.35.3
> 
>
  
Miaohe Lin Dec. 17, 2022, 2:24 a.m. UTC | #3
On 2022/12/16 9:47, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Dec 13, 2022 at 08:05:23PM +0800, Kefeng Wang wrote:
>> When the kernel copy a page from ksm_might_need_to_copy(), but runs
>> into an uncorrectable error, it will crash since poisoned page is
>> consumed by kernel, this is similar to Copy-on-write poison recovery,
> 
> Maybe you mean "this is similar to the issue recently fixed by
> Copy-on-write poison recovery."?  And if this sentence ends here,
> please put "." instead of ",".
> 
>> When an error is detected during the page copy, return VM_FAULT_HWPOISON
>> in do_swap_page(), and install a hwpoison entry in unuse_pte() when
>> swapoff, which help us to avoid system crash.  Note, memory failure on
>> a KSM page will be skipped, but still call memory_failure_queue() to
>> be consistent with general memory failure process.
> 
> Thank you for the work.  I have a few comment below ...

Thanks both.

>> -	if (unlikely(!PageUptodate(page))) {
>> -		pte_t pteval;
>> +	if (hwposioned || !PageUptodate(page)) {
>> +		swp_entry_t swp_entry;
>>  
>>  		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>> -		pteval = swp_entry_to_pte(make_swapin_error_entry());
>> -		set_pte_at(vma->vm_mm, addr, pte, pteval);
>> -		swap_free(entry);
>> +		if (hwposioned) {
>> +			swp_entry = make_hwpoison_entry(swapcache);
>> +			page = swapcache;
> 
> This might work for the process accessing to the broken page, but ksm
> pages are likely to be shared by multiple processes, so it would be
> much nicer if you can convert all mapping entries for the error ksm page
> into hwpoisoned ones.  Maybe in this thorough approach,
> hwpoison_user_mappings() is updated to call try_to_unmap() for ksm pages.
> But it's not necessary to do this together with applying mcsafe-memcpy,
> because recovery action and mcsafe-memcpy can be done independently.
> 

I'm afraid leaving the ksm page in the cache will repeatedly trigger uncorrectable errors for the
same page if ksm pages are shared by multiple processes. This might reach the hardware threshold
and result in a fatal uncorrectable error (thus causing the system to panic). So IMHO it might be better
to check whether the page is hwpoisoned before calling ksm_might_need_to_copy() if the above thorough
approach is not implemented. But I can easily be wrong.
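
Something like (a sketch of the caller-side check being suggested;
hypothetical, not posted code, error handling up the swapoff path left aside):

	/* in unuse_pte(), before any copy is attempted */
	if (unlikely(PageHWPoison(page)))
		return -EHWPOISON;	/* don't touch the poisoned page again */
	page = ksm_might_need_to_copy(page, vma, addr);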

Thanks,
Miaohe Lin
  
Andrew Morton Feb. 1, 2023, 12:32 a.m. UTC | #4
On Tue, 13 Dec 2022 20:05:23 +0800 Kefeng Wang <wangkefeng.wang@huawei.com> wrote:

> When the kernel copy a page from ksm_might_need_to_copy(), but runs
> into an uncorrectable error, it will crash since poisoned page is
> consumed by kernel, this is similar to Copy-on-write poison recovery,
> When an error is detected during the page copy, return VM_FAULT_HWPOISON
> in do_swap_page(), and install a hwpoison entry in unuse_pte() when
> swapoff, which help us to avoid system crash. Note, memory failure on
> a KSM page will be skipped, but still call memory_failure_queue() to
> be consistent with general memory failure process.

I believe we're awaiting a v4 of this?

Did we consider a -stable backport?  "kernel crash" sounds undesirable...

Can we identify a Fixes: target for this?

Thanks.
  
Kefeng Wang Feb. 1, 2023, 1:23 a.m. UTC | #5
On 2022/12/17 10:24, Miaohe Lin wrote:
> On 2022/12/16 9:47, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Tue, Dec 13, 2022 at 08:05:23PM +0800, Kefeng Wang wrote:
>>> When the kernel copy a page from ksm_might_need_to_copy(), but runs
>>> into an uncorrectable error, it will crash since poisoned page is
>>> consumed by kernel, this is similar to Copy-on-write poison recovery,
>>
>> Maybe you mean "this is similar to the issue recently fixed by
>> Copy-on-write poison recovery."?  And if this sentence ends here,
>> please put "." instead of ",".
>>
>>> When an error is detected during the page copy, return VM_FAULT_HWPOISON
>>> in do_swap_page(), and install a hwpoison entry in unuse_pte() when
>>> swapoff, which help us to avoid system crash.  Note, memory failure on
>>> a KSM page will be skipped, but still call memory_failure_queue() to
>>> be consistent with general memory failure process.
>>
>> Thank you for the work.  I have a few comment below ...
> 
> Thanks both.
> 
>>> -	if (unlikely(!PageUptodate(page))) {
>>> -		pte_t pteval;
>>> +	if (hwposioned || !PageUptodate(page)) {
>>> +		swp_entry_t swp_entry;
>>>   
>>>   		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>>> -		pteval = swp_entry_to_pte(make_swapin_error_entry());
>>> -		set_pte_at(vma->vm_mm, addr, pte, pteval);
>>> -		swap_free(entry);
>>> +		if (hwposioned) {
>>> +			swp_entry = make_hwpoison_entry(swapcache);
>>> +			page = swapcache;
>>
>> This might work for the process accessing to the broken page, but ksm
>> pages are likely to be shared by multiple processes, so it would be
>> much nicer if you can convert all mapping entries for the error ksm page
>> into hwpoisoned ones.  Maybe in this thorough approach,
>> hwpoison_user_mappings() is updated to call try_to_unmap() for ksm pages.
>> But it's not necessary to do this together with applying mcsafe-memcpy,
>> because recovery action and mcsafe-memcpy can be done independently.
>>
> 
> I'm afraid leaving the ksm page in the cache will repeatedly trigger uncorrectable error for the
> same page if ksm pages are shared by multiple processes. This might reach the hardware threshold
> and result in fatal uncorrectable error (thus casuing system to panic). So IMHO it might be better
> to check if page is hwpoisoned before calling ksm_might_need_to_copy() if above thorough approach
> is not implemented. But I can easily be wrong.
> 
Oh, I missed this one. There are only two callers: one in do_swap_page(),
which already checks whether the page is hwpoisoned, and the other in
unuse_pte(), which is only called from swapoff. That is not a hot path, so
I don't think this is an urgent requirement. Thanks.
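
For reference, the existing do_swap_page() check referred to above looks
roughly like this (a sketch; exact code varies by kernel version):

	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		goto out_release;
	}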
  
Kefeng Wang Feb. 1, 2023, 1:33 a.m. UTC | #6
On 2023/2/1 8:32, Andrew Morton wrote:
> On Tue, 13 Dec 2022 20:05:23 +0800 Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
> 
>> When the kernel copy a page from ksm_might_need_to_copy(), but runs
>> into an uncorrectable error, it will crash since poisoned page is
>> consumed by kernel, this is similar to Copy-on-write poison recovery,
>> When an error is detected during the page copy, return VM_FAULT_HWPOISON
>> in do_swap_page(), and install a hwpoison entry in unuse_pte() when
>> swapoff, which help us to avoid system crash. Note, memory failure on
>> a KSM page will be skipped, but still call memory_failure_queue() to
>> be consistent with general memory failure process.
> 
> I believe we're awaiting a v4 of this?

Sorry, I forgot about this one.
> 
> Did we consider a -stable backport?  "kernel crash" sounds undesirable...

This one depends on the Copy-on-write poison recovery patchset, and I
checked that commit a873dfe1032a ("mm, hwpoison: try to recover from
copy-on-write faults") is not included in stable. Both of them are
enhancements of the COPY_MC feature, so it seems that we don't need to
backport this to stable.

> 
> Can we identify a Fixes: target for this?

As it is part of COPY_MC, I don't think it needs a Fixes tag.

I will resend a new version to address the comments from
HORIGUCHI NAOYA (堀口 直也).

Thanks.
  

Patch

diff --git a/mm/ksm.c b/mm/ksm.c
index dd02780c387f..83e2f74ae7da 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2629,8 +2629,12 @@  struct page *ksm_might_need_to_copy(struct page *page,
 		new_page = NULL;
 	}
 	if (new_page) {
-		copy_user_highpage(new_page, page, address, vma);
-
+		if (copy_mc_user_highpage(new_page, page, address, vma)) {
+			put_page(new_page);
+			new_page = ERR_PTR(-EHWPOISON);
+			memory_failure_queue(page_to_pfn(page), 0);
+			return new_page;
+		}
 		SetPageDirty(new_page);
 		__SetPageUptodate(new_page);
 		__SetPageLocked(new_page);
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..5b2c137dfb2a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3840,6 +3840,9 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (unlikely(!page)) {
 			ret = VM_FAULT_OOM;
 			goto out_page;
+		} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
+			ret = VM_FAULT_HWPOISON;
+			goto out_page;
 		}
 		folio = page_folio(page);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 908a529bca12..0efb1c2c2415 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1763,12 +1763,15 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *swapcache;
 	spinlock_t *ptl;
 	pte_t *pte, new_pte;
+	bool hwposioned = false;
 	int ret = 1;
 
 	swapcache = page;
 	page = ksm_might_need_to_copy(page, vma, addr);
 	if (unlikely(!page))
 		return -ENOMEM;
+	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+		hwposioned = true;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
@@ -1776,15 +1779,19 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		goto out;
 	}
 
-	if (unlikely(!PageUptodate(page))) {
-		pte_t pteval;
+	if (hwposioned || !PageUptodate(page)) {
+		swp_entry_t swp_entry;
 
 		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-		pteval = swp_entry_to_pte(make_swapin_error_entry());
-		set_pte_at(vma->vm_mm, addr, pte, pteval);
-		swap_free(entry);
+		if (hwposioned) {
+			swp_entry = make_hwpoison_entry(swapcache);
+			page = swapcache;
+		} else {
+			swp_entry = make_swapin_error_entry();
+		}
+		new_pte = swp_entry_to_pte(swp_entry);
 		ret = 0;
-		goto out;
+		goto setpte;
 	}
 
 	/* See do_swap_page() */
@@ -1816,6 +1823,7 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		new_pte = pte_mksoft_dirty(new_pte);
 	if (pte_swp_uffd_wp(*pte))
 		new_pte = pte_mkuffd_wp(new_pte);
+setpte:
 	set_pte_at(vma->vm_mm, addr, pte, new_pte);
 	swap_free(entry);
 out: