[RFC] mm, hwpoison: Recover from copy-on-write machine checks
Commit Message
If the kernel is copying a page as the result of a copy-on-write
fault and runs into an uncorrectable error, Linux will crash because
it does not have recovery code for this case where poison is consumed
by the kernel.
It is easy to set up a test case. Just inject an error into a private
page, fork(2), and have the child process write to the page.
I wrapped that neatly into a test at:
git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git
just enable ACPI error injection and run:
# ./einj_mem-uc -f copy-on-write
[Note this test needs some better reporting for the case where this
patch has been applied and the system does NOT crash]
Patch below works ... but there are probably many places where it could
fit better into the general "mm" way of doing things. E.g. using the
copy_mc_to_kernel() function does what I need here, but the name doesn't
seem like it is quite right.
Basic idea is very simple ... if the kernel gets a machine check copying
the page, just free up the new page that was going to be the target of
the copy and return VM_FAULT_HWPOISON to the calling stack.
Slightly-signed-off-by: Tony Luck <tony.luck@intel.com>
---
include/linux/highmem.h | 19 +++++++++++++++++++
mm/memory.c | 28 ++++++++++++++++++++--------
2 files changed, 39 insertions(+), 8 deletions(-)
Comments
On Mon, Oct 17, 2022 at 04:42:03PM -0700, Tony Luck wrote:
> If the kernel is copying a page as the result of a copy-on-write
> fault and runs into an uncorrectable error, Linux will crash because
> it does not have recovery code for this case where poison is consumed
> by the kernel.
>
> It is easy to set up a test case. Just inject an error into a private
> page, fork(2), and have the child process write to the page.
>
> I wrapped that neatly into a test at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git
>
> just enable ACPI error injection and run:
>
> # ./einj_mem-uc -f copy-on-write
>
> [Note this test needs some better reporting for the case where this
> patch has been applied and the system does NOT crash]
>
> Patch below works ... but there are probably many places where it could
> fit better into the general "mm" way of doing things. E.g. using the
> copy_mc_to_kernel() function does what I need here, but the name doesn't
> seem like it is quite right.
As for the name "copy_user_highpage_mc", it simply sounds like an mcsafe
variant of copy_user_highpage, so I have no objection.
Recently similar routine is suggested in
https://lore.kernel.org/linux-mm/20221010160142.1087120-2-jiaqiyan@google.com/
, so maybe we need some coordination between related interfaces.
>
> Basic idea is very simple ... if the kernel gets a machine check copying
> the page, just free up the new page that was going to be the target of
> the copy and return VM_FAULT_HWPOISON to the calling stack.
>
> Slightly-signed-off-by: Tony Luck <tony.luck@intel.com>
I'm basically supportive to what this patch intends to do.
> ---
> include/linux/highmem.h | 19 +++++++++++++++++++
> mm/memory.c | 28 ++++++++++++++++++++--------
> 2 files changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index e9912da5441b..5967541fbf0e 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -319,6 +319,25 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
>
> #endif
>
> +static inline int copy_user_highpage_mc(struct page *to, struct page *from,
> + unsigned long vaddr, struct vm_area_struct *vma)
> +{
> + unsigned long ret = 0;
> +#ifdef copy_mc_to_kernel
> + char *vfrom, *vto;
> +
> + vfrom = kmap_local_page(from);
> + vto = kmap_local_page(to);
> + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> + kunmap_local(vto);
> + kunmap_local(vfrom);
> +#else
> + copy_user_highpage(to, from, vaddr, vma);
> +#endif
> +
> + return ret;
> +}
> +
> #ifndef __HAVE_ARCH_COPY_HIGHPAGE
>
> static inline void copy_highpage(struct page *to, struct page *from)
> diff --git a/mm/memory.c b/mm/memory.c
> index f88c351aecd4..b5e22bf4c10a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2848,8 +2848,14 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
> return same;
> }
>
> -static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
> - struct vm_fault *vmf)
> +/*
> + * Return:
> + * -1 = copy failed due to poison in source page
Simply calling "poison" might cause confusion with page poisoning feature,
so "hwpoison" might be better. But I know that "poison" is commonly used
under arch/x86, and it's not clear to me what to do with this terminology.
Maybe using -EHWPOISON instead of -1 might be helpful to the distinction.
> + * 0 = copied failed (some other reason)
> + * 1 = copied succeeded
> + */
> +static inline int __wp_page_copy_user(struct page *dst, struct page *src,
> + struct vm_fault *vmf)
> {
> bool ret;
> void *kaddr;
...
> @@ -3121,7 +3129,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> if (!new_page)
> goto oom;
>
> - if (!__wp_page_copy_user(new_page, old_page, vmf)) {
> + ret = __wp_page_copy_user(new_page, old_page, vmf);
> + if (ret == -1) {
> + put_page(new_page);
Maybe I miss something, but don't you have to care about refcount of
old_page in this branch (as done in "ret == 0" branch)?
Thanks,
Naoya Horiguchi
> + return VM_FAULT_HWPOISON;
> + } else if (ret == 0) {
> /*
> * COW failed, if the fault was solved by other,
> * it's fine. If not, userspace would re-fault on
> --
> 2.37.3
>> + * -1 = copy failed due to poison in source page
> Simply calling "poison" might cause confusion with page poisoning feature,
> so "hwpoison" might be better. But I know that "poison" is commonly used
> under arch/x86, and it's not clear to me what to do with this terminology.
> Maybe using -EHWPOISON instead of -1 might be helpful to the distinction.
Agreed. Using -EHWPOISON return is clearer here.
>> - if (!__wp_page_copy_user(new_page, old_page, vmf)) {
>> + ret = __wp_page_copy_user(new_page, old_page, vmf);
>> + if (ret == -1) {
>> + put_page(new_page);
>
> Maybe I miss something, but don't you have to care about refcount of
> old_page in this branch (as done in "ret == 0" branch)?
You didn't miss anything. But I did. More needs to be done with old_page
(it is where the poison is). I got "lucky" just ignoring/forgetting about it in
my patch ... the system just happened to recover, but I think the poison
may not have been handled for the parent process. that still has the page
mapped.. Need to think about this more.
Thanks for the review.
-Tony
@@ -319,6 +319,25 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
#endif
+static inline int copy_user_highpage_mc(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ unsigned long ret = 0;
+#ifdef copy_mc_to_kernel
+ char *vfrom, *vto;
+
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
+ ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
+#else
+ copy_user_highpage(to, from, vaddr, vma);
+#endif
+
+ return ret;
+}
+
#ifndef __HAVE_ARCH_COPY_HIGHPAGE
static inline void copy_highpage(struct page *to, struct page *from)
@@ -2848,8 +2848,14 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
return same;
}
-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
- struct vm_fault *vmf)
+/*
+ * Return:
+ * -1 = copy failed due to poison in source page
+ * 0 = copied failed (some other reason)
+ * 1 = copied succeeded
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
bool ret;
void *kaddr;
@@ -2860,8 +2866,9 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
unsigned long addr = vmf->address;
if (likely(src)) {
- copy_user_highpage(dst, src, addr, vma);
- return true;
+ if (copy_user_highpage_mc(dst, src, addr, vma))
+ return -1;
+ return 1;
}
/*
@@ -2888,7 +2895,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
* and update local tlb only
*/
update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ ret = 0;
goto pte_unlock;
}
@@ -2913,7 +2920,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ ret = 0;
goto pte_unlock;
}
@@ -2932,7 +2939,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
}
}
- ret = true;
+ ret = 1;
pte_unlock:
if (locked)
@@ -3104,6 +3111,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
+ int ret;
delayacct_wpcopy_start();
@@ -3121,7 +3129,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (!new_page)
goto oom;
- if (!__wp_page_copy_user(new_page, old_page, vmf)) {
+ ret = __wp_page_copy_user(new_page, old_page, vmf);
+ if (ret == -1) {
+ put_page(new_page);
+ return VM_FAULT_HWPOISON;
+ } else if (ret == 0) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on