[v2] mm: hwpoison: coredump: support recovery from dump_user_range()

Message ID 20230417045323.11054-1-wangkefeng.wang@huawei.com
State New
Headers
Series [v2] mm: hwpoison: coredump: support recovery from dump_user_range() |

Commit Message

Kefeng Wang April 17, 2023, 4:53 a.m. UTC
  The dump_user_range() is used to copy the user page to a coredump file,
but if a hardware memory error occurred during copy, which called from
__kernel_write_iter() in dump_user_range(), it crashes,

  CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
 
  pc : __memcpy+0x110/0x260
  lr : _copy_from_iter+0x3bc/0x4c8
  ...
  Call trace:
   __memcpy+0x110/0x260
   copy_page_from_iter+0xcc/0x130
   pipe_write+0x164/0x6d8
   __kernel_write_iter+0x9c/0x210
   dump_user_range+0xc8/0x1d8
   elf_core_dump+0x308/0x368
   do_coredump+0x2e8/0xa40
   get_signal+0x59c/0x788
   do_signal+0x118/0x1f8
   do_notify_resume+0xf0/0x280
   el0_da+0x130/0x138
   el0t_64_sync_handler+0x68/0xc0
   el0t_64_sync+0x188/0x190

Generally, the '->write_iter' of file ops will use copy_page_from_iter()
and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
in both of them to handle #MC during source read, which stop coredump
processing and kill the task instead of kernel panic, but the source
address may not always a user address, so introduce a new copy_mc flag in
struct iov_iter{} to indicate that the iter could do a safe memory copy,
also introduce the helpers to set/cleck the flag, for now, it's only
used in coredump's dump_user_range(), but it could expand to any other
scenarios to fix the similar issue.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Tong Tiangen <tongtiangen@huawei.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
v2:
- move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
- reposition the copy_mc in struct iov_iter for easy merge, suggested
  by Andrew Morton
- drop unnecessary clear flag helper
- fix checkpatch warning
 fs/coredump.c       |  1 +
 include/linux/uio.h | 16 ++++++++++++++++
 lib/iov_iter.c      | 17 +++++++++++++++--
 3 files changed, 32 insertions(+), 2 deletions(-)
  

Comments

HORIGUCHI NAOYA(堀口 直也) April 18, 2023, 3:13 a.m. UTC | #1
On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
> The dump_user_range() is used to copy the user page to a coredump file,
> but if a hardware memory error occurred during copy, which called from
> __kernel_write_iter() in dump_user_range(), it crashes,
> 
>   CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>  
>   pc : __memcpy+0x110/0x260
>   lr : _copy_from_iter+0x3bc/0x4c8
>   ...
>   Call trace:
>    __memcpy+0x110/0x260
>    copy_page_from_iter+0xcc/0x130
>    pipe_write+0x164/0x6d8
>    __kernel_write_iter+0x9c/0x210
>    dump_user_range+0xc8/0x1d8
>    elf_core_dump+0x308/0x368
>    do_coredump+0x2e8/0xa40
>    get_signal+0x59c/0x788
>    do_signal+0x118/0x1f8
>    do_notify_resume+0xf0/0x280
>    el0_da+0x130/0x138
>    el0t_64_sync_handler+0x68/0xc0
>    el0t_64_sync+0x188/0x190
> 
> Generally, the '->write_iter' of file ops will use copy_page_from_iter()
> and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
> in both of them to handle #MC during source read, which stop coredump
> processing and kill the task instead of kernel panic, but the source
> address may not always a user address, so introduce a new copy_mc flag in
> struct iov_iter{} to indicate that the iter could do a safe memory copy,
> also introduce the helpers to set/cleck the flag, for now, it's only
> used in coredump's dump_user_range(), but it could expand to any other
> scenarios to fix the similar issue.
> 
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Miaohe Lin <linmiaohe@huawei.com>
> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
> Cc: Tong Tiangen <tongtiangen@huawei.com>
> Cc: Jens Axboe <axboe@kernel.dk>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
> v2:
> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>   by Andrew Morton
> - drop unnecessary clear flag helper
> - fix checkpatch warning
>  fs/coredump.c       |  1 +
>  include/linux/uio.h | 16 ++++++++++++++++
>  lib/iov_iter.c      | 17 +++++++++++++++--
>  3 files changed, 32 insertions(+), 2 deletions(-)
> 
...
> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>  EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>  #endif /* CONFIG_ARCH_HAS_COPY_MC */
>  
> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
> +				 size_t size)
> +{
> +	if (iov_iter_is_copy_mc(i))
> +		return (void *)copy_mc_to_kernel(to, from, size);

Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
due to a memory error?

Thanks,
Naoya Horiguchi

> +	return memcpy(to, from, size);
> +}
> +
>  size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
>  {
>  	if (WARN_ON_ONCE(!i->data_source))
  
Kefeng Wang April 18, 2023, 9:45 a.m. UTC | #2
On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
>> The dump_user_range() is used to copy the user page to a coredump file,
>> but if a hardware memory error occurred during copy, which called from
>> __kernel_write_iter() in dump_user_range(), it crashes,
>>
>>    CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>>   
>>    pc : __memcpy+0x110/0x260
>>    lr : _copy_from_iter+0x3bc/0x4c8
>>    ...
>>    Call trace:
>>     __memcpy+0x110/0x260
>>     copy_page_from_iter+0xcc/0x130
>>     pipe_write+0x164/0x6d8
>>     __kernel_write_iter+0x9c/0x210
>>     dump_user_range+0xc8/0x1d8
>>     elf_core_dump+0x308/0x368
>>     do_coredump+0x2e8/0xa40
>>     get_signal+0x59c/0x788
>>     do_signal+0x118/0x1f8
>>     do_notify_resume+0xf0/0x280
>>     el0_da+0x130/0x138
>>     el0t_64_sync_handler+0x68/0xc0
>>     el0t_64_sync+0x188/0x190
>>
>> Generally, the '->write_iter' of file ops will use copy_page_from_iter()
>> and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
>> in both of them to handle #MC during source read, which stop coredump
>> processing and kill the task instead of kernel panic, but the source
>> address may not always a user address, so introduce a new copy_mc flag in
>> struct iov_iter{} to indicate that the iter could do a safe memory copy,
>> also introduce the helpers to set/cleck the flag, for now, it's only
>> used in coredump's dump_user_range(), but it could expand to any other
>> scenarios to fix the similar issue.
>>
>> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>> Cc: Christian Brauner <brauner@kernel.org>
>> Cc: Miaohe Lin <linmiaohe@huawei.com>
>> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
>> Cc: Tong Tiangen <tongtiangen@huawei.com>
>> Cc: Jens Axboe <axboe@kernel.dk>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> ---
>> v2:
>> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
>> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>>    by Andrew Morton
>> - drop unnecessary clear flag helper
>> - fix checkpatch warning
>>   fs/coredump.c       |  1 +
>>   include/linux/uio.h | 16 ++++++++++++++++
>>   lib/iov_iter.c      | 17 +++++++++++++++--
>>   3 files changed, 32 insertions(+), 2 deletions(-)
>>
> ...
>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>>   EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>   #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>   
>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
>> +				 size_t size)
>> +{
>> +	if (iov_iter_is_copy_mc(i))
>> +		return (void *)copy_mc_to_kernel(to, from, size);
> 
> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
> due to a memory error?

For dump_user_range(), the task is dying, if copy incomplete size, the 
coredump will fail and task will exit, also memory_failure will
be called by kill_me_maybe(),

  CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5 #29
  Call Trace:
   <TASK>
   dump_stack_lvl+0x37/0x50
   memory_failure+0x51/0x970
   kill_me_maybe+0x5b/0xc0
   task_work_run+0x5a/0x90
   exit_to_user_mode_prepare+0x194/0x1a0
   irqentry_exit_to_user_mode+0x9/0x30
   noist_exc_machine_check+0x40/0x80
   asm_exc_machine_check+0x33/0x40


> 
> Thanks,
> Naoya Horiguchi
> 
>> +	return memcpy(to, from, size);
>> +}
>> +
>>   size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
>>   {
>>   	if (WARN_ON_ONCE(!i->data_source))
  
HORIGUCHI NAOYA(堀口 直也) April 19, 2023, 7:25 a.m. UTC | #3
On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
> 
> 
> On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
> > > The dump_user_range() is used to copy the user page to a coredump file,
> > > but if a hardware memory error occurred during copy, which called from
> > > __kernel_write_iter() in dump_user_range(), it crashes,
> > > 
> > >    CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
> > >    pc : __memcpy+0x110/0x260
> > >    lr : _copy_from_iter+0x3bc/0x4c8
> > >    ...
> > >    Call trace:
> > >     __memcpy+0x110/0x260
> > >     copy_page_from_iter+0xcc/0x130
> > >     pipe_write+0x164/0x6d8
> > >     __kernel_write_iter+0x9c/0x210
> > >     dump_user_range+0xc8/0x1d8
> > >     elf_core_dump+0x308/0x368
> > >     do_coredump+0x2e8/0xa40
> > >     get_signal+0x59c/0x788
> > >     do_signal+0x118/0x1f8
> > >     do_notify_resume+0xf0/0x280
> > >     el0_da+0x130/0x138
> > >     el0t_64_sync_handler+0x68/0xc0
> > >     el0t_64_sync+0x188/0x190
> > > 
> > > Generally, the '->write_iter' of file ops will use copy_page_from_iter()
> > > and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
> > > in both of them to handle #MC during source read, which stop coredump
> > > processing and kill the task instead of kernel panic, but the source
> > > address may not always a user address, so introduce a new copy_mc flag in
> > > struct iov_iter{} to indicate that the iter could do a safe memory copy,
> > > also introduce the helpers to set/cleck the flag, for now, it's only
> > > used in coredump's dump_user_range(), but it could expand to any other
> > > scenarios to fix the similar issue.
> > > 
> > > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > > Cc: Christian Brauner <brauner@kernel.org>
> > > Cc: Miaohe Lin <linmiaohe@huawei.com>
> > > Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > Cc: Tong Tiangen <tongtiangen@huawei.com>
> > > Cc: Jens Axboe <axboe@kernel.dk>
> > > Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> > > ---
> > > v2:
> > > - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
> > > - reposition the copy_mc in struct iov_iter for easy merge, suggested
> > >    by Andrew Morton
> > > - drop unnecessary clear flag helper
> > > - fix checkpatch warning
> > >   fs/coredump.c       |  1 +
> > >   include/linux/uio.h | 16 ++++++++++++++++
> > >   lib/iov_iter.c      | 17 +++++++++++++++--
> > >   3 files changed, 32 insertions(+), 2 deletions(-)
> > > 
> > ...
> > > @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
> > >   EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
> > >   #endif /* CONFIG_ARCH_HAS_COPY_MC */
> > > +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
> > > +				 size_t size)
> > > +{
> > > +	if (iov_iter_is_copy_mc(i))
> > > +		return (void *)copy_mc_to_kernel(to, from, size);
> > 
> > Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
> > due to a memory error?
> 
> For dump_user_range(), the task is dying, if copy incomplete size, the
> coredump will fail and task will exit, also memory_failure will
> be called by kill_me_maybe(),
> 
>  CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5 #29
>  Call Trace:
>   <TASK>
>   dump_stack_lvl+0x37/0x50
>   memory_failure+0x51/0x970
>   kill_me_maybe+0x5b/0xc0
>   task_work_run+0x5a/0x90
>   exit_to_user_mode_prepare+0x194/0x1a0
>   irqentry_exit_to_user_mode+0x9/0x30
>   noist_exc_machine_check+0x40/0x80
>   asm_exc_machine_check+0x33/0x40

Is this call trace printed out when copy_mc_to_kernel() failed by finding
a memory error (or in some testcase using error injection)?
In my understanding, an MCE should not be triggered when MC-safe copy tries
to access to a memory error.  So I feel that we might be talking about
different scenarios.

When I questioned previously, I thought about the following scenario:

  - a process terminates abnormally for any reason like segmentation fault,
  - then, kernel tries to create a coredump,
  - during this, the copying routine accesses to corrupted page to read.

In this case the corrupted page should not be handled by memory_failure()
yet (because otherwise properly handled hwpoisoned page should be ignored
by coredump process).  The coredump process would exit with failure with
your patch, but then, the corrupted page is still left unhandled and can
be reused, so any other thread can easily access to it again.

You can find a few other places (like __wp_page_copy_user and ksm_might_need_to_copy)
to call memory_failure_queue() to cope with such unhandled error pages.
So does memcpy_from_iter() do the same?

Thanks,
Naoya Horiguchi
  
Kefeng Wang April 19, 2023, 12:03 p.m. UTC | #4
On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>
>>
>> On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
>>> On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
>>>> The dump_user_range() is used to copy the user page to a coredump file,
>>>> but if a hardware memory error occurred during copy, which called from
>>>> __kernel_write_iter() in dump_user_range(), it crashes,
>>>>
>>>>     CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>>>>     pc : __memcpy+0x110/0x260
>>>>     lr : _copy_from_iter+0x3bc/0x4c8
>>>>     ...
>>>>     Call trace:
>>>>      __memcpy+0x110/0x260
>>>>      copy_page_from_iter+0xcc/0x130
>>>>      pipe_write+0x164/0x6d8
>>>>      __kernel_write_iter+0x9c/0x210
>>>>      dump_user_range+0xc8/0x1d8
>>>>      elf_core_dump+0x308/0x368
>>>>      do_coredump+0x2e8/0xa40
>>>>      get_signal+0x59c/0x788
>>>>      do_signal+0x118/0x1f8
>>>>      do_notify_resume+0xf0/0x280
>>>>      el0_da+0x130/0x138
>>>>      el0t_64_sync_handler+0x68/0xc0
>>>>      el0t_64_sync+0x188/0x190
>>>>
>>>> Generally, the '->write_iter' of file ops will use copy_page_from_iter()
>>>> and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
>>>> in both of them to handle #MC during source read, which stop coredump
>>>> processing and kill the task instead of kernel panic, but the source
>>>> address may not always a user address, so introduce a new copy_mc flag in
>>>> struct iov_iter{} to indicate that the iter could do a safe memory copy,
>>>> also introduce the helpers to set/cleck the flag, for now, it's only
>>>> used in coredump's dump_user_range(), but it could expand to any other
>>>> scenarios to fix the similar issue.
>>>>
>>>> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>>>> Cc: Christian Brauner <brauner@kernel.org>
>>>> Cc: Miaohe Lin <linmiaohe@huawei.com>
>>>> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>> Cc: Tong Tiangen <tongtiangen@huawei.com>
>>>> Cc: Jens Axboe <axboe@kernel.dk>
>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>> ---
>>>> v2:
>>>> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
>>>> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>>>>     by Andrew Morton
>>>> - drop unnecessary clear flag helper
>>>> - fix checkpatch warning
>>>>    fs/coredump.c       |  1 +
>>>>    include/linux/uio.h | 16 ++++++++++++++++
>>>>    lib/iov_iter.c      | 17 +++++++++++++++--
>>>>    3 files changed, 32 insertions(+), 2 deletions(-)
>>>>
>>> ...
>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
>>>> +				 size_t size)
>>>> +{
>>>> +	if (iov_iter_is_copy_mc(i))
>>>> +		return (void *)copy_mc_to_kernel(to, from, size);
>>>
>>> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
>>> due to a memory error?
>>
>> For dump_user_range(), the task is dying, if copy incomplete size, the
>> coredump will fail and task will exit, also memory_failure will
>> be called by kill_me_maybe(),
>>
>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5 #29
>>   Call Trace:
>>    <TASK>
>>    dump_stack_lvl+0x37/0x50
>>    memory_failure+0x51/0x970
>>    kill_me_maybe+0x5b/0xc0
>>    task_work_run+0x5a/0x90
>>    exit_to_user_mode_prepare+0x194/0x1a0
>>    irqentry_exit_to_user_mode+0x9/0x30
>>    noist_exc_machine_check+0x40/0x80
>>    asm_exc_machine_check+0x33/0x40
> 
> Is this call trace printed out when copy_mc_to_kernel() failed by finding
> a memory error (or in some testcase using error injection)?

I add dump_stack() into memory_failure() to check whether the poisoned
memory is called or not, and the call trace shows it do call
memory_failure(), but I get confused when do the test.

> In my understanding, an MCE should not be triggered when MC-safe copy tries
> to access to a memory error.  So I feel that we might be talking about
> different scenarios.
> 
> When I questioned previously, I thought about the following scenario:
> 
>    - a process terminates abnormally for any reason like segmentation fault,
>    - then, kernel tries to create a coredump,
>    - during this, the copying routine accesses to corrupted page to read.
> 
Yes, we tested like your described,

1) inject memory error into a process
2) send a SIGABT/SIGBUS to process to trigger the coredump

Without patch, the system panic, and with patch only process exits.

> In this case the corrupted page should not be handled by memory_failure()
> yet (because otherwise properly handled hwpoisoned page should be ignored
> by coredump process).  The coredump process would exit with failure with
> your patch, but then, the corrupted page is still left unhandled and can
> be reused, so any other thread can easily access to it again.

As shown above, the corrupted page will be handled by memory_failure(), 
but what I'm wondering,
1) memory_failure() is not always called
2) look at the above call trace, it looks like from asynchronous
    interrupt, not from synchronous exception, right?

> 
> You can find a few other places (like __wp_page_copy_user and ksm_might_need_to_copy)
> to call memory_failure_queue() to cope with such unhandled error pages.
> So does memcpy_from_iter() do the same?

I add some debug print in do_machine_check() on x86:

1) COW,
   m.kflags: MCE_IN_KERNEL_RECOV
   fixup_type: EX_TYPE_DEFAULT_MCE_SAFE

   CPU: 11 PID: 2038 Comm: einj_mem_uc
   Call Trace:
    <#MC>
    dump_stack_lvl+0x37/0x50
    do_machine_check+0x7ad/0x840
    exc_machine_check+0x5a/0x90
    asm_exc_machine_check+0x1e/0x40
   RIP: 0010:copy_mc_fragile+0x35/0x62

   if (m.kflags & MCE_IN_KERNEL_RECOV) {
           if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
                   mce_panic("Failed kernel mode recovery", &m, msg);
   }

   if (m.kflags & MCE_IN_KERNEL_COPYIN)
           queue_task_work(&m, msg, kill_me_never);

There is no memory_failure() called when
EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
so we manually add a memory_failure_queue() to handle with
the poisoned page.

2) Coredump,  nothing print about m.kflags and fixup_type,
with above check, add a memory_failure_queue() or memory_failure() seems
to be needed for memcpy_from_iter(), but it is totally different from
the COW scenario


Another question, other copy_mc_to_kernel() callers, eg,
nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
should they need a memory_failure_queue(), if so, why not add it into
do_machine_check() ?

Thanks.



> 
> Thanks,
> Naoya Horiguchi
  
Jane Chu April 20, 2023, 2:03 a.m. UTC | #5
On 4/19/2023 5:03 AM, Kefeng Wang wrote:
> 
> 
> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>
>>>
>>> On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>> On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
>>>>> The dump_user_range() is used to copy the user page to a coredump 
>>>>> file,
>>>>> but if a hardware memory error occurred during copy, which called from
>>>>> __kernel_write_iter() in dump_user_range(), it crashes,
>>>>>
>>>>>     CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>>>>>     pc : __memcpy+0x110/0x260
>>>>>     lr : _copy_from_iter+0x3bc/0x4c8
>>>>>     ...
>>>>>     Call trace:
>>>>>      __memcpy+0x110/0x260
>>>>>      copy_page_from_iter+0xcc/0x130
>>>>>      pipe_write+0x164/0x6d8
>>>>>      __kernel_write_iter+0x9c/0x210
>>>>>      dump_user_range+0xc8/0x1d8
>>>>>      elf_core_dump+0x308/0x368
>>>>>      do_coredump+0x2e8/0xa40
>>>>>      get_signal+0x59c/0x788
>>>>>      do_signal+0x118/0x1f8
>>>>>      do_notify_resume+0xf0/0x280
>>>>>      el0_da+0x130/0x138
>>>>>      el0t_64_sync_handler+0x68/0xc0
>>>>>      el0t_64_sync+0x188/0x190
>>>>>
>>>>> Generally, the '->write_iter' of file ops will use 
>>>>> copy_page_from_iter()
>>>>> and copy_page_from_iter_atomic(), change memcpy() to 
>>>>> copy_mc_to_kernel()
>>>>> in both of them to handle #MC during source read, which stop coredump
>>>>> processing and kill the task instead of kernel panic, but the source
>>>>> address may not always a user address, so introduce a new copy_mc 
>>>>> flag in
>>>>> struct iov_iter{} to indicate that the iter could do a safe memory 
>>>>> copy,
>>>>> also introduce the helpers to set/cleck the flag, for now, it's only
>>>>> used in coredump's dump_user_range(), but it could expand to any other
>>>>> scenarios to fix the similar issue.
>>>>>
>>>>> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>>>>> Cc: Christian Brauner <brauner@kernel.org>
>>>>> Cc: Miaohe Lin <linmiaohe@huawei.com>
>>>>> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>>> Cc: Tong Tiangen <tongtiangen@huawei.com>
>>>>> Cc: Jens Axboe <axboe@kernel.dk>
>>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>>> ---
>>>>> v2:
>>>>> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
>>>>> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>>>>>     by Andrew Morton
>>>>> - drop unnecessary clear flag helper
>>>>> - fix checkpatch warning
>>>>>    fs/coredump.c       |  1 +
>>>>>    include/linux/uio.h | 16 ++++++++++++++++
>>>>>    lib/iov_iter.c      | 17 +++++++++++++++--
>>>>>    3 files changed, 32 insertions(+), 2 deletions(-)
>>>>>
>>>> ...
>>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, 
>>>>> size_t bytes, struct iov_iter *i)
>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const 
>>>>> void *from,
>>>>> +                 size_t size)
>>>>> +{
>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>
>>>> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() 
>>>> fails
>>>> due to a memory error?
>>>
>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>> coredump will fail and task will exit, also memory_failure will
>>> be called by kill_me_maybe(),
>>>
>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5 #29
>>>   Call Trace:
>>>    <TASK>
>>>    dump_stack_lvl+0x37/0x50
>>>    memory_failure+0x51/0x970
>>>    kill_me_maybe+0x5b/0xc0
>>>    task_work_run+0x5a/0x90
>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>    noist_exc_machine_check+0x40/0x80
>>>    asm_exc_machine_check+0x33/0x40
>>
>> Is this call trace printed out when copy_mc_to_kernel() failed by finding
>> a memory error (or in some testcase using error injection)?
> 
> I add dump_stack() into memory_failure() to check whether the poisoned
> memory is called or not, and the call trace shows it do call
> memory_failure(), but I get confused when do the test.
> 
>> In my understanding, an MCE should not be triggered when MC-safe copy 
>> tries
>> to access to a memory error.  So I feel that we might be talking about
>> different scenarios.
>>
>> When I questioned previously, I thought about the following scenario:
>>
>>    - a process terminates abnormally for any reason like segmentation 
>> fault,
>>    - then, kernel tries to create a coredump,
>>    - during this, the copying routine accesses to corrupted page to read.
>>
> Yes, we tested like your described,
> 
> 1) inject memory error into a process
> 2) send a SIGABT/SIGBUS to process to trigger the coredump
> 
> Without patch, the system panic, and with patch only process exits.
> 
>> In this case the corrupted page should not be handled by memory_failure()
>> yet (because otherwise properly handled hwpoisoned page should be ignored
>> by coredump process).  The coredump process would exit with failure with
>> your patch, but then, the corrupted page is still left unhandled and can
>> be reused, so any other thread can easily access to it again.
> 
> As shown above, the corrupted page will be handled by memory_failure(), 
> but what I'm wondering,
> 1) memory_failure() is not always called
> 2) look at the above call trace, it looks like from asynchronous
>     interrupt, not from synchronous exception, right?
> 
>>
>> You can find a few other places (like __wp_page_copy_user and 
>> ksm_might_need_to_copy)
>> to call memory_failure_queue() to cope with such unhandled error pages.
>> So does memcpy_from_iter() do the same?
> 
> I add some debug print in do_machine_check() on x86:
> 
> 1) COW,
>    m.kflags: MCE_IN_KERNEL_RECOV
>    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
> 
>    CPU: 11 PID: 2038 Comm: einj_mem_uc
>    Call Trace:
>     <#MC>
>     dump_stack_lvl+0x37/0x50
>     do_machine_check+0x7ad/0x840
>     exc_machine_check+0x5a/0x90
>     asm_exc_machine_check+0x1e/0x40
>    RIP: 0010:copy_mc_fragile+0x35/0x62
> 
>    if (m.kflags & MCE_IN_KERNEL_RECOV) {
>            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>                    mce_panic("Failed kernel mode recovery", &m, msg);
>    }
> 
>    if (m.kflags & MCE_IN_KERNEL_COPYIN)
>            queue_task_work(&m, msg, kill_me_never);
> 
> There is no memory_failure() called when
> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
> so we manually add a memory_failure_queue() to handle with
> the poisoned page.
> 
> 2) Coredump,  nothing print about m.kflags and fixup_type,
> with above check, add a memory_failure_queue() or memory_failure() seems
> to be needed for memcpy_from_iter(), but it is totally different from
> the COW scenario
> 
> 
> Another question, other copy_mc_to_kernel() callers, eg,
> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
> should they need a memory_failure_queue(), if so, why not add it into
> do_machine_check() ?

In the dax case, if the source address is poisoned, and we do follow up 
with memory_failure_queue(pfn, flags), what should the value of the 
'flags' be ?

thanks,
-jane

> 
> Thanks.
> 
> 
> 
>>
>> Thanks,
>> Naoya Horiguchi
>
  
Kefeng Wang April 20, 2023, 2:59 a.m. UTC | #6
On 2023/4/20 10:03, Jane Chu wrote:
> 
> On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>>
>>
>> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>>
>>>>
...
>>>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, 
>>>>>> size_t bytes, struct iov_iter *i)
>>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const 
>>>>>> void *from,
>>>>>> +                 size_t size)
>>>>>> +{
>>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>>
>>>>> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() 
>>>>> fails
>>>>> due to a memory error?
>>>>
>>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>>> coredump will fail and task will exit, also memory_failure will
>>>> be called by kill_me_maybe(),
>>>>
>>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5 
>>>> #29
>>>>   Call Trace:
>>>>    <TASK>
>>>>    dump_stack_lvl+0x37/0x50
>>>>    memory_failure+0x51/0x970
>>>>    kill_me_maybe+0x5b/0xc0
>>>>    task_work_run+0x5a/0x90
>>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>>    noist_exc_machine_check+0x40/0x80
>>>>    asm_exc_machine_check+0x33/0x40
>>>
>>> Is this call trace printed out when copy_mc_to_kernel() failed by 
>>> finding
>>> a memory error (or in some testcase using error injection)?
>>
>> I add dump_stack() into memory_failure() to check whether the poisoned
>> memory is called or not, and the call trace shows it do call
>> memory_failure(), but I get confused when do the test.
>>
>>> In my understanding, an MCE should not be triggered when MC-safe copy 
>>> tries
>>> to access to a memory error.  So I feel that we might be talking about
>>> different scenarios.
>>>
>>> When I questioned previously, I thought about the following scenario:
>>>
>>>    - a process terminates abnormally for any reason like segmentation 
>>> fault,
>>>    - then, kernel tries to create a coredump,
>>>    - during this, the copying routine accesses to corrupted page to 
>>> read.
>>>
>> Yes, we tested like your described,
>>
>> 1) inject memory error into a process
>> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>>
>> Without patch, the system panic, and with patch only process exits.
>>
>>> In this case the corrupted page should not be handled by 
>>> memory_failure()
>>> yet (because otherwise properly handled hwpoisoned page should be 
>>> ignored
>>> by coredump process).  The coredump process would exit with failure with
>>> your patch, but then, the corrupted page is still left unhandled and can
>>> be reused, so any other thread can easily access to it again.
>>
>> As shown above, the corrupted page will be handled by 
>> memory_failure(), but what I'm wondering,
>> 1) memory_failure() is not always called
>> 2) look at the above call trace, it looks like from asynchronous
>>     interrupt, not from synchronous exception, right?
>>
>>>
>>> You can find a few other places (like __wp_page_copy_user and 
>>> ksm_might_need_to_copy)
>>> to call memory_failure_queue() to cope with such unhandled error pages.
>>> So does memcpy_from_iter() do the same?
>>
>> I add some debug print in do_machine_check() on x86:
>>
>> 1) COW,
>>    m.kflags: MCE_IN_KERNEL_RECOV
>>    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>>
>>    CPU: 11 PID: 2038 Comm: einj_mem_uc
>>    Call Trace:
>>     <#MC>
>>     dump_stack_lvl+0x37/0x50
>>     do_machine_check+0x7ad/0x840
>>     exc_machine_check+0x5a/0x90
>>     asm_exc_machine_check+0x1e/0x40
>>    RIP: 0010:copy_mc_fragile+0x35/0x62
>>
>>    if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>>                    mce_panic("Failed kernel mode recovery", &m, msg);
>>    }
>>
>>    if (m.kflags & MCE_IN_KERNEL_COPYIN)
>>            queue_task_work(&m, msg, kill_me_never);
>>
>> There is no memory_failure() called when
>> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
>> so we manually add a memory_failure_queue() to handle with
>> the poisoned page.
>>
>> 2) Coredump,  nothing print about m.kflags and fixup_type,
>> with above check, add a memory_failure_queue() or memory_failure() seems
>> to be needed for memcpy_from_iter(), but it is totally different from
>> the COW scenario
>>
>>
>> Another question, other copy_mc_to_kernel() callers, eg,
>> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
>> should they need a memory_failure_queue(), if so, why not add it into
>> do_machine_check() ?
> 

What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
is designed to identify fixups which allow in kernel #MC recovery,
that is, the caller of copy_mc_to_kernel() must know the source
is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
the MCE_SAFE type.

diff --git a/arch/x86/kernel/cpu/mce/severity.c 
b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..63e94484c5d6 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m, 
struct pt_regs *regs)
         case EX_TYPE_COPY:
                 if (!copy_user)
                         return IN_KERNEL;
-               m->kflags |= MCE_IN_KERNEL_COPYIN;
                 fallthrough;

         case EX_TYPE_FAULT_MCE_SAFE:
         case EX_TYPE_DEFAULT_MCE_SAFE:
-               m->kflags |= MCE_IN_KERNEL_RECOV;
+               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
                 return IN_KERNEL_RECOV;

         default:

then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy, 
or every Machine Check safe memory copy will need a memory_failure_xx() 
call.

+Thomas,who add the two types, could you share some comments about 
this,thanks.

> In the dax case, if the source address is poisoned, and we do follow up 
> with memory_failure_queue(pfn, flags), what should the value of the 
> 'flags' be ?


I think flags = 0 is enough to for all copy_mc_xxx to isolate the 
poisoned page.

Thanks.
  
Kefeng Wang April 20, 2023, 3:05 p.m. UTC | #7
On 2023/4/20 10:59, Kefeng Wang wrote:
> 
> 
> On 2023/4/20 10:03, Jane Chu wrote:
>>
>> On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>>>
>>>
>>> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>>>
>>>>>
> ...
>>>>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, 
>>>>>>> size_t bytes, struct iov_iter *i)
>>>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, 
>>>>>>> const void *from,
>>>>>>> +                 size_t size)
>>>>>>> +{
>>>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>>>
>>>>>> Is it helpful to call memory_failure_queue() if 
>>>>>> copy_mc_to_kernel() fails
>>>>>> due to a memory error?
>>>>>
>>>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>>>> coredump will fail and task will exit, also memory_failure will
>>>>> be called by kill_me_maybe(),
>>>>>
>>>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M               
>>>>> 6.3.0-rc5 #29
>>>>>   Call Trace:
>>>>>    <TASK>
>>>>>    dump_stack_lvl+0x37/0x50
>>>>>    memory_failure+0x51/0x970
>>>>>    kill_me_maybe+0x5b/0xc0
>>>>>    task_work_run+0x5a/0x90
>>>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>>>    noist_exc_machine_check+0x40/0x80
>>>>>    asm_exc_machine_check+0x33/0x40
>>>>
>>>> Is this call trace printed out when copy_mc_to_kernel() failed by 
>>>> finding
>>>> a memory error (or in some testcase using error injection)?
>>>
>>> I add dump_stack() into memory_failure() to check whether the poisoned
>>> memory is called or not, and the call trace shows it do call
>>> memory_failure(), but I get confused when do the test.
>>>
>>>> In my understanding, an MCE should not be triggered when MC-safe 
>>>> copy tries
>>>> to access to a memory error.  So I feel that we might be talking about
>>>> different scenarios.
>>>>
>>>> When I questioned previously, I thought about the following scenario:
>>>>
>>>>    - a process terminates abnormally for any reason like 
>>>> segmentation fault,
>>>>    - then, kernel tries to create a coredump,
>>>>    - during this, the copying routine accesses to corrupted page to 
>>>> read.
>>>>
>>> Yes, we tested like your described,
>>>
>>> 1) inject memory error into a process
>>> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>>>
>>> Without patch, the system panic, and with patch only process exits.
>>>
>>>> In this case the corrupted page should not be handled by 
>>>> memory_failure()
>>>> yet (because otherwise properly handled hwpoisoned page should be 
>>>> ignored
>>>> by coredump process).  The coredump process would exit with failure 
>>>> with
>>>> your patch, but then, the corrupted page is still left unhandled and 
>>>> can
>>>> be reused, so any other thread can easily access to it again.
>>>
>>> As shown above, the corrupted page will be handled by 
>>> memory_failure(), but what I'm wondering,
>>> 1) memory_failure() is not always called
>>> 2) look at the above call trace, it looks like from asynchronous
>>>     interrupt, not from synchronous exception, right?
>>>
>>>>
>>>> You can find a few other places (like __wp_page_copy_user and 
>>>> ksm_might_need_to_copy)
>>>> to call memory_failure_queue() to cope with such unhandled error pages.
>>>> So does memcpy_from_iter() do the same?
>>>
>>> I add some debug print in do_machine_check() on x86:
>>>
>>> 1) COW,
>>>    m.kflags: MCE_IN_KERNEL_RECOV
>>>    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>>>
>>>    CPU: 11 PID: 2038 Comm: einj_mem_uc
>>>    Call Trace:
>>>     <#MC>
>>>     dump_stack_lvl+0x37/0x50
>>>     do_machine_check+0x7ad/0x840
>>>     exc_machine_check+0x5a/0x90
>>>     asm_exc_machine_check+0x1e/0x40
>>>    RIP: 0010:copy_mc_fragile+0x35/0x62
>>>
>>>    if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>>            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>>>                    mce_panic("Failed kernel mode recovery", &m, msg);
>>>    }
>>>
>>>    if (m.kflags & MCE_IN_KERNEL_COPYIN)
>>>            queue_task_work(&m, msg, kill_me_never);
>>>
>>> There is no memory_failure() called when
>>> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
>>> so we manually add a memory_failure_queue() to handle with
>>> the poisoned page.
>>>
>>> 2) Coredump,  nothing print about m.kflags and fixup_type,

Sorry,I forget to set coredump file size :(

The coredump do trigger the do_machine_check() with same m.kflags and 
fixup_type like cow


>>> with above check, add a memory_failure_queue() or memory_failure() seems
>>> to be needed for memcpy_from_iter(), but it is totally different from
>>> the COW scenario
>>>

so the memcpy_from_iter() from coredump is same as cow scenario.

>>>
>>> Another question, other copy_mc_to_kernel() callers, eg,
>>> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
>>> should they need a memory_failure_queue(), if so, why not add it into
>>> do_machine_check() ?
>>
> 
> What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
> is designed to identify fixups which allow in kernel #MC recovery,
> that is, the caller of copy_mc_to_kernel() must know the source
> is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
> the MCE_SAFE type.

And I think we need the following change for MCE_SAFE copy to set
MCE_IN_KERNEL_COPYIN.

> 
> diff --git a/arch/x86/kernel/cpu/mce/severity.c 
> b/arch/x86/kernel/cpu/mce/severity.c
> index c4477162c07d..63e94484c5d6 100644
> --- a/arch/x86/kernel/cpu/mce/severity.c
> +++ b/arch/x86/kernel/cpu/mce/severity.c
> @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m, 
> struct pt_regs *regs)
>          case EX_TYPE_COPY:
>                  if (!copy_user)
>                          return IN_KERNEL;
> -               m->kflags |= MCE_IN_KERNEL_COPYIN;
>                  fallthrough;
> 
>          case EX_TYPE_FAULT_MCE_SAFE:
>          case EX_TYPE_DEFAULT_MCE_SAFE:
> -               m->kflags |= MCE_IN_KERNEL_RECOV;
> +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
>                  return IN_KERNEL_RECOV;
> 
>          default:
> 
> then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy, 
> or every Machine Check safe memory copy will need a memory_failure_xx() 
> call.

which help use to kill unneeded memory_failure_queue() call, any comments?

> 
> +Thomas,who add the two types, could you share some comments about 
> this,thanks.
> 
>> In the dax case, if the source address is poisoned, and we do follow 
>> up with memory_failure_queue(pfn, flags), what should the value of the 
>> 'flags' be ?
> 

With above diff change, we don't add a memory_failure_queue() into dax too.

Thanks

> 
> I think flags = 0 is enough to for all copy_mc_xxx to isolate the 
> poisoned page.
> 
> Thanks.
  
HORIGUCHI NAOYA(堀口 直也) April 21, 2023, 3:13 a.m. UTC | #8
On Thu, Apr 20, 2023 at 11:05:12PM +0800, Kefeng Wang wrote:
> 
> 
> On 2023/4/20 10:59, Kefeng Wang wrote:
> > 
> > 
> > On 2023/4/20 10:03, Jane Chu wrote:
> > > 
> > > On 4/19/2023 5:03 AM, Kefeng Wang wrote:
> > > > 
> > > > 
> > > > On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
> > > > > On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
> > > > > > 
> > > > > > 
> > ...
> > > > > > > > @@ -371,6 +372,14 @@ size_t
> > > > > > > > _copy_mc_to_iter(const void *addr, size_t bytes,
> > > > > > > > struct iov_iter *i)
> > > > > > > >    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
> > > > > > > >    #endif /* CONFIG_ARCH_HAS_COPY_MC */
> > > > > > > > +static void *memcpy_from_iter(struct iov_iter
> > > > > > > > *i, void *to, const void *from,
> > > > > > > > +                 size_t size)
> > > > > > > > +{
> > > > > > > > +    if (iov_iter_is_copy_mc(i))
> > > > > > > > +        return (void *)copy_mc_to_kernel(to, from, size);
> > > > > > > 
> > > > > > > Is it helpful to call memory_failure_queue() if
> > > > > > > copy_mc_to_kernel() fails
> > > > > > > due to a memory error?
> > > > > > 
> > > > > > For dump_user_range(), the task is dying, if copy incomplete size, the
> > > > > > coredump will fail and task will exit, also memory_failure will
> > > > > > be called by kill_me_maybe(),
> > > > > > 
> > > > > >   CPU: 0 PID: 1418 Comm: test Tainted: G   M
> > > > > > 6.3.0-rc5 #29
> > > > > >   Call Trace:
> > > > > >    <TASK>
> > > > > >    dump_stack_lvl+0x37/0x50
> > > > > >    memory_failure+0x51/0x970
> > > > > >    kill_me_maybe+0x5b/0xc0
> > > > > >    task_work_run+0x5a/0x90
> > > > > >    exit_to_user_mode_prepare+0x194/0x1a0
> > > > > >    irqentry_exit_to_user_mode+0x9/0x30
> > > > > >    noist_exc_machine_check+0x40/0x80
> > > > > >    asm_exc_machine_check+0x33/0x40
> > > > > 
> > > > > Is this call trace printed out when copy_mc_to_kernel()
> > > > > failed by finding
> > > > > a memory error (or in some testcase using error injection)?
> > > > 
> > > > I add dump_stack() into memory_failure() to check whether the poisoned
> > > > memory is called or not, and the call trace shows it do call
> > > > memory_failure(), but I get confused when do the test.
> > > > 
> > > > > In my understanding, an MCE should not be triggered when
> > > > > MC-safe copy tries
> > > > > to access to a memory error.  So I feel that we might be talking about
> > > > > different scenarios.
> > > > > 
> > > > > When I questioned previously, I thought about the following scenario:
> > > > > 
> > > > >    - a process terminates abnormally for any reason like
> > > > > segmentation fault,
> > > > >    - then, kernel tries to create a coredump,
> > > > >    - during this, the copying routine accesses to corrupted
> > > > > page to read.
> > > > > 
> > > > Yes, we tested like your described,
> > > > 
> > > > 1) inject memory error into a process
> > > > 2) send a SIGABT/SIGBUS to process to trigger the coredump
> > > > 
> > > > Without patch, the system panic, and with patch only process exits.
> > > > 
> > > > > In this case the corrupted page should not be handled by
> > > > > memory_failure()
> > > > > yet (because otherwise properly handled hwpoisoned page
> > > > > should be ignored
> > > > > by coredump process).  The coredump process would exit with
> > > > > failure with
> > > > > your patch, but then, the corrupted page is still left
> > > > > unhandled and can
> > > > > be reused, so any other thread can easily access to it again.
> > > > 
> > > > As shown above, the corrupted page will be handled by
> > > > memory_failure(), but what I'm wondering,
> > > > 1) memory_failure() is not always called
> > > > 2) look at the above call trace, it looks like from asynchronous
> > > >     interrupt, not from synchronous exception, right?
> > > > 
> > > > > 
> > > > > You can find a few other places (like __wp_page_copy_user
> > > > > and ksm_might_need_to_copy)
> > > > > to call memory_failure_queue() to cope with such unhandled error pages.
> > > > > So does memcpy_from_iter() do the same?
> > > > 
> > > > I add some debug print in do_machine_check() on x86:
> > > > 
> > > > 1) COW,
> > > >    m.kflags: MCE_IN_KERNEL_RECOV
> > > >    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
> > > > 
> > > >    CPU: 11 PID: 2038 Comm: einj_mem_uc
> > > >    Call Trace:
> > > >     <#MC>
> > > >     dump_stack_lvl+0x37/0x50
> > > >     do_machine_check+0x7ad/0x840
> > > >     exc_machine_check+0x5a/0x90
> > > >     asm_exc_machine_check+0x1e/0x40
> > > >    RIP: 0010:copy_mc_fragile+0x35/0x62
> > > > 
> > > >    if (m.kflags & MCE_IN_KERNEL_RECOV) {
> > > >            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
> > > >                    mce_panic("Failed kernel mode recovery", &m, msg);
> > > >    }
> > > > 
> > > >    if (m.kflags & MCE_IN_KERNEL_COPYIN)
> > > >            queue_task_work(&m, msg, kill_me_never);
> > > > 
> > > > There is no memory_failure() called when
> > > > EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
> > > > so we manually add a memory_failure_queue() to handle with
> > > > the poisoned page.
> > > > 
> > > > 2) Coredump,  nothing print about m.kflags and fixup_type,
> 
> Sorry,I forget to set coredump file size :(
> 
> The coredump do trigger the do_machine_check() with same m.kflags and
> fixup_type like cow
> 
> 
> > > > with above check, add a memory_failure_queue() or memory_failure() seems
> > > > to be needed for memcpy_from_iter(), but it is totally different from
> > > > the COW scenario
> > > > 
> 
> so the memcpy_from_iter() from coredump is same as cow scenario.

Okay, thank you for confirmation.

> 
> > > > 
> > > > Another question, other copy_mc_to_kernel() callers, eg,
> > > > nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
> > > > should they need a memory_failure_queue(), if so, why not add it into
> > > > do_machine_check() ?
> > > 
> > 
> > What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
> > is designed to identify fixups which allow in kernel #MC recovery,
> > that is, the caller of copy_mc_to_kernel() must know the source
> > is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
> > the MCE_SAFE type.
> 
> And I think we need the following change for MCE_SAFE copy to set
> MCE_IN_KERNEL_COPYIN.
> 
> > 
> > diff --git a/arch/x86/kernel/cpu/mce/severity.c
> > b/arch/x86/kernel/cpu/mce/severity.c
> > index c4477162c07d..63e94484c5d6 100644
> > --- a/arch/x86/kernel/cpu/mce/severity.c
> > +++ b/arch/x86/kernel/cpu/mce/severity.c
> > @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
> > struct pt_regs *regs)
> >          case EX_TYPE_COPY:
> >                  if (!copy_user)
> >                          return IN_KERNEL;
> > -               m->kflags |= MCE_IN_KERNEL_COPYIN;

This change seems to not related to what you try to fix.
Could this break some other workloads like copying from user address?

> >                  fallthrough;
> > 
> >          case EX_TYPE_FAULT_MCE_SAFE:
> >          case EX_TYPE_DEFAULT_MCE_SAFE:
> > -               m->kflags |= MCE_IN_KERNEL_RECOV;
> > +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
> >                  return IN_KERNEL_RECOV;
> > 
> >          default:
> > 
> > then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
> > or every Machine Check safe memory copy will need a memory_failure_xx()
> > call.
> 
> which help use to kill unneeded memory_failure_queue() call, any comments?

I'm not 100% sure that we can safely use queue_task_work() instead of
memory_failure_queue() (due to the difference between workqueue and task
work, which should be recently discussed in thread [1]).  So I prefer to
keep the approach of memory_failure_queue() to keep the impact minimum.

[1] https://lore.kernel.org/lkml/20230417011407.58319-1-xueshuai@linux.alibaba.com/T/#u

Thanks,
Naoya Horiguchi

> 
> 
> > 
> > +Thomas,who add the two types, could you share some comments about
> > this,thanks.
> > 
> > > In the dax case, if the source address is poisoned, and we do follow
> > > up with memory_failure_queue(pfn, flags), what should the value of
> > > the 'flags' be ?
> > 
> 
> With above diff change, we don't add a memory_failure_queue() into dax too.
  
Kefeng Wang April 21, 2023, 5:43 a.m. UTC | #9
On 2023/4/21 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Apr 20, 2023 at 11:05:12PM +0800, Kefeng Wang wrote:
>>
>>
>> On 2023/4/20 10:59, Kefeng Wang wrote:
>>>
>>>
>>> On 2023/4/20 10:03, Jane Chu wrote:
>>>>
>>>> On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>>>>>
>>>>>
>>>>> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>>>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>>>>>
>>>>>>>
>>> ...
>>>>>>>>> @@ -371,6 +372,14 @@ size_t
>>>>>>>>> _copy_mc_to_iter(const void *addr, size_t bytes,
>>>>>>>>> struct iov_iter *i)
>>>>>>>>>     EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>>>>>     #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>>>>>> +static void *memcpy_from_iter(struct iov_iter
>>>>>>>>> *i, void *to, const void *from,
>>>>>>>>> +                 size_t size)
>>>>>>>>> +{
>>>>>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>>>>>
>>>>>>>> Is it helpful to call memory_failure_queue() if
>>>>>>>> copy_mc_to_kernel() fails
>>>>>>>> due to a memory error?
>>>>>>>
>>>>>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>>>>>> coredump will fail and task will exit, also memory_failure will
>>>>>>> be called by kill_me_maybe(),
>>>>>>>
>>>>>>>    CPU: 0 PID: 1418 Comm: test Tainted: G   M
>>>>>>> 6.3.0-rc5 #29
>>>>>>>    Call Trace:
>>>>>>>     <TASK>
>>>>>>>     dump_stack_lvl+0x37/0x50
>>>>>>>     memory_failure+0x51/0x970
>>>>>>>     kill_me_maybe+0x5b/0xc0
>>>>>>>     task_work_run+0x5a/0x90
>>>>>>>     exit_to_user_mode_prepare+0x194/0x1a0
>>>>>>>     irqentry_exit_to_user_mode+0x9/0x30
>>>>>>>     noist_exc_machine_check+0x40/0x80
>>>>>>>     asm_exc_machine_check+0x33/0x40
>>>>>>
>>>>>> Is this call trace printed out when copy_mc_to_kernel()
>>>>>> failed by finding
>>>>>> a memory error (or in some testcase using error injection)?
>>>>>
>>>>> I add dump_stack() into memory_failure() to check whether the poisoned
>>>>> memory is called or not, and the call trace shows it do call
>>>>> memory_failure(), but I get confused when do the test.
>>>>>
>>>>>> In my understanding, an MCE should not be triggered when
>>>>>> MC-safe copy tries
>>>>>> to access to a memory error.  So I feel that we might be talking about
>>>>>> different scenarios.
>>>>>>
>>>>>> When I questioned previously, I thought about the following scenario:
>>>>>>
>>>>>>     - a process terminates abnormally for any reason like
>>>>>> segmentation fault,
>>>>>>     - then, kernel tries to create a coredump,
>>>>>>     - during this, the copying routine accesses to corrupted
>>>>>> page to read.
>>>>>>
>>>>> Yes, we tested like your described,
>>>>>
>>>>> 1) inject memory error into a process
>>>>> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>>>>>
>>>>> Without patch, the system panic, and with patch only process exits.
>>>>>
>>>>>> In this case the corrupted page should not be handled by
>>>>>> memory_failure()
>>>>>> yet (because otherwise properly handled hwpoisoned page
>>>>>> should be ignored
>>>>>> by coredump process).  The coredump process would exit with
>>>>>> failure with
>>>>>> your patch, but then, the corrupted page is still left
>>>>>> unhandled and can
>>>>>> be reused, so any other thread can easily access to it again.
>>>>>
>>>>> As shown above, the corrupted page will be handled by
>>>>> memory_failure(), but what I'm wondering,
>>>>> 1) memory_failure() is not always called
>>>>> 2) look at the above call trace, it looks like from asynchronous
>>>>>      interrupt, not from synchronous exception, right?
>>>>>
>>>>>>
>>>>>> You can find a few other places (like __wp_page_copy_user
>>>>>> and ksm_might_need_to_copy)
>>>>>> to call memory_failure_queue() to cope with such unhandled error pages.
>>>>>> So does memcpy_from_iter() do the same?
>>>>>
>>>>> I add some debug print in do_machine_check() on x86:
>>>>>
>>>>> 1) COW,
>>>>>     m.kflags: MCE_IN_KERNEL_RECOV
>>>>>     fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>>>>>
>>>>>     CPU: 11 PID: 2038 Comm: einj_mem_uc
>>>>>     Call Trace:
>>>>>      <#MC>
>>>>>      dump_stack_lvl+0x37/0x50
>>>>>      do_machine_check+0x7ad/0x840
>>>>>      exc_machine_check+0x5a/0x90
>>>>>      asm_exc_machine_check+0x1e/0x40
>>>>>     RIP: 0010:copy_mc_fragile+0x35/0x62
>>>>>
>>>>>     if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>>>>             if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>>>>>                     mce_panic("Failed kernel mode recovery", &m, msg);
>>>>>     }
>>>>>
>>>>>     if (m.kflags & MCE_IN_KERNEL_COPYIN)
>>>>>             queue_task_work(&m, msg, kill_me_never);
>>>>>
>>>>> There is no memory_failure() called when
>>>>> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
>>>>> so we manually add a memory_failure_queue() to handle with
>>>>> the poisoned page.
>>>>>
>>>>> 2) Coredump,  nothing print about m.kflags and fixup_type,
>>
>> Sorry,I forget to set coredump file size :(
>>
>> The coredump do trigger the do_machine_check() with same m.kflags and
>> fixup_type like cow
>>
>>
>>>>> with above check, add a memory_failure_queue() or memory_failure() seems
>>>>> to be needed for memcpy_from_iter(), but it is totally different from
>>>>> the COW scenario
>>>>>
>>
>> so the memcpy_from_iter() from coredump is same as cow scenario.
> 
> Okay, thank you for confirmation.
> 
>>
>>>>>
>>>>> Another question, other copy_mc_to_kernel() callers, eg,
>>>>> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
>>>>> should they need a memory_failure_queue(), if so, why not add it into
>>>>> do_machine_check() ?
>>>>
>>>
>>> What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
>>> is designed to identify fixups which allow in kernel #MC recovery,
>>> that is, the caller of copy_mc_to_kernel() must know the source
>>> is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
>>> the MCE_SAFE type.
>>
>> And I think we need the following change for MCE_SAFE copy to set
>> MCE_IN_KERNEL_COPYIN.
>>
>>>
>>> diff --git a/arch/x86/kernel/cpu/mce/severity.c
>>> b/arch/x86/kernel/cpu/mce/severity.c
>>> index c4477162c07d..63e94484c5d6 100644
>>> --- a/arch/x86/kernel/cpu/mce/severity.c
>>> +++ b/arch/x86/kernel/cpu/mce/severity.c
>>> @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
>>> struct pt_regs *regs)
>>>           case EX_TYPE_COPY:
>>>                   if (!copy_user)
>>>                           return IN_KERNEL;
>>> -               m->kflags |= MCE_IN_KERNEL_COPYIN;
> 
> This change seems to not related to what you try to fix.
> Could this break some other workloads like copying from user address?
> 

Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
break it.


>>>                   fallthrough;
>>>
>>>           case EX_TYPE_FAULT_MCE_SAFE:
>>>           case EX_TYPE_DEFAULT_MCE_SAFE:
>>> -               m->kflags |= MCE_IN_KERNEL_RECOV;
>>> +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
>>>                   return IN_KERNEL_RECOV;
>>>
>>>           default:
>>>
>>> then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
>>> or every Machine Check safe memory copy will need a memory_failure_xx()
>>> call.
>>
>> which help use to kill unneeded memory_failure_queue() call, any comments?
> 
> I'm not 100% sure that we can safely use queue_task_work() instead of
> memory_failure_queue() (due to the difference between workqueue and task
> work, which should be recently discussed in thread [1]).  So I prefer to
> keep the approach of memory_failure_queue() to keep the impact minimum.
> 

+tony for x86 mce

The x86 call queue_task_work() for EX_TYPE_COPY, so EX_TYPE_FAULT_MCE_SAFE
and EX_TYPE_DEFAULT_MCE_SAFE should be similar to EX_TYPE_COPY,
memcpy_mc_xxx return bytes not copied, let the task to decide
what to do next, and call memory_failure(pfn, 0) to isolate
the poisoned page.

1) queue_task_work() will make the memory_failure() called before 
return-to-user
2) memory_failure_queue() called in COW will put the work on a specific
cpu(current task is running), and memory_failure() will be called in
the work. see more from commit d302c2398ba2 ("mm, hwpoison: when copy-
on-write hits poison, take page offline"),  "It is important, but not 
urgent, to mark the source page as h/w poisoned and unmap it from other
tasks."

Both of them just wants to isolate memory, they shouldn't add action,
they set flag=0 for memory_failure(). so preliminarily, there are not 
different.



> [1] https://lore.kernel.org/lkml/20230417011407.58319-1-xueshuai@linux.alibaba.com/T/#u
> 

The COPY_MC support on arm64 is still under review[1],  xueshuai's patch
is only trying to fix the uncorrected si_code of synchronous exceptions
when memory error occurred, so I think it is not involved the COPY_MC.


[1] 
https://lore.kernel.org/lkml/20221219120008.3818828-1-tongtiangen@huawei.com/


Thanks

> Thanks,
> Naoya Horiguchi
> 
>>
>>
>>>
>>> +Thomas,who add the two types, could you share some comments about
>>> this,thanks.
>>>
>>>> In the dax case, if the source address is poisoned, and we do follow
>>>> up with memory_failure_queue(pfn, flags), what should the value of
>>>> the 'flags' be ?
>>>
>>
>> With above diff change, we don't add a memory_failure_queue() into dax too.
  
HORIGUCHI NAOYA(堀口 直也) April 24, 2023, 6:44 a.m. UTC | #10
On Fri, Apr 21, 2023 at 01:43:39PM +0800, Kefeng Wang wrote:
...
> > > > > > 
> > > > > > Another question, other copy_mc_to_kernel() callers, eg,
> > > > > > nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
> > > > > > should they need a memory_failure_queue(), if so, why not add it into
> > > > > > do_machine_check() ?
> > > > > 
> > > > 
> > > > What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
> > > > is designed to identify fixups which allow in kernel #MC recovery,
> > > > that is, the caller of copy_mc_to_kernel() must know the source
> > > > is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
> > > > the MCE_SAFE type.
> > > 
> > > And I think we need the following change for MCE_SAFE copy to set
> > > MCE_IN_KERNEL_COPYIN.
> > > 
> > > > 
> > > > diff --git a/arch/x86/kernel/cpu/mce/severity.c
> > > > b/arch/x86/kernel/cpu/mce/severity.c
> > > > index c4477162c07d..63e94484c5d6 100644
> > > > --- a/arch/x86/kernel/cpu/mce/severity.c
> > > > +++ b/arch/x86/kernel/cpu/mce/severity.c
> > > > @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
> > > > struct pt_regs *regs)
> > > >           case EX_TYPE_COPY:
> > > >                   if (!copy_user)
> > > >                           return IN_KERNEL;
> > > > -               m->kflags |= MCE_IN_KERNEL_COPYIN;
> > 
> > This change seems to not related to what you try to fix.
> > Could this break some other workloads like copying from user address?
> > 
> 
> Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
> MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
> break it.
> 
> 
> > > >                   fallthrough;

Sorry, I overlooked this fallthrough. So this change is fine to me.

> > > > 
> > > >           case EX_TYPE_FAULT_MCE_SAFE:
> > > >           case EX_TYPE_DEFAULT_MCE_SAFE:
> > > > -               m->kflags |= MCE_IN_KERNEL_RECOV;
> > > > +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
> > > >                   return IN_KERNEL_RECOV;
> > > > 
> > > >           default:
> > > > 
> > > > then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
> > > > or every Machine Check safe memory copy will need a memory_failure_xx()
> > > > call.
> > > 
> > > which help use to kill unneeded memory_failure_queue() call, any comments?
> > 
> > I'm not 100% sure that we can safely use queue_task_work() instead of
> > memory_failure_queue() (due to the difference between workqueue and task
> > work, which should be recently discussed in thread [1]).  So I prefer to
> > keep the approach of memory_failure_queue() to keep the impact minimum.
> > 
> 
> +tony for x86 mce
> 
> The x86 call queue_task_work() for EX_TYPE_COPY, so EX_TYPE_FAULT_MCE_SAFE
> and EX_TYPE_DEFAULT_MCE_SAFE should be similar to EX_TYPE_COPY,
> memcpy_mc_xxx return bytes not copied, let the task to decide
> what to do next, and call memory_failure(pfn, 0) to isolate
> the poisoned page.
> 
> 1) queue_task_work() will make the memory_failure() called before
> return-to-user
> 2) memory_failure_queue() called in COW will put the work on a specific
> cpu(current task is running), and memory_failure() will be called in
> the work. see more from commit d302c2398ba2 ("mm, hwpoison: when copy-
> on-write hits poison, take page offline"),  "It is important, but not
> urgent, to mark the source page as h/w poisoned and unmap it from other
> tasks."
> 
> Both of them just wants to isolate memory, they shouldn't add action,
> they set flag=0 for memory_failure(). so preliminarily, there are not
> different.

Thanks, sounds good to me.

- Naoya Horiguchi

> 
> 
> 
> > [1] https://lore.kernel.org/lkml/20230417011407.58319-1-xueshuai@linux.alibaba.com/T/#u
> > 
> 
> The COPY_MC support on arm64 is still under review[1],  xueshuai's patch
> is only trying to fix the uncorrected si_code of synchronous exceptions
> when memory error occurred, so I think it is not involved the COPY_MC.
  
Luck, Tony April 24, 2023, 4:17 p.m. UTC | #11
> > This change seems to not related to what you try to fix.
> > Could this break some other workloads like copying from user address?
> > 
> 
> Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
> MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
> break it.

Should Linux even try to take a core dump for a SIGBUS generated because
the application accessed a poisoned page?

It doesn't seem like it would be useful. Core dumps are for debugging s/w
program errors in applications and libraries. That isn't the case when there
is a poison consumption. The application did nothing wrong.

This patch is still useful though. There may be an undiscovered poison
page in the application. Avoiding a kernel crash when dumping core
is still a good thing.

-Tony
  
Kefeng Wang April 25, 2023, 1:47 a.m. UTC | #12
On 2023/4/25 0:17, Luck, Tony wrote:
>>> This change seems to not related to what you try to fix.
>>> Could this break some other workloads like copying from user address?
>>>
>>
>> Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
>> MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
>> break it.
> 
> Should Linux even try to take a core dump for a SIGBUS generated because
> the application accessed a poisoned page?
> 
> It doesn't seem like it would be useful. Core dumps are for debugging s/w
> program errors in applications and libraries. That isn't the case when there
> is a poison consumption. The application did nothing wrong.
> 
> This patch is still useful though. There may be an undiscovered poison
> page in the application. Avoiding a kernel crash when dumping core
> is still a good thing.

Thanks for your confirm, and what your option about add
MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
which kill every call memory_failure_queue() after mc safe copy return?

> 
> -Tony
  
Luck, Tony April 25, 2023, 5:16 p.m. UTC | #13
> Thanks for your confirm, and what your option about add
> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
> which kill every call memory_failure_queue() after mc safe copy return?

I haven't been following this thread closely. Can you give a link to the e-mail
where you posted a patch that does this? Or just repost that patch if easier.

-Tony
  
Kefeng Wang April 26, 2023, 1:23 a.m. UTC | #14
On 2023/4/26 1:16, Luck, Tony wrote:
>> Thanks for your confirm, and what your option about add
>> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
>> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
>> which kill every call memory_failure_queue() after mc safe copy return?
> 
> I haven't been following this thread closely. Can you give a link to the e-mail
> where you posted a patch that does this? Or just repost that patch if easier.

The major diff changes is [1], I will post a formal patch when -rc1 out, 
thanks.

[1] 
https://lore.kernel.org/linux-mm/6dc1b117-020e-be9e-7e5e-a349ffb7d00a@huawei.com/
> 
> -Tony
  
Luck, Tony April 26, 2023, 3:45 p.m. UTC | #15
> >> Thanks for your confirm, and what your option about add
> >> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
> >> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
> >> which kill every call memory_failure_queue() after mc safe copy return?
> >
> > I haven't been following this thread closely. Can you give a link to the e-mail
> > where you posted a patch that does this? Or just repost that patch if easier.
>
> The major diff changes is [1], I will post a formal patch when -rc1 out,
> thanks.
>
> [1]
> https://lore.kernel.org/linux-mm/6dc1b117-020e-be9e-7e5e-a349ffb7d00a@huawei.com/

There seem to be a few misconceptions in that message. Not sure if all of them
were resolved.  Here are some pertinent points:

>>> In my understanding, an MCE should not be triggered when MC-safe copy 
>>> tries
>>> to access to a memory error.  So I feel that we might be talking about
>>> different scenarios.

This is wrong. There is still a machine check when a MC-safe copy does a read
from a location that has a memory error.

The recovery flow in this case does not involve queue_task_work(). That is only
useful for machine check exceptions taken in user context. The queued work will
be executed to call memory_failure() from the kernel, but in process context (not
from the machine check exception stack) to handle the error.

For machine checks taken by kernel code (MC-safe copy functions) the recovery
path is here:

                if (m.kflags & MCE_IN_KERNEL_RECOV) {
                        if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
                                mce_panic("Failed kernel mode recovery", &m, msg);
                }

                if (m.kflags & MCE_IN_KERNEL_COPYIN)
                        queue_task_work(&m, msg, kill_me_never);

The "fixup_exception()" ensures that on return from the machine check handler
code returns to the extable[] fixup location instead of the instruction that was
loading from the memory error location.

When the exception was from one of the copy_from_user() variants it makes
sense to also do the queue_task_work() because the kernel is going to return
to the user context (with an EFAULT error code from whatever system call was
attempting the copy_from_user()).

But in the core dump case there is no return to user. The process is being
terminated by the signal that leads to this core dump. So even though you
may consider the page being accessed to be a "user" page, you can't fix
it by queueing work to run on return to user.

I don't have an well thought out suggestion on how to make sure that memory_failure()
is called for the page in this case. Maybe the core dump code can check for the
return from the MC-safe copy it is using and handle it in the error path?

-Tony
  
Kefeng Wang April 27, 2023, 1:06 a.m. UTC | #16
On 2023/4/26 23:45, Luck, Tony wrote:
>>>> Thanks for your confirm, and what your option about add
>>>> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
>>>> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
>>>> which kill every call memory_failure_queue() after mc safe copy return?
>>>
>>> I haven't been following this thread closely. Can you give a link to the e-mail
>>> where you posted a patch that does this? Or just repost that patch if easier.
>>
>> The major diff changes is [1], I will post a formal patch when -rc1 out,
>> thanks.
>>
>> [1]
>> https://lore.kernel.org/linux-mm/6dc1b117-020e-be9e-7e5e-a349ffb7d00a@huawei.com/
> 
> There seem to be a few misconceptions in that message. Not sure if all of them
> were resolved.  Here are some pertinent points:
> 
>>>> In my understanding, an MCE should not be triggered when MC-safe copy
>>>> tries
>>>> to access to a memory error.  So I feel that we might be talking about
>>>> different scenarios.
> 
> This is wrong. There is still a machine check when a MC-safe copy does a read
> from a location that has a memory error.
> 
> The recovery flow in this case does not involve queue_task_work(). That is only
> useful for machine check exceptions taken in user context. The queued work will
> be executed to call memory_failure() from the kernel, but in process context (not
> from the machine check exception stack) to handle the error.
> 
> For machine checks taken by kernel code (MC-safe copy functions) the recovery
> path is here:
> 
>                  if (m.kflags & MCE_IN_KERNEL_RECOV) {
>                          if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>                                  mce_panic("Failed kernel mode recovery", &m, msg);
>                  }
> 
>                  if (m.kflags & MCE_IN_KERNEL_COPYIN)
>                          queue_task_work(&m, msg, kill_me_never);
> 
> The "fixup_exception()" ensures that on return from the machine check handler
> code returns to the extable[] fixup location instead of the instruction that was
> loading from the memory error location.
> 
> When the exception was from one of the copy_from_user() variants it makes
> sense to also do the queue_task_work() because the kernel is going to return
> to the user context (with an EFAULT error code from whatever system call was
> attempting the copy_from_user()).
> 
> But in the core dump case there is no return to user. The process is being
> terminated by the signal that leads to this core dump. So even though you
> may consider the page being accessed to be a "user" page, you can't fix
> it by queueing work to run on return to user.

For coredump,the task work will be called too, see following code,

get_signal
	sig_kernel_coredump
		elf_core_dump
			dump_user_range
				_copy_from_iter // with MC-safe copy, return without panic
	do_group_exit(ksig->info.si_signo);
		do_exit
			exit_task_work
				task_work_run
					kill_me_never
						memory_failure

I also add debug print to check the memory_failure() processing after
add MCE_IN_KERNEL_COPYIN to MCE_SAFE exception type, also tested CoW of
normal page and huge page, it works too.

> 
> I don't have an well thought out suggestion on how to make sure that memory_failure()
> is called for the page in this case. Maybe the core dump code can check for the
> return from the MC-safe copy it is using and handle it in the error path?
> 

So we could safely add MCE_IN_KERNEL_COPYIN to all MCE_SAFE exception type?

The kernel diff based on next-20230425 shown bellow

diff --git a/arch/x86/kernel/cpu/mce/severity.c 
b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..63e94484c5d6 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m, 
struct pt_regs *regs)
  	case EX_TYPE_COPY:
  		if (!copy_user)
  			return IN_KERNEL;
-		m->kflags |= MCE_IN_KERNEL_COPYIN;
  		fallthrough;

  	case EX_TYPE_FAULT_MCE_SAFE:
  	case EX_TYPE_DEFAULT_MCE_SAFE:
-		m->kflags |= MCE_IN_KERNEL_RECOV;
+		m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
  		return IN_KERNEL_RECOV;

  	default:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4509a566fe6c..59a7afe3dfce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3667,23 +3667,19 @@ enum mf_flags {
  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
  		      unsigned long count, int mf_flags);
  extern int memory_failure(unsigned long pfn, int flags);
+extern void memory_failure_queue(unsigned long pfn, int flags);
  extern void memory_failure_queue_kick(int cpu);
  extern int unpoison_memory(unsigned long pfn);
  extern void shake_page(struct page *p);
  extern atomic_long_t num_poisoned_pages __read_mostly;
  extern int soft_offline_page(unsigned long pfn, int flags);
  #ifdef CONFIG_MEMORY_FAILURE
-extern void memory_failure_queue(unsigned long pfn, int flags);
  extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
  					bool *migratable_cleared);
  void num_poisoned_pages_inc(unsigned long pfn);
  void num_poisoned_pages_sub(unsigned long pfn, long i);
  struct task_struct *task_early_kill(struct task_struct *tsk, int 
force_early);
  #else
-static inline void memory_failure_queue(unsigned long pfn, int flags)
-{
-}
-
  static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int 
flags,
  					bool *migratable_cleared)
  {
diff --git a/mm/ksm.c b/mm/ksm.c
index 0156bded3a66..7abdf4892387 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2794,7 +2794,6 @@ struct page *ksm_might_need_to_copy(struct page *page,
  	if (new_page) {
  		if (copy_mc_user_highpage(new_page, page, address, vma)) {
  			put_page(new_page);
-			memory_failure_queue(page_to_pfn(page), 0);
  			return ERR_PTR(-EHWPOISON);
  		}
  		SetPageDirty(new_page);
diff --git a/mm/memory.c b/mm/memory.c
index 5e2c6b1fc00e..c0f586257017 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2814,10 +2814,8 @@ static inline int __wp_page_copy_user(struct page 
*dst, struct page *src,
  	unsigned long addr = vmf->address;

  	if (likely(src)) {
-		if (copy_mc_user_highpage(dst, src, addr, vma)) {
-			memory_failure_queue(page_to_pfn(src), 0);
+		if (copy_mc_user_highpage(dst, src, addr, vma))
  			return -EHWPOISON;
-		}
  		return 0;
  	}

@@ -5852,10 +5850,8 @@ static int copy_user_gigantic_page(struct folio 
*dst, struct folio *src,

  		cond_resched();
  		if (copy_mc_user_highpage(dst_page, src_page,
-					  addr + i*PAGE_SIZE, vma)) {
-			memory_failure_queue(page_to_pfn(src_page), 0);
+					  addr + i*PAGE_SIZE, vma))
  			return -EHWPOISON;
-		}
  	}
  	return 0;
  }
@@ -5871,10 +5867,8 @@ static int copy_subpage(unsigned long addr, int 
idx, void *arg)
  	struct copy_subpage_arg *copy_arg = arg;

  	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-				  addr, copy_arg->vma)) {
-		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+				  addr, copy_arg->vma))
  		return -EHWPOISON;
-	}
  	return 0;
  }
  
HORIGUCHI NAOYA(堀口 直也) April 27, 2023, 2:31 a.m. UTC | #17
On Thu, Apr 27, 2023 at 09:06:46AM +0800, Kefeng Wang wrote:
> 
> 
> On 2023/4/26 23:45, Luck, Tony wrote:
> > > > > Thanks for your confirm, and what your option about add
> > > > > MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
> > > > > to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
> > > > > which kill every call memory_failure_queue() after mc safe copy return?
> > > > 
> > > > I haven't been following this thread closely. Can you give a link to the e-mail
> > > > where you posted a patch that does this? Or just repost that patch if easier.
> > > 
> > > The major diff changes is [1], I will post a formal patch when -rc1 out,
> > > thanks.
> > > 
> > > [1]
> > > https://lore.kernel.org/linux-mm/6dc1b117-020e-be9e-7e5e-a349ffb7d00a@huawei.com/
> > 
> > There seem to be a few misconceptions in that message. Not sure if all of them
> > were resolved.  Here are some pertinent points:
> > 
> > > > > In my understanding, an MCE should not be triggered when MC-safe copy
> > > > > tries
> > > > > to access to a memory error.  So I feel that we might be talking about
> > > > > different scenarios.
> > 
> > This is wrong. There is still a machine check when a MC-safe copy does a read
> > from a location that has a memory error.

Yes, the above was my first impression to be proven wrong ;)

> > 
> > The recovery flow in this case does not involve queue_task_work(). That is only
> > useful for machine check exceptions taken in user context. The queued work will
> > be executed to call memory_failure() from the kernel, but in process context (not
> > from the machine check exception stack) to handle the error.
> > 
> > For machine checks taken by kernel code (MC-safe copy functions) the recovery
> > path is here:
> > 
> >                  if (m.kflags & MCE_IN_KERNEL_RECOV) {
> >                          if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
> >                                  mce_panic("Failed kernel mode recovery", &m, msg);
> >                  }
> > 
> >                  if (m.kflags & MCE_IN_KERNEL_COPYIN)
> >                          queue_task_work(&m, msg, kill_me_never);
> > 
> > The "fixup_exception()" ensures that on return from the machine check handler
> > code returns to the extable[] fixup location instead of the instruction that was
> > loading from the memory error location.
> > 
> > When the exception was from one of the copy_from_user() variants it makes
> > sense to also do the queue_task_work() because the kernel is going to return
> > to the user context (with an EFAULT error code from whatever system call was
> > attempting the copy_from_user()).
> > 
> > But in the core dump case there is no return to user. The process is being
> > terminated by the signal that leads to this core dump. So even though you
> > may consider the page being accessed to be a "user" page, you can't fix
> > it by queueing work to run on return to user.
> 
> For coredump,the task work will be called too, see following code,
> 
> get_signal
> 	sig_kernel_coredump
> 		elf_core_dump
> 			dump_user_range
> 				_copy_from_iter // with MC-safe copy, return without panic
> 	do_group_exit(ksig->info.si_signo);
> 		do_exit
> 			exit_task_work
> 				task_work_run
> 					kill_me_never
> 						memory_failure
> 
> I also add debug print to check the memory_failure() processing after
> add MCE_IN_KERNEL_COPYIN to MCE_SAFE exception type, also tested CoW of
> normal page and huge page, it works too.

Sounds nice to me.
Maybe this information is worth documenting in the patch description.

Thanks,
Naoya Horiguchi
  
Luck, Tony April 27, 2023, 4:45 p.m. UTC | #18
> > But in the core dump case there is no return to user. The process is being
> > terminated by the signal that leads to this core dump. So even though you
> > may consider the page being accessed to be a "user" page, you can't fix
> > it by queueing work to run on return to user.
> 
> For coredump,the task work will be called too, see following code,
> 
> get_signal
> 	sig_kernel_coredump
> 		elf_core_dump
> 			dump_user_range
> 				_copy_from_iter // with MC-safe copy, return without panic
> 	do_group_exit(ksig->info.si_signo);
> 		do_exit
> 			exit_task_work
> 				task_work_run
> 					kill_me_never
> 						memory_failure
> 

Nice. I didn't realize that the exit code path would clear any pending task_work() requests.
But it makes sense that this happens. Thanks for filling a gap in my knowledge.

-Tony
  
Kefeng Wang April 28, 2023, 8:56 a.m. UTC | #19
On 2023/4/27 10:31, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Apr 27, 2023 at 09:06:46AM +0800, Kefeng Wang wrote:
>>
>>
...
>> For coredump,the task work will be called too, see following code,
>>
>> get_signal
>> 	sig_kernel_coredump
>> 		elf_core_dump
>> 			dump_user_range
>> 				_copy_from_iter // with MC-safe copy, return without panic
>> 	do_group_exit(ksig->info.si_signo);
>> 		do_exit
>> 			exit_task_work
>> 				task_work_run
>> 					kill_me_never
>> 						memory_failure
>>
>> I also add debug print to check the memory_failure() processing after
>> add MCE_IN_KERNEL_COPYIN to MCE_SAFE exception type, also tested CoW of
>> normal page and huge page, it works too.
> 
> Sounds nice to me.
> Maybe this information is worth documenting in the patch description.
> 

Sure, will make a formal patch and send out, thanks.

> Thanks,
> Naoya Horiguchi
  
Kefeng Wang April 28, 2023, 8:59 a.m. UTC | #20
On 2023/4/28 0:45, Luck, Tony wrote:
>>> But in the core dump case there is no return to user. The process is being
>>> terminated by the signal that leads to this core dump. So even though you
>>> may consider the page being accessed to be a "user" page, you can't fix
>>> it by queueing work to run on return to user.
>>
>> For coredump,the task work will be called too, see following code,
>>
>> get_signal
>> 	sig_kernel_coredump
>> 		elf_core_dump
>> 			dump_user_range
>> 				_copy_from_iter // with MC-safe copy, return without panic
>> 	do_group_exit(ksig->info.si_signo);
>> 		do_exit
>> 			exit_task_work
>> 				task_work_run
>> 					kill_me_never
>> 						memory_failure
>>
> 
> Nice. I didn't realize that the exit code path would clear any pending task_work() requests.
> But it makes sense that this happens. Thanks for filling a gap in my knowledge.
> 
Yep, we could be benefit from it to unify memory failure handling :)

> -Tony
  

Patch

diff --git a/fs/coredump.c b/fs/coredump.c
index 5df1e6e1eb2b..ece7badf701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -882,6 +882,7 @@  static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	pos = file->f_pos;
 	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
 	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+	iov_iter_set_copy_mc(&iter);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
 		return 0;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index c459e1d5772b..aa3a4c6ba585 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -40,6 +40,7 @@  struct iov_iter_state {
 
 struct iov_iter {
 	u8 iter_type;
+	bool copy_mc;
 	bool nofault;
 	bool data_source;
 	bool user_backed;
@@ -241,8 +242,22 @@  size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
 
 #ifdef CONFIG_ARCH_HAS_COPY_MC
 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
+static inline void iov_iter_set_copy_mc(struct iov_iter *i)
+{
+	i->copy_mc = true;
+}
+
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+	return i->copy_mc;
+}
 #else
 #define _copy_mc_to_iter _copy_to_iter
+static inline void iov_iter_set_copy_mc(struct iov_iter *i) { }
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+	return false;
+}
 #endif
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
@@ -357,6 +372,7 @@  static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_UBUF,
+		.copy_mc = false,
 		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 08587feb94cc..7b9d8419fee7 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -288,6 +288,7 @@  void iov_iter_init(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_IOVEC,
+		.copy_mc = false,
 		.nofault = false,
 		.user_backed = true,
 		.data_source = direction,
@@ -371,6 +372,14 @@  size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
 #endif /* CONFIG_ARCH_HAS_COPY_MC */
 
+static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
+				 size_t size)
+{
+	if (iov_iter_is_copy_mc(i))
+		return (void *)copy_mc_to_kernel(to, from, size);
+	return memcpy(to, from, size);
+}
+
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (WARN_ON_ONCE(!i->data_source))
@@ -380,7 +389,7 @@  size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyin(addr + off, base, len),
-		memcpy(addr + off, base, len)
+		memcpy_from_iter(i, addr + off, base, len)
 	)
 
 	return bytes;
@@ -571,7 +580,7 @@  size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
 	}
 	iterate_and_advance(i, bytes, base, len, off,
 		copyin(p + off, base, len),
-		memcpy(p + off, base, len)
+		memcpy_from_iter(i, p + off, base, len)
 	)
 	kunmap_atomic(kaddr);
 	return bytes;
@@ -704,6 +713,7 @@  void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter){
 		.iter_type = ITER_KVEC,
+		.copy_mc = false,
 		.data_source = direction,
 		.kvec = kvec,
 		.nr_segs = nr_segs,
@@ -720,6 +730,7 @@  void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter){
 		.iter_type = ITER_BVEC,
+		.copy_mc = false,
 		.data_source = direction,
 		.bvec = bvec,
 		.nr_segs = nr_segs,
@@ -748,6 +759,7 @@  void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
 	BUG_ON(direction & ~1);
 	*i = (struct iov_iter) {
 		.iter_type = ITER_XARRAY,
+		.copy_mc = false,
 		.data_source = direction,
 		.xarray = xarray,
 		.xarray_start = start,
@@ -771,6 +783,7 @@  void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 	BUG_ON(direction != READ);
 	*i = (struct iov_iter){
 		.iter_type = ITER_DISCARD,
+		.copy_mc = false,
 		.data_source = false,
 		.count = count,
 		.iov_offset = 0