[RFC,v12,2/6] x86: mm: Skip faulting instruction for VM_DROPPABLE faults
Commit Message
The prior commit introduced VM_DROPPABLE, but in a limited form where
the faulting instruction was retried instead of skipped. Finish that up
with the platform-specific aspect of skipping the actual instruction.
This works by copying the instruction bytes at userspace's %rip into a
stack buffer of size MAX_INSN_SIZE, decoding them, and then adding the
length of the decoded instruction to userspace's %rip. In the event any
of these steps fails, just fall back to not advancing %rip and trying again.
Cc: linux-mm@kvack.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/mm/fault.c | 19 +++++++++++++++++++
include/linux/mm_types.h | 5 ++++-
mm/memory.c | 4 +++-
3 files changed, 26 insertions(+), 2 deletions(-)
Comments
On Mon, Dec 12, 2022 at 11:53:43AM -0700, Jason A. Donenfeld wrote:
> + if (fault & VM_FAULT_SKIP_INSN) {
> + u8 insn_buf[MAX_INSN_SIZE];
> + struct insn insn;
> + size_t len;
> +
> + len = sizeof(insn_buf) - copy_from_user(insn_buf, (void *)regs->ip, sizeof(insn_buf));
> + if (!len)
> + return;
> +
> + if (insn_decode(&insn, insn_buf, len, in_32bit_syscall() ? INSN_MODE_32 : INSN_MODE_64) < 0)
> + return;
> +
> + regs->ip += insn.length;
> + return;
> + }
I just found umip.c, which does basically the same thing, but does it
correctly. For v+1, the above snippet will instead do this:
if (fault & VM_FAULT_SKIP_INSN) {
u8 buf[MAX_INSN_SIZE];
struct insn insn;
int nr_copied;
nr_copied = insn_fetch_from_user(regs, buf);
if (nr_copied <= 0)
return;
if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
return;
regs->ip += insn.length;
return;
}
Same thing, but those helpers correctly inspect the environment
and registers.
Also, it is heartening to see this already being done in umip.c; it
suggests the approach here isn't overly insane.
Jason
@@ -33,6 +33,8 @@
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
#include <asm/irq_stack.h>
+#include <asm/insn.h> /* insn_decode() */
+#include <asm/compat.h> /* in_32bit_syscall() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -1454,6 +1456,23 @@ void do_user_addr_fault(struct pt_regs *regs,
}
mmap_read_unlock(mm);
+
+ if (fault & VM_FAULT_SKIP_INSN) {
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
+ size_t len;
+
+ len = sizeof(insn_buf) - copy_from_user(insn_buf, (void *)regs->ip, sizeof(insn_buf));
+ if (!len)
+ return;
+
+ if (insn_decode(&insn, insn_buf, len, in_32bit_syscall() ? INSN_MODE_32 : INSN_MODE_64) < 0)
+ return;
+
+ regs->ip += insn.length;
+ return;
+ }
+
if (likely(!(fault & VM_FAULT_ERROR)))
return;
@@ -861,6 +861,7 @@ typedef __bitwise unsigned int vm_fault_t;
* fsync() to complete (for synchronous page faults
* in DAX)
* @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
+ * @VM_FAULT_SKIP_INSN: handle the fault by skipping the faulting instruction
* @VM_FAULT_HINDEX_MASK: mask HINDEX value
*
*/
@@ -879,6 +880,7 @@ enum vm_fault_reason {
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
+ VM_FAULT_SKIP_INSN = (__force vm_fault_t)0x008000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
@@ -903,7 +905,8 @@ enum vm_fault_reason {
{ VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
+ { VM_FAULT_SKIP_INSN, "SKIP_INSN" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
@@ -5220,8 +5220,10 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
lru_gen_exit_fault();
/* If the mapping is droppable, then errors due to OOM aren't fatal. */
- if (vma->vm_flags & VM_DROPPABLE)
+ if ((ret & VM_FAULT_OOM) && (vma->vm_flags & VM_DROPPABLE)) {
ret &= ~VM_FAULT_OOM;
+ ret |= VM_FAULT_SKIP_INSN;
+ }
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();