On Wed, Mar 22, 2023 at 02:48:26PM +0000, Catalin Marinas wrote:
> On Tue, Mar 21, 2023 at 12:25:13PM +0000, Mark Rutland wrote:
> > For some combinations of sizes and alignments __{arch,raw}_copy_to_user
> > will copy some bytes between (to + size - N) and (to + size), but will
> > never modify bytes past (to + size).
> >
> > This violates the documentation in <linux/uaccess.h>, which states:
> >
> > > If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes
> > > starting at to must become equal to the bytes fetched from the
> > > corresponding area starting at from. All data past to + size - N must
> > > be left unmodified.
> >
> > This can be demonstrated through testing, e.g.
> >
> > | # test_copy_to_user: EXPECTATION FAILED at lib/usercopy_kunit.c:287
> > | post-destination bytes modified (dst_page[4082]=0x1, offset=4081, size=16, ret=15)
> > | [FAILED] 16 byte copy
> >
> > This happens because the __arch_copy_to_user() can make unaligned stores
> > to the userspace buffer, and the ARM architecture permits (but does not
> > require) that such unaligned stores write some bytes before raising a
> > fault (per ARM DDI 0487I.a Section B2.2.1 and Section B2.7.1). The
> > extable fixup handlers in __arch_copy_to_user() assume that any faulting
> > store has failed entirely, and so under-report the number of bytes
> > copied when an unaligned store writes some bytes before faulting.
>
> I find the Arm ARM hard to parse (no surprise here). Do you happen to
> know what the behavior is for the new CPY instructions? I'd very much
> like to use those for uaccess as well eventually but if they have the
> same imp def behaviour, I'd rather relax the documentation and continue
> to live with the current behaviour.
My understanding is that those have to be broken up into a set of smaller
accesses, and so the same applies, with the architectural window for clobbering
being the *entire* range from 'to' to 'to + size'
That said, the description of the forward-only instructions (CPYF*) suggests
those might have to be exact.
In ARM DDI 0487I.a, section C6.2.80 "CPYFPWT, CPYFMWT, CPYFEWT", it says:
> The memory copy performed by these instructions is in the forward direction
> only, so the instructions are suitable for a memory copy only where there is
> no overlap between the source and destination locations, or where the source
> address is greater than the destination address.
That *might* mean in practice that CPYF* instructions can't clobber later bytes
in dst in case they'll later be consumed as src bytes, but it doesn't strictly
say that (and maybe the core would figure out how much potential overlap there
is and tune the copy chunk size).
We could chase our architects on that.
> > The only architecturally guaranteed way to avoid this is to only use
> > aligned stores to write to user memory. This patch rewrites
> > __arch_copy_to_user() to only access the user buffer with aligned
> > stores, such that the bytes written can always be determined reliably.
>
> Can we not fall back to byte-at-a-time? There's still a potential race
> if the page becomes read-only for example. Well, probably not worth it
> if we decide to go this route.
I was trying to avoid that shape of race.
I also believe that if we have a misaligned store straddling two pages, and the
first page is faulting, it the store can do a partial write to the 2nd page,
which I suspected is not what we want (though maybe that's beningn, if we're
going to say that clobbering anywhere within the dst buffer is fine).
> Where we may notice some small performance degradation is copy_to_user()
> where the reads from the source end up unaligned due to the destination
> buffer alignment. I doubt that's a common case though and most CPUs can
> probably cope just well with this.
FWIW, I instrumented a kernel build workload, and the significant majority of
the time (IIRC by ~100x), both src and dst happened to be aligned to at least 8
bytes, so even if that were slow its the rare case. I guess that's because any
structures we copy are likely to contain a long or a pointer, and a bunch of
other copies are going to be bulk page copies into buffers which the allocator
has aligned to at least 8 bytes.
Thanks,
Mark.
@@ -9,6 +9,14 @@
#include <asm/assembler.h>
#include <asm/cache.h>
+#define USER_OFF(off, x...) USER(fixup_offset_##off, x)
+
+#define FIXUP_OFFSET(n) \
+fixup_offset_##n: \
+ sub x0, x3, x0; \
+ sub x0, x0, n; \
+ ret
+
/*
* Copy to user space from a kernel buffer (alignment handled by the hardware)
*
@@ -18,56 +26,168 @@
* x2 - n
* Returns:
* x0 - bytes not copied
+ *
+ * Unlike a memcpy(), we need to handle faults on user addresses, and we need
+ * to precisely report the number of bytes (not) copied. We must only use
+ * aligned single-copy-atomic stores to write to user memory, as stores which
+ * are not single-copy-atomic (e.g. unaligned stores, STP, ASMID stores) can be
+ * split into separate byte accesses (per ARM DDI 0487I.a, Section B2.2.1) and
+ * some arbitatrary set of those byte accesses might occur prior to a fault
+ * being raised (per per ARM DDI 0487I.a, Section B2.7.1).
+ *
+ * We use STTR to write to user memory, which has 1/2/4/8 byte forms, and the
+ * user address ('to') might have arbitrary alignment, so we must handle
+ * misalignment up to 8 bytes.
*/
- .macro ldrb1 reg, ptr, val
- ldrb \reg, [\ptr], \val
- .endm
+SYM_FUNC_START(__arch_copy_to_user)
+ /*
+ * The end address. Fixup handlers will use this to calculate
+ * the number of bytes copied.
+ */
+ add x3, x0, x2
- .macro strb1 reg, ptr, val
- user_ldst 9998f, sttrb, \reg, \ptr, \val
- .endm
+ /*
+ * Tracing of a kernel build indicates that for the vast
+ * majority of calls to copy_to_user(), 'to' is aligned to 8
+ * bytes. When this is the case, we want to skip to the bulk
+ * copy as soon as possible.
+ */
+ ands x4, x0, 0x7
+ b.eq body
- .macro ldrh1 reg, ptr, val
- ldrh \reg, [\ptr], \val
- .endm
+ /*
+ * For small unaligned copies, it's not worth trying to be
+ * optimal.
+ */
+ cmp x2, #8
+ b.lo bytewise_loop
- .macro strh1 reg, ptr, val
- user_ldst 9997f, sttrh, \reg, \ptr, \val
- .endm
+ /*
+ * Calculate the distance to the next 8-byte boundary.
+ */
+ mov x5, #8
+ sub x4, x5, x4
- .macro ldr1 reg, ptr, val
- ldr \reg, [\ptr], \val
- .endm
+SYM_INNER_LABEL(head_realign_1b, SYM_L_LOCAL)
+ tbz x4, #0, head_realign_2b
- .macro str1 reg, ptr, val
- user_ldst 9997f, sttr, \reg, \ptr, \val
- .endm
+ ldrb w8, [x1], #1
+USER_OFF(0, sttrb w8, [x0])
+ add x0, x0, #1
- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
- .endm
+SYM_INNER_LABEL(head_realign_2b, SYM_L_LOCAL)
+ tbz x4, #1, head_realign_4b
- .macro stp1 reg1, reg2, ptr, val
- user_stp 9997f, \reg1, \reg2, \ptr, \val
- .endm
+ ldrh w8, [x1], #2
+USER_OFF(0, sttrh w8, [x0])
+ add x0, x0, #2
-end .req x5
-srcin .req x15
-SYM_FUNC_START(__arch_copy_to_user)
- add end, x0, x2
- mov srcin, x1
-#include "copy_template.S"
- mov x0, #0
- ret
+SYM_INNER_LABEL(head_realign_4b, SYM_L_LOCAL)
+ tbz x4, #2, head_realigned
- // Exception fixups
-9997: cmp dst, dstin
- b.ne 9998f
- // Before being absolutely sure we couldn't copy anything, try harder
- ldrb tmp1w, [srcin]
-USER(9998f, sttrb tmp1w, [dst])
- add dst, dst, #1
-9998: sub x0, end, dst // bytes not copied
- ret
+ ldr w8, [x1], #4
+USER_OFF(0, sttr w8, [x0])
+ add x0, x0, #4
+
+SYM_INNER_LABEL(head_realigned, SYM_L_LOCAL)
+ /*
+ * Any 1-7 byte misalignment has now been fixed; subtract this
+ * misalignment from the remaining size.
+ */
+ sub x2, x2, x4
+
+SYM_INNER_LABEL(body, SYM_L_LOCAL)
+ cmp x2, #64
+ b.lo tail_32b
+
+SYM_INNER_LABEL(body_64b_loop, SYM_L_LOCAL)
+ ldp x8, x9, [x1, #0]
+ ldp x10, x11, [x1, #16]
+ ldp x12, x13, [x1, #32]
+ ldp x14, x15, [x1, #48]
+USER_OFF(0, sttr x8, [x0, #0])
+USER_OFF(8, sttr x9, [x0, #8])
+USER_OFF(16, sttr x10, [x0, #16])
+USER_OFF(24, sttr x11, [x0, #24])
+USER_OFF(32, sttr x12, [x0, #32])
+USER_OFF(40, sttr x13, [x0, #40])
+USER_OFF(48, sttr x14, [x0, #48])
+USER_OFF(56, sttr x15, [x0, #56])
+ add x0, x0, #64
+ add x1, x1, #64
+ sub x2, x2, #64
+
+ cmp x2, #64
+ b.hs body_64b_loop
+
+SYM_INNER_LABEL(tail_32b, SYM_L_LOCAL)
+ tbz x2, #5, tail_16b
+
+ ldp x8, x9, [x1, #0]
+ ldp x10, x11, [x1, #16]
+USER_OFF(0, sttr x8, [x0, #0])
+USER_OFF(8, sttr x9, [x0, #8])
+USER_OFF(16, sttr x10, [x0, #16])
+USER_OFF(24, sttr x11, [x0, #24])
+ add x0, x0, #32
+ add x1, x1, #32
+
+SYM_INNER_LABEL(tail_16b, SYM_L_LOCAL)
+ tbz x2, #4, tail_8b
+
+ ldp x8, x9, [x1], #16
+USER_OFF(0, sttr x8, [x0, #0])
+USER_OFF(8, sttr x9, [x0, #8])
+ add x0, x0, #16
+
+SYM_INNER_LABEL(tail_8b, SYM_L_LOCAL)
+ tbz x2, #3, tail_4b
+
+ ldr x8, [x1], #8
+USER_OFF(0, sttr x8, [x0])
+ add x0, x0, #8
+
+SYM_INNER_LABEL(tail_4b, SYM_L_LOCAL)
+ tbz x2, #2, tail_2b
+
+ ldr w8, [x1], #4
+USER_OFF(0, sttr w8, [x0])
+ add x0, x0, #4
+
+SYM_INNER_LABEL(tail_2b, SYM_L_LOCAL)
+ tbz x2, #1, tail_1b
+
+ ldrh w8, [x1], #2
+USER_OFF(0, sttrh w8, [x0])
+ add x0, x0, #2
+
+SYM_INNER_LABEL(tail_1b, SYM_L_LOCAL)
+ tbz x2, #0, done
+
+ ldrb w8, [x1]
+USER_OFF(0, sttrb w8, [x0])
+
+SYM_INNER_LABEL(done, SYM_L_LOCAL)
+ mov x0, xzr
+ ret
+
+SYM_INNER_LABEL(bytewise_loop, SYM_L_LOCAL)
+ cbz x2, done
+
+ ldrb w8, [x1], #1
+USER_OFF(0, sttrb w8, [x0])
+ add x0, x0, #1
+ sub x2, x2, #1
+
+ b bytewise_loop
+
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
SYM_FUNC_END(__arch_copy_to_user)
EXPORT_SYMBOL(__arch_copy_to_user)