[2/3] riscv: optimized memmove

Message ID 20240128111013.2450-3-jszhang@kernel.org
State New
Headers
Series riscv: optimize memcpy/memmove/memset |

Commit Message

Jisheng Zhang Jan. 28, 2024, 11:10 a.m. UTC
  From: Matteo Croce <mcroce@microsoft.com>

When the destination buffer is before the source one, or when the
buffers doesn't overlap, it's safe to use memcpy() instead, which is
optimized to use a bigger data size possible.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
---
 arch/riscv/include/asm/string.h |   4 +-
 arch/riscv/kernel/riscv_ksyms.c |   2 -
 arch/riscv/lib/Makefile         |   1 -
 arch/riscv/lib/memmove.S        | 317 --------------------------------
 arch/riscv/lib/string.c         |  25 +++
 5 files changed, 27 insertions(+), 322 deletions(-)
 delete mode 100644 arch/riscv/lib/memmove.S
  

Comments

David Laight Jan. 28, 2024, 12:47 p.m. UTC | #1
From: Jisheng Zhang
> Sent: 28 January 2024 11:10
> 
> When the destination buffer is before the source one, or when the
> buffers doesn't overlap, it's safe to use memcpy() instead, which is
> optimized to use a bigger data size possible.
> 
..
> + * Simply check if the buffer overlaps an call memcpy() in case,
> + * otherwise do a simple one byte at time backward copy.

I'd at least do a 64bit copy loop if the addresses are aligned.

Thinks a bit more....

Put the copy 64 bytes code (the body of the memcpy() loop)
into it an inline function and call it with increasing addresses
in memcpy() are decrementing addresses in memmove.

So memcpy() contains:
	src_lim = src_lim + count;
	... alignment copy
	for (; src + 64 <= src_lim; src += 64; dest += 64)
		copy_64_bytes(dest, src);
	... tail copy

Then you can do something very similar for backwards copies.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
Jisheng Zhang Jan. 30, 2024, 11:30 a.m. UTC | #2
On Sun, Jan 28, 2024 at 12:47:00PM +0000, David Laight wrote:
> From: Jisheng Zhang
> > Sent: 28 January 2024 11:10
> > 
> > When the destination buffer is before the source one, or when the
> > buffers doesn't overlap, it's safe to use memcpy() instead, which is
> > optimized to use a bigger data size possible.
> > 
> ...
> > + * Simply check if the buffer overlaps an call memcpy() in case,
> > + * otherwise do a simple one byte at time backward copy.
> 
> I'd at least do a 64bit copy loop if the addresses are aligned.
> 
> Thinks a bit more....
> 
> Put the copy 64 bytes code (the body of the memcpy() loop)
> into it an inline function and call it with increasing addresses
> in memcpy() are decrementing addresses in memmove.

Hi David,

Besides the 64 bytes copy, there's another optimization in __memcpy:
word-by-word copy even if s and d are not aligned.
So if we make the two optimizd copy as inline functions and call them
in memmove(), we almost duplicate the __memcpy code, so I think
directly calling __memcpy is a bit better.

Thanks
> 
> So memcpy() contains:
> 	src_lim = src_lim + count;
> 	... alignment copy
> 	for (; src + 64 <= src_lim; src += 64; dest += 64)
> 		copy_64_bytes(dest, src);
> 	... tail copy
> 
> Then you can do something very similar for backwards copies.
> 
> 	David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>
  
Nick Kossifidis Jan. 30, 2024, 11:39 a.m. UTC | #3
On 1/28/24 13:10, Jisheng Zhang wrote:
> From: Matteo Croce <mcroce@microsoft.com>
> 
> When the destination buffer is before the source one, or when the
> buffers doesn't overlap, it's safe to use memcpy() instead, which is
> optimized to use a bigger data size possible.
> 
> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

I'd expect to have memmove handle both fw/bw copying and then memcpy 
being an alias to memmove, to also take care when regions overlap and 
avoid undefined behavior.


> --- a/arch/riscv/lib/string.c
> +++ b/arch/riscv/lib/string.c
> @@ -119,3 +119,28 @@ void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy)
>   EXPORT_SYMBOL(memcpy);
>   void *__pi_memcpy(void *dest, const void *src, size_t count) __alias(__memcpy);
>   void *__pi___memcpy(void *dest, const void *src, size_t count) __alias(__memcpy);
> +
> +/*
> + * Simply check if the buffer overlaps an call memcpy() in case,
> + * otherwise do a simple one byte at time backward copy.
> + */
> +void *__memmove(void *dest, const void *src, size_t count)
> +{
> +	if (dest < src || src + count <= dest)
> +		return __memcpy(dest, src, count);
> +
> +	if (dest > src) {
> +		const char *s = src + count;
> +		char *tmp = dest + count;
> +
> +		while (count--)
> +			*--tmp = *--s;
> +	}
> +	return dest;
> +}
> +EXPORT_SYMBOL(__memmove);
> +

Here is an approach for the backwards case to get things started...

static void
copy_bw(void *dst_ptr, const void *src_ptr, size_t len)
{
	union const_data src = { .as_bytes = src_ptr + len };
	union data dst = { .as_bytes = dst_ptr + len };
	size_t remaining = len;
	size_t src_offt = 0;

	if (len < 2 * WORD_SIZE)
		goto trailing_bw;

	for(; dst.as_uptr & WORD_MASK; remaining--)
		*--dst.as_bytes = *--src.as_bytes;

	src_offt = src.as_uptr & WORD_MASK;
	if (!src_offt) {
		for (; remaining >= WORD_SIZE; remaining -= WORD_SIZE)
			*--dst.as_ulong = *--src.as_ulong;
	} else {
		unsigned long cur, prev;
		src.as_bytes -= src_offt;
		for (; remaining >= WORD_SIZE; remaining -= WORD_SIZE) {
			cur = *src.as_ulong;
			prev = *--src.as_ulong;
			*--dst.as_ulong = cur << ((WORD_SIZE - src_offt) * 8) |
					  prev >> (src_offt * 8);
		}
		src.as_bytes += src_offt;
	}

  trailing_bw:
	while (remaining-- > 0)
		*--dst.as_bytes = *--src.as_bytes;
}

Regards,
Nick
  
David Laight Jan. 30, 2024, 11:51 a.m. UTC | #4
From: Jisheng Zhang
> Sent: 30 January 2024 11:31
> 
> On Sun, Jan 28, 2024 at 12:47:00PM +0000, David Laight wrote:
> > From: Jisheng Zhang
> > > Sent: 28 January 2024 11:10
> > >
> > > When the destination buffer is before the source one, or when the
> > > buffers doesn't overlap, it's safe to use memcpy() instead, which is
> > > optimized to use a bigger data size possible.
> > >
> > ...
> > > + * Simply check if the buffer overlaps an call memcpy() in case,
> > > + * otherwise do a simple one byte at time backward copy.
> >
> > I'd at least do a 64bit copy loop if the addresses are aligned.
> >
> > Thinks a bit more....
> >
> > Put the copy 64 bytes code (the body of the memcpy() loop)
> > into it an inline function and call it with increasing addresses
> > in memcpy() are decrementing addresses in memmove.
> 
> Hi David,
> 
> Besides the 64 bytes copy, there's another optimization in __memcpy:
> word-by-word copy even if s and d are not aligned.
> So if we make the two optimizd copy as inline functions and call them
> in memmove(), we almost duplicate the __memcpy code, so I think
> directly calling __memcpy is a bit better.

If a forwards copy is valid call memcpy() - which I think you do.
If not you can still use the same 'copy 8 register' code
that memcpy() uses - just with a decrementing block address.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
Jisheng Zhang Jan. 30, 2024, 1:12 p.m. UTC | #5
On Tue, Jan 30, 2024 at 01:39:10PM +0200, Nick Kossifidis wrote:
> On 1/28/24 13:10, Jisheng Zhang wrote:
> > From: Matteo Croce <mcroce@microsoft.com>
> > 
> > When the destination buffer is before the source one, or when the
> > buffers doesn't overlap, it's safe to use memcpy() instead, which is
> > optimized to use a bigger data size possible.
> > 
> > Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> > Reported-by: kernel test robot <lkp@intel.com>
> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> 
> I'd expect to have memmove handle both fw/bw copying and then memcpy being
> an alias to memmove, to also take care when regions overlap and avoid
> undefined behavior.

Hi Nick,

Here is somthing from man memcpy:

"void *memcpy(void dest[restrict .n], const void src[restrict .n],
                    size_t n);

The  memcpy()  function copies n bytes from memory area src to memory area dest.
The memory areas must not overlap.  Use memmove(3) if the memory areas do  over‐
lap."

IMHO, the "restrict" implies that there's no overlap. If overlap
happens, the manual doesn't say what will happen.

From another side, I have a concern: currently, other arch don't have
this alias behavior, IIUC(at least, per my understanding of arm and arm64
memcpy implementations)they just copy forward. I want to keep similar behavior
for riscv.

So I want to hear more before going towards alias-memcpy-to-memmove direction.

Thanks
> 
> 
> > --- a/arch/riscv/lib/string.c
> > +++ b/arch/riscv/lib/string.c
> > @@ -119,3 +119,28 @@ void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy)
> >   EXPORT_SYMBOL(memcpy);
> >   void *__pi_memcpy(void *dest, const void *src, size_t count) __alias(__memcpy);
> >   void *__pi___memcpy(void *dest, const void *src, size_t count) __alias(__memcpy);
> > +
> > +/*
> > + * Simply check if the buffer overlaps an call memcpy() in case,
> > + * otherwise do a simple one byte at time backward copy.
> > + */
> > +void *__memmove(void *dest, const void *src, size_t count)
> > +{
> > +	if (dest < src || src + count <= dest)
> > +		return __memcpy(dest, src, count);
> > +
> > +	if (dest > src) {
> > +		const char *s = src + count;
> > +		char *tmp = dest + count;
> > +
> > +		while (count--)
> > +			*--tmp = *--s;
> > +	}
> > +	return dest;
> > +}
> > +EXPORT_SYMBOL(__memmove);
> > +
> 
> Here is an approach for the backwards case to get things started...
> 
> static void
> copy_bw(void *dst_ptr, const void *src_ptr, size_t len)
> {
> 	union const_data src = { .as_bytes = src_ptr + len };
> 	union data dst = { .as_bytes = dst_ptr + len };
> 	size_t remaining = len;
> 	size_t src_offt = 0;
> 
> 	if (len < 2 * WORD_SIZE)
> 		goto trailing_bw;
> 
> 	for(; dst.as_uptr & WORD_MASK; remaining--)
> 		*--dst.as_bytes = *--src.as_bytes;
> 
> 	src_offt = src.as_uptr & WORD_MASK;
> 	if (!src_offt) {
> 		for (; remaining >= WORD_SIZE; remaining -= WORD_SIZE)
> 			*--dst.as_ulong = *--src.as_ulong;
> 	} else {
> 		unsigned long cur, prev;
> 		src.as_bytes -= src_offt;
> 		for (; remaining >= WORD_SIZE; remaining -= WORD_SIZE) {
> 			cur = *src.as_ulong;
> 			prev = *--src.as_ulong;
> 			*--dst.as_ulong = cur << ((WORD_SIZE - src_offt) * 8) |
> 					  prev >> (src_offt * 8);
> 		}
> 		src.as_bytes += src_offt;
> 	}
> 
>  trailing_bw:
> 	while (remaining-- > 0)
> 		*--dst.as_bytes = *--src.as_bytes;
> }
> 
> Regards,
> Nick
  
Nick Kossifidis Jan. 30, 2024, 4:52 p.m. UTC | #6
On 1/30/24 15:12, Jisheng Zhang wrote:
> On Tue, Jan 30, 2024 at 01:39:10PM +0200, Nick Kossifidis wrote:
>> On 1/28/24 13:10, Jisheng Zhang wrote:
>>> From: Matteo Croce <mcroce@microsoft.com>
>>>
>>> When the destination buffer is before the source one, or when the
>>> buffers doesn't overlap, it's safe to use memcpy() instead, which is
>>> optimized to use a bigger data size possible.
>>>
>>> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
>>> Reported-by: kernel test robot <lkp@intel.com>
>>> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
>>
>> I'd expect to have memmove handle both fw/bw copying and then memcpy being
>> an alias to memmove, to also take care when regions overlap and avoid
>> undefined behavior.
> 
> Hi Nick,
> 
> Here is somthing from man memcpy:
> 
> "void *memcpy(void dest[restrict .n], const void src[restrict .n],
>                      size_t n);
> 
> The  memcpy()  function copies n bytes from memory area src to memory area dest.
> The memory areas must not overlap.  Use memmove(3) if the memory areas do  over‐
> lap."
> 
> IMHO, the "restrict" implies that there's no overlap. If overlap
> happens, the manual doesn't say what will happen.
> 
>  From another side, I have a concern: currently, other arch don't have
> this alias behavior, IIUC(at least, per my understanding of arm and arm64
> memcpy implementations)they just copy forward. I want to keep similar behavior
> for riscv.
> 
> So I want to hear more before going towards alias-memcpy-to-memmove direction.
> 
> Thanks

If you read Matteo's original post that was also his suggestion, and 
Linus has also commented on that. In general it's better to handle the 
case where the regions provided to memcpy() overlap than to resort to 
"undefined behavior", I provided a backwards copy example that you can 
use so that we can have both fw and bw copying for memmove(), and use 
memmove() in any case. The [restrict .n] in the prototype is just there 
to say that the size of src is restricted by n (the next argument). If 
someone uses memcpy() with overlapping regions, which is always a 
possibility, in your case it'll result corrupted data, we won't even get 
a warning (still counts as undefined behavior) about it.

Regards,
Nick
  
Jisheng Zhang Jan. 31, 2024, 5:25 a.m. UTC | #7
On Tue, Jan 30, 2024 at 06:52:24PM +0200, Nick Kossifidis wrote:
> On 1/30/24 15:12, Jisheng Zhang wrote:
> > On Tue, Jan 30, 2024 at 01:39:10PM +0200, Nick Kossifidis wrote:
> > > On 1/28/24 13:10, Jisheng Zhang wrote:
> > > > From: Matteo Croce <mcroce@microsoft.com>
> > > > 
> > > > When the destination buffer is before the source one, or when the
> > > > buffers doesn't overlap, it's safe to use memcpy() instead, which is
> > > > optimized to use a bigger data size possible.
> > > > 
> > > > Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> > > > Reported-by: kernel test robot <lkp@intel.com>
> > > > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> > > 
> > > I'd expect to have memmove handle both fw/bw copying and then memcpy being
> > > an alias to memmove, to also take care when regions overlap and avoid
> > > undefined behavior.
> > 
> > Hi Nick,
> > 
> > Here is somthing from man memcpy:
> > 
> > "void *memcpy(void dest[restrict .n], const void src[restrict .n],
> >                      size_t n);
> > 
> > The  memcpy()  function copies n bytes from memory area src to memory area dest.
> > The memory areas must not overlap.  Use memmove(3) if the memory areas do  over‐
> > lap."
> > 
> > IMHO, the "restrict" implies that there's no overlap. If overlap
> > happens, the manual doesn't say what will happen.
> > 
> >  From another side, I have a concern: currently, other arch don't have
> > this alias behavior, IIUC(at least, per my understanding of arm and arm64
> > memcpy implementations)they just copy forward. I want to keep similar behavior
> > for riscv.
> > 
> > So I want to hear more before going towards alias-memcpy-to-memmove direction.
> > 
> > Thanks
> 

Hi Nick,

> If you read Matteo's original post that was also his suggestion, and Linus

I did read all discussions in Matteo's v1 ~ v5 before this renew. Per my
understanding, Matteo also concerned no such memcpy-alias-memmove behavior
in other arch's implementations.

> has also commented on that. In general it's better to handle the case where

Linus commented on https://bugzilla.redhat.com/show_bug.cgi?id=638477#c132
about glibc alias memcpy to memove rather than the patch series.

> the regions provided to memcpy() overlap than to resort to "undefined
> behavior", I provided a backwards copy example that you can use so that we
> can have both fw and bw copying for memmove(), and use memmove() in any
> case. The [restrict .n] in the prototype is just there to say that the size
> of src is restricted by n (the next argument). If someone uses memcpy() with

I didn't have c99 spec in hand, but I found gcc explanations about
restrict keyword from [1]:

"the restrict declaration promises that the code will not access that
object in any other way--only through p."

So if there's overlap in memcpy, then it contradicts the restrict
implication.

[1] https://www.gnu.org/software/c-intro-and-ref/manual/html_node/restrict-Pointers.html

And from the manual, if the memcpy users must ensure "The memory areas
must not overlap." So I think all linux kernel's memcpy implementations(only copy
fw and don't take overlap into consideration) are right.

I did see the alias-memcpy-as-memmove in some libc implementations, but
this is not the style in current kernel's implementations.

Given current riscv asm implementation also doesn't do the alias and
copy-fw only, and this series improves performance and doesn't introduce the
Is it better to divide this into two steps: Firstly, merge this series
if there's no obvious bug; secondly, do the alias as you suggested,
since you have a basic implementation, you could even submit your patch
;) What do you think about this two steps solution?

Thanks
> overlapping regions, which is always a possibility, in your case it'll
> result corrupted data, we won't even get a warning (still counts as
> undefined behavior) about it.
> 
> Regards,
> Nick
>
  
Nick Kossifidis Jan. 31, 2024, 9:13 a.m. UTC | #8
On 1/31/24 07:25, Jisheng Zhang wrote:
> 
> I didn't have c99 spec in hand, but I found gcc explanations about
> restrict keyword from [1]:
> 
> "the restrict declaration promises that the code will not access that
> object in any other way--only through p."
> 
> So if there's overlap in memcpy, then it contradicts the restrict
> implication.
> 
> [1] https://www.gnu.org/software/c-intro-and-ref/manual/html_node/restrict-Pointers.html
> 
The union used in the code also contradicts this. BTW the restrict 
qualifier isn't used in kernel's lib/string.c nor in the current 
implementation 
(https://elixir.bootlin.com/linux/latest/source/arch/riscv/include/asm/string.h#L16).

> And from the manual, if the memcpy users must ensure "The memory areas
> must not overlap." So I think all linux kernel's memcpy implementations(only copy
> fw and don't take overlap into consideration) are right.
> 
> I did see the alias-memcpy-as-memmove in some libc implementations, but
> this is not the style in current kernel's implementations.
> 
> Given current riscv asm implementation also doesn't do the alias and
> copy-fw only, and this series improves performance and doesn't introduce the
> Is it better to divide this into two steps: Firstly, merge this series
> if there's no obvious bug; secondly, do the alias as you suggested,
> since you have a basic implementation, you could even submit your patch
> ;) What do you think about this two steps solution?
> 

I still don't understand why you prefer undefined behavior over just 
aliasing memcpy to memmove. Anyway, do as you wish, I don't have time to 
work on this unfortunately. Feel free to use the code I shared for bw 
copy etc.

Regards,
Nick
  

Patch

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index edf1d56e4f13..17c3b40382e1 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -18,8 +18,8 @@  extern void *memcpy(void *dest, const void *src, size_t count);
 extern void *__memcpy(void *dest, const void *src, size_t count);
 
 #define __HAVE_ARCH_MEMMOVE
-extern asmlinkage void *memmove(void *, const void *, size_t);
-extern asmlinkage void *__memmove(void *, const void *, size_t);
+extern void *memmove(void *dest, const void *src, size_t count);
+extern void *__memmove(void *dest, const void *src, size_t count);
 
 #define __HAVE_ARCH_STRCMP
 extern asmlinkage int strcmp(const char *cs, const char *ct);
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index c69dc74e0a27..76849d0906ef 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -10,9 +10,7 @@ 
  * Assembly functions that may be used (directly or indirectly) by modules
  */
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 5f2f94f6db17..5fa88c5a601c 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,7 +1,6 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y			+= delay.o
 lib-y			+= memset.o
-lib-y			+= memmove.o
 lib-y			+= strcmp.o
 lib-y			+= strlen.o
 lib-y			+= string.o
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
deleted file mode 100644
index cb3e2e7ef0ba..000000000000
--- a/arch/riscv/lib/memmove.S
+++ /dev/null
@@ -1,317 +0,0 @@ 
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-SYM_FUNC_START(__memmove)
-	/*
-	 * Returns
-	 *   a0 - dest
-	 *
-	 * Parameters
-	 *   a0 - Inclusive first byte of dest
-	 *   a1 - Inclusive first byte of src
-	 *   a2 - Length of copy n
-	 *
-	 * Because the return matches the parameter register a0,
-	 * we will not clobber or modify that register.
-	 *
-	 * Note: This currently only works on little-endian.
-	 * To port to big-endian, reverse the direction of shifts
-	 * in the 2 misaligned fixup copy loops.
-	 */
-
-	/* Return if nothing to do */
-	beq a0, a1, .Lreturn_from_memmove
-	beqz a2, .Lreturn_from_memmove
-
-	/*
-	 * Register Uses
-	 *      Forward Copy: a1 - Index counter of src
-	 *      Reverse Copy: a4 - Index counter of src
-	 *      Forward Copy: t3 - Index counter of dest
-	 *      Reverse Copy: t4 - Index counter of dest
-	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
-	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
-	 *   Both Copy Modes: t0 - Link / Temporary for load-store
-	 *   Both Copy Modes: t1 - Temporary for load-store
-	 *   Both Copy Modes: t2 - Temporary for load-store
-	 *   Both Copy Modes: a5 - dest to src alignment offset
-	 *   Both Copy Modes: a6 - Shift ammount
-	 *   Both Copy Modes: a7 - Inverse Shift ammount
-	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
-	 */
-
-	/*
-	 * Solve for some register values now.
-	 * Byte copy does not need t5 or t6.
-	 */
-	mv   t3, a0
-	add  t4, a0, a2
-	add  a4, a1, a2
-
-	/*
-	 * Byte copy if copying less than (2 * SZREG) bytes. This can
-	 * cause problems with the bulk copy implementation and is
-	 * small enough not to bother.
-	 */
-	andi t0, a2, -(2 * SZREG)
-	beqz t0, .Lbyte_copy
-
-	/*
-	 * Now solve for t5 and t6.
-	 */
-	andi t5, t3, -SZREG
-	andi t6, t4, -SZREG
-	/*
-	 * If dest(Register t3) rounded down to the nearest naturally
-	 * aligned SZREG address, does not equal dest, then add SZREG
-	 * to find the low-bound of SZREG alignment in the dest memory
-	 * region.  Note that this could overshoot the dest memory
-	 * region if n is less than SZREG.  This is one reason why
-	 * we always byte copy if n is less than SZREG.
-	 * Otherwise, dest is already naturally aligned to SZREG.
-	 */
-	beq  t5, t3, 1f
-		addi t5, t5, SZREG
-	1:
-
-	/*
-	 * If the dest and src are co-aligned to SZREG, then there is
-	 * no need for the full rigmarole of a full misaligned fixup copy.
-	 * Instead, do a simpler co-aligned copy.
-	 */
-	xor  t0, a0, a1
-	andi t1, t0, (SZREG - 1)
-	beqz t1, .Lcoaligned_copy
-	/* Fall through to misaligned fixup copy */
-
-.Lmisaligned_fixup_copy:
-	bltu a1, a0, .Lmisaligned_fixup_copy_reverse
-
-.Lmisaligned_fixup_copy_forward:
-	jal  t0, .Lbyte_copy_until_aligned_forward
-
-	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
-	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
-	sub  a5, a1, t3 /* Find the difference between src and dest */
-	andi a1, a1, -SZREG /* Align the src pointer */
-	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
-
-	/*
-	 * Compute The Inverse Shift
-	 * a7 = XLEN - a6 = XLEN + -a6
-	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
-	 * Add that to XLEN.  XLEN = SZREG * 8.
-	 */
-	not  a7, a6
-	addi a7, a7, (SZREG * 8 + 1)
-
-	/*
-	 * Fix Misalignment Copy Loop - Forward
-	 * load_val0 = load_ptr[0];
-	 * do {
-	 * 	load_val1 = load_ptr[1];
-	 * 	store_ptr += 2;
-	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
-	 *
-	 * 	if (store_ptr == {a2})
-	 * 		break;
-	 *
-	 * 	load_val0 = load_ptr[2];
-	 * 	load_ptr += 2;
-	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
-	 *
-	 * } while (store_ptr != store_ptr_end);
-	 * store_ptr = store_ptr_end;
-	 */
-
-	REG_L t0, (0 * SZREG)(a1)
-	1:
-	REG_L t1, (1 * SZREG)(a1)
-	addi  t3, t3, (2 * SZREG)
-	srl   t0, t0, a6
-	sll   t2, t1, a7
-	or    t2, t0, t2
-	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
-
-	beq   t3, a2, 2f
-
-	REG_L t0, (2 * SZREG)(a1)
-	addi  a1, a1, (2 * SZREG)
-	srl   t1, t1, a6
-	sll   t2, t0, a7
-	or    t2, t1, t2
-	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
-
-	bne   t3, t6, 1b
-	2:
-	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */
-
-	add  a1, t3, a5 /* Restore the src pointer */
-	j .Lbyte_copy_forward /* Copy any remaining bytes */
-
-.Lmisaligned_fixup_copy_reverse:
-	jal  t0, .Lbyte_copy_until_aligned_reverse
-
-	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
-	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
-	sub  a5, a4, t4 /* Find the difference between src and dest */
-	andi a4, a4, -SZREG /* Align the src pointer */
-	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
-
-	/*
-	 * Compute The Inverse Shift
-	 * a7 = XLEN - a6 = XLEN + -a6
-	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
-	 * Add that to XLEN.  XLEN = SZREG * 8.
-	 */
-	not  a7, a6
-	addi a7, a7, (SZREG * 8 + 1)
-
-	/*
-	 * Fix Misalignment Copy Loop - Reverse
-	 * load_val1 = load_ptr[0];
-	 * do {
-	 * 	load_val0 = load_ptr[-1];
-	 * 	store_ptr -= 2;
-	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
-	 *
-	 * 	if (store_ptr == {a2})
-	 * 		break;
-	 *
-	 * 	load_val1 = load_ptr[-2];
-	 * 	load_ptr -= 2;
-	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
-	 *
-	 * } while (store_ptr != store_ptr_end);
-	 * store_ptr = store_ptr_end;
-	 */
-
-	REG_L t1, ( 0 * SZREG)(a4)
-	1:
-	REG_L t0, (-1 * SZREG)(a4)
-	addi  t4, t4, (-2 * SZREG)
-	sll   t1, t1, a7
-	srl   t2, t0, a6
-	or    t2, t1, t2
-	REG_S t2, ( 1 * SZREG)(t4)
-
-	beq   t4, a2, 2f
-
-	REG_L t1, (-2 * SZREG)(a4)
-	addi  a4, a4, (-2 * SZREG)
-	sll   t0, t0, a7
-	srl   t2, t1, a6
-	or    t2, t0, t2
-	REG_S t2, ( 0 * SZREG)(t4)
-
-	bne   t4, t5, 1b
-	2:
-	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */
-
-	add  a4, t4, a5 /* Restore the src pointer */
-	j .Lbyte_copy_reverse /* Copy any remaining bytes */
-
-/*
- * Simple copy loops for SZREG co-aligned memory locations.
- * These also make calls to do byte copies for any unaligned
- * data at their terminations.
- */
-.Lcoaligned_copy:
-	bltu a1, a0, .Lcoaligned_copy_reverse
-
-.Lcoaligned_copy_forward:
-	jal t0, .Lbyte_copy_until_aligned_forward
-
-	1:
-	REG_L t1, ( 0 * SZREG)(a1)
-	addi  a1, a1, SZREG
-	addi  t3, t3, SZREG
-	REG_S t1, (-1 * SZREG)(t3)
-	bne   t3, t6, 1b
-
-	j .Lbyte_copy_forward /* Copy any remaining bytes */
-
-.Lcoaligned_copy_reverse:
-	jal t0, .Lbyte_copy_until_aligned_reverse
-
-	1:
-	REG_L t1, (-1 * SZREG)(a4)
-	addi  a4, a4, -SZREG
-	addi  t4, t4, -SZREG
-	REG_S t1, ( 0 * SZREG)(t4)
-	bne   t4, t5, 1b
-
-	j .Lbyte_copy_reverse /* Copy any remaining bytes */
-
-/*
- * These are basically sub-functions within the function.  They
- * are used to byte copy until the dest pointer is in alignment.
- * At which point, a bulk copy method can be used by the
- * calling code.  These work on the same registers as the bulk
- * copy loops.  Therefore, the register values can be picked
- * up from where they were left and we avoid code duplication
- * without any overhead except the call in and return jumps.
- */
-.Lbyte_copy_until_aligned_forward:
-	beq  t3, t5, 2f
-	1:
-	lb   t1,  0(a1)
-	addi a1, a1, 1
-	addi t3, t3, 1
-	sb   t1, -1(t3)
-	bne  t3, t5, 1b
-	2:
-	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
-
-.Lbyte_copy_until_aligned_reverse:
-	beq  t4, t6, 2f
-	1:
-	lb   t1, -1(a4)
-	addi a4, a4, -1
-	addi t4, t4, -1
-	sb   t1,  0(t4)
-	bne  t4, t6, 1b
-	2:
-	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
-
-/*
- * Simple byte copy loops.
- * These will byte copy until they reach the end of data to copy.
- * At that point, they will call to return from memmove.
- */
-.Lbyte_copy:
-	bltu a1, a0, .Lbyte_copy_reverse
-
-.Lbyte_copy_forward:
-	beq  t3, t4, 2f
-	1:
-	lb   t1,  0(a1)
-	addi a1, a1, 1
-	addi t3, t3, 1
-	sb   t1, -1(t3)
-	bne  t3, t4, 1b
-	2:
-	ret
-
-.Lbyte_copy_reverse:
-	beq  t4, t3, 2f
-	1:
-	lb   t1, -1(a4)
-	addi a4, a4, -1
-	addi t4, t4, -1
-	sb   t1,  0(t4)
-	bne  t4, t3, 1b
-	2:
-
-.Lreturn_from_memmove:
-	ret
-
-SYM_FUNC_END(__memmove)
-SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
-SYM_FUNC_ALIAS(__pi_memmove, __memmove)
-SYM_FUNC_ALIAS(__pi___memmove, __memmove)
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
index 5f9c83ec548d..20677c8067da 100644
--- a/arch/riscv/lib/string.c
+++ b/arch/riscv/lib/string.c
@@ -119,3 +119,28 @@  void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy)
 EXPORT_SYMBOL(memcpy);
 void *__pi_memcpy(void *dest, const void *src, size_t count) __alias(__memcpy);
 void *__pi___memcpy(void *dest, const void *src, size_t count) __alias(__memcpy);
+
+/*
+ * Simply check if the buffer overlaps an call memcpy() in case,
+ * otherwise do a simple one byte at time backward copy.
+ */
+void *__memmove(void *dest, const void *src, size_t count)
+{
+	if (dest < src || src + count <= dest)
+		return __memcpy(dest, src, count);
+
+	if (dest > src) {
+		const char *s = src + count;
+		char *tmp = dest + count;
+
+		while (count--)
+			*--tmp = *--s;
+	}
+	return dest;
+}
+EXPORT_SYMBOL(__memmove);
+
+void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
+EXPORT_SYMBOL(memmove);
+void *__pi_memmove(void *dest, const void *src, size_t count) __alias(__memmove);
+void *__pi___memmove(void *dest, const void *src, size_t count) __alias(__memmove);