[v2] LoongArch: add checksum optimization for 64-bit system

Message ID 20230209035839.2610277-1-maobibo@loongson.cn
State New
Series [v2] LoongArch: add checksum optimization for 64-bit system

Commit Message

maobibo Feb. 9, 2023, 3:58 a.m. UTC
The LoongArch platform is a 64-bit system which supports 8-byte memory
accesses, while the generic checksum function only uses 4-byte memory
accesses. This patch adds an 8-byte memory access optimization for the
checksum function on LoongArch. The code comes from the arm64
implementation.

When network hardware checksumming is disabled, iperf performance
improves by about 10% with this patch.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
---
Changelog:
v2: use rotation API in csum_fold to reduce one instruction.
---
 arch/loongarch/include/asm/checksum.h |  65 ++++++++++++
 arch/loongarch/lib/Makefile           |   2 +-
 arch/loongarch/lib/csum.c             | 142 ++++++++++++++++++++++++++
 3 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 arch/loongarch/include/asm/checksum.h
 create mode 100644 arch/loongarch/lib/csum.c
  

Comments

David Laight Feb. 9, 2023, 9:35 a.m. UTC | #1
From: Bibo Mao
> Sent: 09 February 2023 03:59
> 
> The LoongArch platform is a 64-bit system which supports 8-byte memory
> accesses, while the generic checksum function only uses 4-byte memory
> accesses. This patch adds an 8-byte memory access optimization for the
> checksum function on LoongArch. The code comes from the arm64
> implementation.

How fast do these functions actually run (in bytes/clock)?

It is quite possible that just adding 32bit values to a
64bit register is faster.
Any non-trivial cpu will run that at 4 bytes/clock
(for suitably unrolled and pipelined code).
On a more complex cpu adding to two registers will
give 8 bytes/clock (needs two memory loads/clock).
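
A minimal C sketch of that 32-bit-loads-into-a-64-bit-register approach
(illustrative only, not code from this patch; the function name is made up
and alignment/tail handling are assumed away):

	#include <stdint.h>

	/* The high half of the 64-bit accumulator absorbs the carries,
	 * so no per-add carry handling is needed until the final fold.
	 * Assumes a 4-byte-aligned buffer and len a multiple of 4. */
	static uint32_t csum32_sketch(const uint32_t *p, int len)
	{
		uint64_t sum = 0;

		for (; len >= 4; len -= 4)
			sum += *p++;	/* 32-bit load, 64-bit add */

		/* end-around folds: 64 -> 32 -> 16 bits */
		sum = (sum & 0xffffffff) + (sum >> 32);
		sum = (sum & 0xffffffff) + (sum >> 32);
		sum = (sum & 0xffff) + (sum >> 16);
		sum = (sum & 0xffff) + (sum >> 16);
		return (uint32_t)sum;
	}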

The fastest 64bit sum you'll get on anything mips-like
(no carry flag) is probably from something like:
	val = *mem++;  // 64bit read
	sum += val;
	carry = sum < val;
	carry_sum += carry;
which is 2 bytes/instruction again.
To get to 8 bytes/clock you need to execute all 4 instructions
every clock - so 1 read and 3 arithmetic.
(c/f 2 read and 2 arithmetic for 32bit adds.)
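
The same four-step idiom as a self-contained C sketch (illustrative; the
function name is made up, and the final end-around fold is made explicit):

	#include <stdint.h>

	static uint64_t csum64_sketch(const uint64_t *mem, long n_words)
	{
		uint64_t sum = 0, carry_sum = 0;

		while (n_words--) {
			uint64_t val = *mem++;	/* 64bit read */
			sum += val;
			carry_sum += sum < val;	/* the sltu-style compare */
		}
		/* fold the counted carries back in, end-around */
		sum += carry_sum;
		sum += sum < carry_sum;
		return sum;
	}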

Arm has a carry flag so the code is:
	val = *mem++;
	temp,carry = sum + val;
	sum = sum + val + carry;
There are still two dependent arithmetic instructions for
each 8-byte word.
The dependencies on the flags register also make it harder
to get any benefit from interleaving adds to two registers.

x86-64 uses 64bit 'add with carry' chains.
No one ever noticed that they take two clocks each on
Intel cpus until (about) Haswell.
It is possible to get 12 bytes/clock with some strange
loops that use (IIRC) adcx and adox.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
maobibo Feb. 9, 2023, 11:55 a.m. UTC | #2
On 2023/2/9 17:35, David Laight wrote:
> From: Bibo Mao
>> Sent: 09 February 2023 03:59
>>
>> The LoongArch platform is a 64-bit system which supports 8-byte memory
>> accesses, while the generic checksum function only uses 4-byte memory
>> accesses. This patch adds an 8-byte memory access optimization for the
>> checksum function on LoongArch. The code comes from the arm64
>> implementation.
> 
> How fast do these functions actually run (in bytes/clock)?
With the uint128 method the loop is unrolled, and the instructions
can execute in parallel. It gets the best result on LoongArch
systems, where there is neither a carry flag nor post-index
addressing modes.

Here is a piece of the disassembled code with the uint128 method:
   120000a40:   28c0222f        ld.d    $r15,$r17,8(0x8)
   120000a44:   28c0622a        ld.d    $r10,$r17,24(0x18)
   120000a48:   28c0a230        ld.d    $r16,$r17,40(0x28)
   120000a4c:   28c0e232        ld.d    $r18,$r17,56(0x38)
   120000a50:   28c0022e        ld.d    $r14,$r17,0
   120000a54:   28c0422d        ld.d    $r13,$r17,16(0x10)
   120000a58:   28c0822b        ld.d    $r11,$r17,32(0x20)
   120000a5c:   28c0c22c        ld.d    $r12,$r17,48(0x30)
   120000a60:   0010b9f7        add.d   $r23,$r15,$r14
   120000a64:   0010b54d        add.d   $r13,$r10,$r13
   120000a68:   0010b24c        add.d   $r12,$r18,$r12
   120000a6c:   0010ae0b        add.d   $r11,$r16,$r11
   120000a70:   0012c992        sltu    $r18,$r12,$r18
   120000a74:   0012beee        sltu    $r14,$r23,$r15
   120000a78:   0012c170        sltu    $r16,$r11,$r16
   120000a7c:   0012a9aa        sltu    $r10,$r13,$r10
   120000a80:   0010ae0f        add.d   $r15,$r16,$r11
   120000a84:   0010ddce        add.d   $r14,$r14,$r23
   120000a88:   0010b250        add.d   $r16,$r18,$r12
   120000a8c:   0010b54d        add.d   $r13,$r10,$r13
   120000a90:   0010b5d2        add.d   $r18,$r14,$r13
   120000a94:   0010c1f0        add.d   $r16,$r15,$r16
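
The C idiom that expands into this flag-free add.d/sltu/add.d pattern is the
uint128 rotate trick used throughout csum.c in the patch at the bottom of
this page; reduced to a single illustrative step:

	#include <stdint.h>

	/* Add the two 64-bit halves with an end-around carry: the 128-bit
	 * add leaves the carry in the high half, and adding the swapped
	 * halves folds it back in -- no flags register needed. */
	static uint64_t fold_step(__uint128_t tmp)
	{
		tmp += (tmp >> 64) | (tmp << 64);
		return tmp >> 64;
	}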

> 
> It is quite possible that just adding 32bit values to a
> 64bit register is faster.
> Any non-trivial cpu will run that at 4 bytes/clock
> (for suitably unrolled and pipelined code).
> On a more complex cpu adding to two registers will
> give 8 bytes/clock (needs two memory loads/clock).
> 
> The fastest 64bit sum you'll get on anything mips-like
> (no carry flag) is probably from something like:
> 	val = *mem++;  // 64bit read
> 	sum += val;
> 	carry = sum < val;
> 	carry_sum += carry;
> which is 2 bytes/instruction again.
> To get to 8 bytes/clock you need to execute all 4 instructions
> every clock - so 1 read and 3 arithmetic.
There are no post-index addressing modes on LoongArch:
 	val = *mem;  // 64bit read
        mem++;
 	sum += val;
 	carry = sum < val;
 	carry_sum += carry;
it takes 5 instructions, and each of these 5 instructions depends on the
previous one. Here is the piece of disassembled code:
   120000d90:   28c001f0        ld.d    $r16,$r15,0
   120000d94:   0010c58c        add.d   $r12,$r12,$r17
   120000d98:   02c021ef        addi.d  $r15,$r15,8(0x8)
   120000d9c:   0010b20c        add.d   $r12,$r16,$r12
   120000da0:   0012c191        sltu    $r17,$r12,$r16
   120000da4:   5fffedf2        bne     $r15,$r18,-20(0x3ffec) # 120000d90 <do_csum_64+0x90>

regards
bibo, mao
          

  
David Laight Feb. 9, 2023, 12:39 p.m. UTC | #3
From: maobibo
> Sent: 09 February 2023 11:55
> 
> 
> On 2023/2/9 17:35, David Laight wrote:
> > From: Bibo Mao
> >> Sent: 09 February 2023 03:59
> >>
> >> The LoongArch platform is a 64-bit system which supports 8-byte memory
> >> accesses, while the generic checksum function only uses 4-byte memory
> >> accesses. This patch adds an 8-byte memory access optimization for the
> >> checksum function on LoongArch. The code comes from the arm64
> >> implementation.
> >
> > How fast do these functions actually run (in bytes/clock)?
> > With the uint128 method the loop is unrolled, and the instructions
> > can execute in parallel. It gets the best result on LoongArch
> > systems, where there is neither a carry flag nor post-index
> > addressing modes.

We're probably almost agreeing...

> > Here is a piece of the disassembled code with the uint128 method:

Load 8 values:

>    120000a40:   28c0222f        ld.d    $r15,$r17,8(0x8)
>    120000a44:   28c0622a        ld.d    $r10,$r17,24(0x18)
>    120000a48:   28c0a230        ld.d    $r16,$r17,40(0x28)
>    120000a4c:   28c0e232        ld.d    $r18,$r17,56(0x38)
>    120000a50:   28c0022e        ld.d    $r14,$r17,0
>    120000a54:   28c0422d        ld.d    $r13,$r17,16(0x10)
>    120000a58:   28c0822b        ld.d    $r11,$r17,32(0x20)
>    120000a5c:   28c0c22c        ld.d    $r12,$r17,48(0x30)

Pairwise add them

>    120000a60:   0010b9f7        add.d   $r23,$r15,$r14
>    120000a64:   0010b54d        add.d   $r13,$r10,$r13
>    120000a68:   0010b24c        add.d   $r12,$r18,$r12
>    120000a6c:   0010ae0b        add.d   $r11,$r16,$r11

Generate 4 'carry' bits

>    120000a70:   0012c992        sltu    $r18,$r12,$r18
>    120000a74:   0012beee        sltu    $r14,$r23,$r15
>    120000a78:   0012c170        sltu    $r16,$r11,$r16
>    120000a7c:   0012a9aa        sltu    $r10,$r13,$r10

Add the carry bits onto the sums.
I've not quite worked out which add is which!
But I think you've missed a few adds here.

>    120000a80:   0010ae0f        add.d   $r15,$r16,$r11
>    120000a84:   0010ddce        add.d   $r14,$r14,$r23
>    120000a88:   0010b250        add.d   $r16,$r18,$r12
>    120000a8c:   0010b54d        add.d   $r13,$r10,$r13
>    120000a90:   0010b5d2        add.d   $r18,$r14,$r13
>    120000a94:   0010c1f0        add.d   $r16,$r15,$r16

Somewhere each value needs an add, an sltu to generate the 'carry',
and an add for the carry itself.
If you sum the carry bits into a separate register it is
possible to get both adds and the sltu (for different values)
to run in the same clock (on a suitable cpu).
If there are 4 integer units you can also get the loop instructions
'for free' and unrolling 8 times may not be needed at all.
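
A C sketch of that separate-carry-register arrangement, unrolled twice
(illustrative only, not from the patch; the two independent chains are
merged with end-around carries at the end):

	#include <stdint.h>

	static uint64_t csum_two_chains(const uint64_t *p, long n_pairs)
	{
		uint64_t sum0 = 0, carry0 = 0, sum1 = 0, carry1 = 0;

		while (n_pairs--) {
			uint64_t v0 = p[0], v1 = p[1];

			p += 2;
			/* two independent add/sltu/add chains, so the adds
			 * and the sltu for different values can share a clock */
			sum0 += v0;
			carry0 += sum0 < v0;
			sum1 += v1;
			carry1 += sum1 < v1;
		}
		/* merge the chains */
		carry0 += carry1;
		sum0 += sum1;
		carry0 += sum0 < sum1;
		sum0 += carry0;
		sum0 += sum0 < carry0;
		return sum0;
	}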

...
> > There are no post-index addressing modes on LoongArch:
> >       val = *mem;  // 64bit read
> >       mem++;
> >       sum += val;
> >       carry = sum < val;
> >       carry_sum += carry;
> > it takes 5 instructions, and each of these 5 instructions depends on the
> > previous one.

I'd assume the loop was unrolled enough so the address
increment doesn't matter.

> > Here is the piece of disassembled code:
>    120000d90:   28c001f0        ld.d    $r16,$r15,0
>    120000d94:   0010c58c        add.d   $r12,$r12,$r17
>    120000d98:   02c021ef        addi.d  $r15,$r15,8(0x8)

Those three instructions are independent.

>    120000d9c:   0010b20c        add.d   $r12,$r16,$r12

that one depends on the ld.d

>    120000da0:   0012c191        sltu    $r17,$r12,$r16

that depends on the add.d
but it could be executed after the 'bne', in parallel with the ld.d

>    120000da4:   5fffedf2        bne     $r15,$r18,-20(0x3ffec) # 120000d90 <do_csum_64+0x90>

If you tweak the code it is possible to get down to just
the addi.d and bne constraining the dependency chain.
(Assuming there is no delay on the read and there are an infinite
number of execution units.)
Unroll once and do:
	ld.d r,addr,0
	addi.d addr,16
	ld.d r,addr,-8
	bne addr,limit,loop_top
and you might get a loop that does a memory read every clock.

So you end up worrying about how the memory read delays affect
the instruction pipeline.
The Intel x86 cpu I've got just piles up the arithmetic instructions
waiting for the data to be read.
If you get a memory read requested every clock everything else
follows - provided you don't try to execute too many instructions
at once.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
Huacai Chen Feb. 10, 2023, 3:21 a.m. UTC | #4
This commit comes from the old internal kernel; I want to know which
one has better performance.

https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb

  
maobibo Feb. 10, 2023, 7:12 a.m. UTC | #5
On 2023/2/10 11:21, Huacai Chen wrote:
> This commit comes from the old internal kernel; I want to know which
> one has better performance.
> 
> https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb

There is no obvious performance difference between the asm csum_partial
function and the uint128 C code. Tested with buffer sizes 1500/4096, the
uint128 C code method is about 5% faster than the asm csum_partial.

regards
bibo, mao
  
maobibo Feb. 10, 2023, 10:06 a.m. UTC | #6
Using the test cases at
  https://github.com/bibo-mao/bench/tree/master/csum

I tested different buffer sizes (4096/1472/250/40); here is the output on my
LoongArch machine. The loop count is 0x100000, the time unit is microseconds,
and smaller values are better.

 
buf size[4096] loops[0x100000] times[us]: csum uint128 344473 asm method 373391 uint64 741412 
buf size[1472] loops[0x100000] times[us]: csum uint128 131849 asm method 138533 uint64 271317 
buf size[ 250] loops[0x100000] times[us]: csum uint128 34512 asm method 36294 uint64 51576 
buf size[  40] loops[0x100000] times[us]: csum uint128 12182 asm method 23874 uint64 15769 


Regards
Bibo, Mao

On 2023/2/10 11:21, Huacai Chen wrote:
> This commit comes from the old internal kernel; I want to know which
> one has better performance.
> 
> https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb
  
David Laight Feb. 10, 2023, 11:08 a.m. UTC | #7
From: maobibo
> Sent: 10 February 2023 10:06
> 
> Using the test cases at
>   https://github.com/bibo-mao/bench/tree/master/csum
> 
> I tested different buffer sizes (4096/1472/250/40); here is the output on my
> LoongArch machine. The loop count is 0x100000, the time unit is microseconds,
> and smaller values are better.
> 
> 
> buf size[4096] loops[0x100000] times[us]: csum uint128 344473 asm method 373391 uint64 741412
> buf size[1472] loops[0x100000] times[us]: csum uint128 131849 asm method 138533 uint64 271317
> buf size[ 250] loops[0x100000] times[us]: csum uint128 34512 asm method 36294 uint64 51576
> buf size[  40] loops[0x100000] times[us]: csum uint128 12182 asm method 23874 uint64 15769

What do those work out as in bytes/clock?

Rather than run 1000s of iterations (and be hit by interrupts etc)
I sometimes just use an accurate cycle counter and measure the
time for a single buffer (or varying length).
Save and print the values of 10 calls and you'll get pretty
consistent values after the first couple (cold cache).
Then you can work out how long each iteration of the main loop costs.
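
A sketch of that measurement style as a userspace harness (hypothetical:
do_csum here is the function from the patch, built separately and linked in;
clock_gettime() stands in for a raw cycle counter, and on LoongArch the
constant-rate counter behind rdtime.d would give finer-grained numbers):

	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	extern unsigned int do_csum(const unsigned char *buff, int len);

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
	}

	int main(void)
	{
		static unsigned char buf[4096];
		int i;

		/* print every call: the first couple run cold-cache,
		 * the rest should settle to consistent values */
		for (i = 0; i < 10; i++) {
			uint64_t t0 = now_ns();
			unsigned int sum = do_csum(buf, sizeof(buf));
			uint64_t t1 = now_ns();

			printf("call %d: %llu ns (sum=%x)\n", i,
			       (unsigned long long)(t1 - t0), sum);
		}
		return 0;
	}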

I think you have to execute 4 instructions for each 64bit word.
One memory read, the main add, a setle and the add of the carry.

For a simple cpu that is always going to be 4 clocks.
But if there are delay slots after the memory read you can
fill them with the alu instructions for an earlier read.
You also need an add and bne for the address for each iteration.
So unrolling the loop further will help.

OTOH if your cpu can execute multiple instructions in one clock
you can expect to do a lot better.
With 3 ALU instructions (and one read) you should be able to
find a code sequence that will run at 8 bytes/clock.
With 4 ALU it is likely that the loop instructions can also
execute in parallel - so you don't need massive loop unrolling.

Unless the cpu is massively 'out of order' (like some x86)
I'd expect the best code to interleave the reads and alu
operations for earlier values - rather than having all
the reads at the top of the loop.
So the loop would be a repeating pattern of instructions
with some values being carried between iterations.

I doubt you'll get a loop to execute every clock, but
a two clock loop is entirely possible.
It rather depends how fast the instruction decoder
handles the (predicted) branch.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
maobibo Feb. 10, 2023, 1:30 p.m. UTC | #8
On 2023/2/10 19:08, David Laight wrote:
> From: maobibo
>> Sent: 10 February 2023 10:06
>>
>> Using the test cases at
>>    https://github.com/bibo-mao/bench/tree/master/csum
>>
>> I tested different buffer sizes (4096/1472/250/40); here is the output on my
>> LoongArch machine. The loop count is 0x100000, the time unit is microseconds,
>> and smaller values are better.
>>
>>
>> buf size[4096] loops[0x100000] times[us]: csum uint128 344473 asm method 373391 uint64 741412
>> buf size[1472] loops[0x100000] times[us]: csum uint128 131849 asm method 138533 uint64 271317
>> buf size[ 250] loops[0x100000] times[us]: csum uint128 34512 asm method 36294 uint64 51576
>> buf size[  40] loops[0x100000] times[us]: csum uint128 12182 asm method 23874 uint64 15769
> 
> What do those work out as in bytes/clock?
> 
> Rather than run 1000s of iterations (and be hit by interrupts etc)
> I sometimes just use an accurate cycle counter and measure the
> time for a single buffer (or varying length).
> Save and print the values of 10 calls and you'll get pretty
> consistent values after the first couple (cold cache).
> Then you can work out how long each iteration of the main loop costs.
> 
The CPU frequency is 2 GHz; from the result for buffer size 4096, it is
about 5.8 bytes/clock. The frequency of the timestamp counter on LoongArch
is a constant 100 MHz, and there is no CPU cycle count register on the
platform.

> I think you have to execute 4 instructions for each 64bit word.
> One memory read, the main add, a setle and the add of the carry.
> 
> For a simple cpu that is always going to be 4 clocks.
> But if there are delay slots after the memory read you can
> fill them with the alu instructions for an earlier read.
> You also need an add and bne for the address for each iteration.
> So unrolling the loop further will help.
There are no delay slots on the LoongArch platform; yes, for an 8-byte
csum calculation at least 4 clocks should be used.

> 
> OTOH if your cpu can execute multiple instructions in one clock
> you can expect to do a lot better.
> With 3 ALU instructions (and one read) you should be able to
> find a code sequence that will run at 8 bytes/clock.
> With 4 ALU it is likely that the loop instructions can also
> execute in parallel - so you don't need massive loop unrolling.
> 
> Unless the cpu is massively 'out of order' (like some x86)
> I'd expect the best code to interleave the reads and alu
> operations for earlier values - rather than having all
> the reads at the top of the loop.
I do not think so. As in the memcpy asm function, the memory accesses are
grouped together; a memory access can stall if the data is not in the L1
cache the first time. The cache line will be loaded into the CPU read
buffer, and the next memory read within that cache line costs 1 cycle.
However, I will try the method of interleaving the reads and ALU operations.

> So the loop would be a repeating pattern of instructions
> with some values being carried between iterations.
> 
> I doubt you'll get a loop to execute every clock, but
> a two clock loop is entirely possible.
> It rather depends how fast the instruction decoder
> handles the (predicted) branch.
yes, branch misprediction is always expensive, and hard to control  :(

Regards
Bibo, Mao
  
maobibo Feb. 14, 2023, 1:31 a.m. UTC | #9
On 2023/2/10 19:08, David Laight wrote:
> Unless the cpu is massively 'out of order' (like some x86)
> I'd expect the best code to interleave the reads and alu
> operations for earlier values - rather than having all
> the reads at the top of the loop.
> So the loop would be a repeating pattern of instructions
> with some values being carried between iterations.
Part of the asm code at
https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb
depends on the previous instruction, such as the macro ADDC:
#define ADDC(sum,reg)                                           \
        ADD     sum, sum, reg;                                  \
        sltu    t8, sum, reg;                                   \
        ADD     sum, sum, t8;                                   \
these three instructions depend on each other, and cannot execute
in parallel.

The original main loop around .Lmove_128bytes is:
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        LOAD    _t0, src, (offset + UNIT(0));                   \
        LOAD    _t1, src, (offset + UNIT(1));                   \
        LOAD    _t2, src, (offset + UNIT(2));                   \
        LOAD    _t3, src, (offset + UNIT(3));                   \
        ADDC(_t0, _t1);                                         \
        ADDC(_t2, _t3);                                         \
        ADDC(sum, _t0);                                         \
        ADDC(sum, _t2)

.Lmove_128bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
        addi.d  t5, t5, -1
        addi.d  src, src, 0x80
        bnez    t5, .Lmove_128bytes

I modified the main loop with the label .Lmove_128bytes to reduce
the dependencies between instructions, like this, and it improves
the performance:
.Lmove_128bytes:
        LOAD    t0, src, 0
        LOAD    t1, src, 8
        LOAD    t3, src, 16
        LOAD    t4, src, 24
        LOAD    a3, src, 0 + 0x20
        LOAD    a4, src, 8 + 0x20
        LOAD    a5, src, 16 + 0x20
        LOAD    a6, src, 24 + 0x20
        ADD     t0, t0,  t1
        ADD     t3, t3,  t4
        ADD     a3, a3,  a4
        ADD     a5, a5,  a6
        sltu    t8, t0,  t1
        sltu    a7, t3,  t4
        ADD     t0, t0,  t8
        ADD     t3, t3,  a7
        sltu    t1, a3,  a4
        sltu    t4, a5,  a6
        ADD     a3, a3,  t1
        ADD     a5, a5,  t4
        ADD     t0, t0, t3
        ADD     a3, a3, a5
        sltu    t1, t0, t3
        sltu    t4, a3, a5
        ADD     t0, t0, t1
        ADD     a3, a3, t4
        ADD     sum, sum, t0
        sltu    t8,  sum, t0
        ADD     sum, sum,  t8
        ADD     sum, sum, a3
        sltu    t8,  sum, a3
        addi.d  t5, t5, -1
        ADD     sum, sum, t8

However, the result and principle are almost the same as the
uint128 C code. And there is no performance impact from interleaving
the reads and ALU operations.

Regards
Bibo, Mao

  
David Laight Feb. 14, 2023, 9:47 a.m. UTC | #10
From: maobibo
> Sent: 14 February 2023 01:31
...
> Part of the asm code at
> https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb
> depends on the previous instruction, such as the macro ADDC:
> #define ADDC(sum,reg)                                           \
>         ADD     sum, sum, reg;                                  \
>         sltu    t8, sum, reg;                                   \
>         ADD     sum, sum, t8;                                   \
> these three instructions depend on each other, and cannot execute
> in parallel.

Right, but you can add the carry bits into a different register.
Since the aim is 8 bytes/clock limited by 1 memory read/clock
you can (probably) manage with all the word adds going to one
register and all the carry adds to a second. So:
#define ADDC(carry, sum, reg) \
	add	sum, sum, reg	\
	sltu	reg, sum, reg	\
	add	carry, carry, reg

> 
> The original main loop around .Lmove_128bytes is:
> #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
>         LOAD    _t0, src, (offset + UNIT(0));                   \
>         LOAD    _t1, src, (offset + UNIT(1));                   \
>         LOAD    _t2, src, (offset + UNIT(2));                   \
>         LOAD    _t3, src, (offset + UNIT(3));                   \
>         ADDC(_t0, _t1);                                         \
>         ADDC(_t2, _t3);                                         \
>         ADDC(sum, _t0);                                         \
>         ADDC(sum, _t2)
> 
> .Lmove_128bytes:
>         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
>         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
>         CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
>         CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
>         addi.d  t5, t5, -1
>         addi.d  src, src, 0x80
>         bnez    t5, .Lmove_128bytes
> 
> I modified the main loop with the label .Lmove_128bytes to reduce
> the dependencies between instructions, like this, and it improves
> the performance:
> .Lmove_128bytes:
>         LOAD    t0, src, 0
>         LOAD    t1, src, 8
>         LOAD    t3, src, 16
>         LOAD    t4, src, 24
>         LOAD    a3, src, 0 + 0x20
>         LOAD    a4, src, 8 + 0x20
>         LOAD    a5, src, 16 + 0x20
>         LOAD    a6, src, 24 + 0x20
>         ADD     t0, t0,  t1
>         ADD     t3, t3,  t4
>         ADD     a3, a3,  a4
>         ADD     a5, a5,  a6
>         sltu    t8, t0,  t1
>         sltu    a7, t3,  t4
>         ADD     t0, t0,  t8
>         ADD     t3, t3,  a7
>         sltu    t1, a3,  a4
>         sltu    t4, a5,  a6
>         ADD     a3, a3,  t1
>         ADD     a5, a5,  t4
>         ADD     t0, t0, t3
>         ADD     a3, a3, a5
>         sltu    t1, t0, t3
>         sltu    t4, a3, a5
>         ADD     t0, t0, t1
>         ADD     a3, a3, t4
>         ADD     sum, sum, t0
>         sltu    t8,  sum, t0
>         ADD     sum, sum,  t8
>         ADD     sum, sum, a3
>         sltu    t8,  sum, a3
>         addi.d  t5, t5, -1
>         ADD     sum, sum, t8
> 
> However, the result and principle are almost the same as the
> uint128 C code. And there is no performance impact from interleaving
> the reads and ALU operations.

You are still relying on the 'out of order' logic to execute
ALU instructions while the memory reads are going on.
Try something like:
	complex setup :-)
loop:
	sltu	c0, sum, v0
	load	v0, src, 0
	add	sum, v1
	add	carry, c3

	sltu	c1, sum, v1
	load	v1, src, 8
	add	sum, v2
	add	carry, c0

	sltu	c2, sum, v2
	load	v2, src, 16
	addi	src, 32
	add	sum, v3
	add	carry, c1

	sltu	c3, sum, v3
	load	v3, src, 24
	add	sum, v0
	add	carry, c2
	bne	src, limit, loop

	complex finalise

The idea being that each group of instructions executes
in one clock - so the loop is 4 clocks.
The above code allows for 2 delay clocks on reads.
They may not be needed, in that case the above may run
at 8 bytes/clock with just 2 blocks of instructions.
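
A C rendering of that software-pipelined schedule, with the priming and
drain steps made explicit (illustrative only; assumes the length is a
non-zero multiple of 32 bytes and 8-byte alignment):

	#include <stdint.h>

	static uint64_t csum_pipelined(const uint64_t *src,
				       const uint64_t *limit)
	{
		uint64_t sum, carry = 0;
		uint64_t v0, v1, v2, v3, c0, c1, c2, c3 = 0;

		/* prime: load the first block and start the first add */
		v0 = src[0]; v1 = src[1]; v2 = src[2]; v3 = src[3];
		src += 4;
		sum = v0;

		/* each carry check runs one stage after its add, exactly
		 * as in the instruction groups above */
		while (src != limit) {
			c0 = sum < v0; v0 = src[0]; sum += v1; carry += c3;
			c1 = sum < v1; v1 = src[1]; sum += v2; carry += c0;
			c2 = sum < v2; v2 = src[2]; sum += v3; carry += c1;
			c3 = sum < v3; v3 = src[3]; sum += v0; carry += c2;
			src += 4;
		}

		/* drain: finish the in-flight adds and carry checks */
		c0 = sum < v0; sum += v1; carry += c3;
		c1 = sum < v1; sum += v2; carry += c0;
		c2 = sum < v2; sum += v3; carry += c1;
		c3 = sum < v3; carry += c2 + c3;

		sum += carry;
		sum += sum < carry;	/* end-around the final wrap */
		return sum;
	}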

You'd give the cpu a bit more leeway by using two sum and
carry registers.

I'd time the loop without worrying about the setup/finalise
code.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
maobibo Feb. 14, 2023, 2:19 p.m. UTC | #11
On 2023/2/14 17:47, David Laight wrote:
> From: maobibo
>> Sent: 14 February 2023 01:31
> ...
>> Part of the asm code at
>> https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb
>> depends on the previous instruction, such as the macro ADDC:
>> #define ADDC(sum,reg)                                           \
>>          ADD     sum, sum, reg;                                  \
>>          sltu    t8, sum, reg;                                   \
>>          ADD     sum, sum, t8;                                   \
>> these three instructions depend on each other, and cannot execute
>> in parallel.
> 
> Right, but you can add the carry bits into a different register.
> Since the aim is 8 bytes/clock limited by 1 memory read/clock
> you can (probably) manage with all the word adds going to one
> register and all the carry adds to a second. So:
> #define ADDC(carry, sum, reg) \
> 	add	sum, sum, reg	\
> 	sltu	reg, sum, reg	\
> 	add	carry, carry, reg
> 
> You are still relying on the 'out of order' logic to execute
> ALU instructions while the memory reads are going on.
> Try something like:
> 	complex setup :-)
> loop:
> 	sltu	c0, sum, v0
> 	load	v0, src, 0
> 	add	sum, v1
> 	add	carry, c3
> 
> 	sltu	c1, sum, v1
> 	load	v1, src, 8
> 	add	sum, v2
> 	add	carry, c0
> 
> 	sltu	c2, sum, v2
> 	load	v2, src, 16
> 	addi	src, 32
> 	add	sum, v3
> 	add	carry, c1
> 
> 	sltu	c3, sum, v3
> 	load	v3, src, 24
> 	add	sum, v0
> 	add	carry, c2
> 	bne	src, limit, loop
> 
> 	complex finalise
> 
> The idea being that each group of instructions executes
> in one clock - so the loop is 4 clocks.
> The above code allows for 2 delay clocks on reads.
> They may not be needed, in that case the above may run
> at 8 bytes/clock with just 2 blocks of instructions.
> 
> You'd give the cpu a bit more leeway by using two sum and
> carry registers.
Got it. It makes better use of the pipeline, rather than of the number of
ALUs on different micro-architectures. I will try this method; thanks again
for the kind help and patient explanation.

Regards
Bibo, mao

  
David Laight Feb. 16, 2023, 9:03 a.m. UTC | #12
From: maobibo
> Sent: 14 February 2023 14:19
...
> Got it. It makes better use of the pipeline, rather than of the number of
> ALUs on different micro-architectures. I will try this method; thanks again
> for the kind help and patient explanation.

It is also worth pointing out that if the cpu does 'out of order'
execution it may be just as good to just repeat blocks of:
	load	v0, addr, 0*8
	add	sum0, v0
	sltu	v0, sum0, v0
	add	carry0, v0

Assuming the prefetch/decode logic can predict the loop
and generate enough decoded instructions for all the alu units.

The add/sltu/add will be queued until the load completes
and then execute in the next three clocks.
The load for the next block will be scheduled as soon as
the load/store unit has finished processing the previous load.
So all the alu instructions just wait for the required input
to be available and a memory load executes every clock.

Multiple sum0 and carry0 registers aren't actually needed.
But having 2 of each (even if the loop is unrolled 4 times)
might help a bit.

If the cpu does 'register renaming' (as most x86 do) you
can use the same register name for 'v0' in all the blocks
(even though it is alive with multiple values).

But a simpler in-order multi-issue cpu will need you to
correctly interleave the instructions for maximum throughput.
It also does no harm for a very simple cpu that has delays
before a read value can be used.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
maobibo Feb. 16, 2023, 9:26 a.m. UTC | #13
On 2023/2/16 17:03, David Laight wrote:
> From: maobibo
>> Sent: 14 February 2023 14:19
> ...
>> Got it. It makes better use of the pipeline, rather than of the number of
>> ALUs on different micro-architectures. I will try this method; thanks again
>> for the kind help and patient explanation.
> 
> It is also worth pointing out that if the cpu does 'out of order'
> execution it may be just as good to just repeat blocks of:
> 	load	v0, addr, 0*8
> 	add	sum0, v0
> 	sltu	v0, sum0, v0
> 	add	carry0, v0
> 
It is strange that there is no performance improvement on my LoongArch
machine when interleaving ALU instructions before the loads; however, on an
x86 box the performance improvement is huge compared to the uint128 function.

Here is the piece of code:
        while (len > 48) {
                len -= 48;
                tmp4 = *(unsigned long *)ptr;
                if (tmp1 < tmp2)
                        tmp1 += 1;
                tmp3 += tmp4;

                tmp0 = *(unsigned long *)(ptr + 1);
                if (tmp3 < tmp4)
                        tmp3 += 1;
                sum64 += tmp0;

                tmp2 = *(unsigned long *)(ptr + 2);
                if (sum64 < tmp0)
                        sum64 += 1;
                tmp1 += tmp2;

                tmp4 = *(unsigned long *)(ptr + 3);
                if (tmp1 < tmp2)
                        tmp1 +=1;
                tmp3 += tmp4;

                tmp0 = *(unsigned long *)(ptr + 4);
                if (tmp3 < tmp4)
                        tmp3 +=1;
                sum64 += tmp0;

                tmp2 = *(unsigned long *)(ptr + 5);
                if (sum64 < tmp0)
                        sum64 += 1;
                tmp1 += tmp2;
                ptr += 6;
        }

Regards
Bibo, Mao
  

Patch

diff --git a/arch/loongarch/include/asm/checksum.h b/arch/loongarch/include/asm/checksum.h
new file mode 100644
index 000000000000..8a7d368d801d
--- /dev/null
+++ b/arch/loongarch/include/asm/checksum.h
@@ -0,0 +1,65 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2016 ARM Ltd.
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+#ifndef __ASM_CHECKSUM_H
+#define __ASM_CHECKSUM_H
+
+#include <linux/in6.h>
+
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
+
+/*
+ * turns a 32-bit partial checksum (e.g. from csum_partial) into a
+ * 1's complement 16-bit checksum.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+	u32 tmp = (__force u32)sum;
+
+	/*
+	 * swap the two 16-bit halves of sum
+	 * if there is a carry from adding the two 16-bit halves,
+	 * it will carry from the lower half into the upper half,
+	 * giving us the correct sum in the upper half.
+	 */
+	return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16);
+}
+#define csum_fold csum_fold
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.  ihl is the number
+ * of 32-bit words and is always >= 5.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	__uint128_t tmp;
+	u64 sum;
+	int n = ihl; /* we want it signed */
+
+	tmp = *(const __uint128_t *)iph;
+	iph += 16;
+	n -= 4;
+	tmp += ((tmp >> 64) | (tmp << 64));
+	sum = tmp >> 64;
+	do {
+		sum += *(const u32 *)iph;
+		iph += 4;
+	} while (--n > 0);
+
+	sum += ror64(sum, 32);
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+#define ip_fast_csum ip_fast_csum
+
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
+#include <asm-generic/checksum.h>
+
+#endif	/* __ASM_CHECKSUM_H */
diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
index 40bde632900f..6ba6df411f90 100644
--- a/arch/loongarch/lib/Makefile
+++ b/arch/loongarch/lib/Makefile
@@ -4,4 +4,4 @@ 
 #
 
 lib-y	+= delay.o memset.o memcpy.o memmove.o \
-	   clear_user.o copy_user.o dump_tlb.o unaligned.o
+	   clear_user.o copy_user.o dump_tlb.o unaligned.o csum.o
diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c
new file mode 100644
index 000000000000..0f7e3a5ce96a
--- /dev/null
+++ b/arch/loongarch/lib/csum.c
@@ -0,0 +1,142 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 Arm Ltd.
+
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <net/checksum.h>
+
+/* Looks dumb, but generates nice-ish code */
+static u64 accumulate(u64 sum, u64 data)
+{
+	__uint128_t tmp;
+
+	tmp = (__uint128_t)sum + data;
+	return tmp + (tmp >> 64);
+}
+
+/*
+ * We over-read the buffer and this makes KASAN unhappy. Instead, disable
+ * instrumentation and call kasan explicitly.
+ */
+unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift, sum;
+	const u64 *ptr;
+	u64 data, sum64 = 0;
+
+	if (unlikely(len == 0))
+		return 0;
+
+	offset = (unsigned long)buff & 7;
+	/*
+	 * This is to all intents and purposes safe, since rounding down cannot
+	 * result in a different page or cache line being accessed, and @buff
+	 * should absolutely not be pointing to anything read-sensitive. We do,
+	 * however, have to be careful not to piss off KASAN, which means using
+	 * unchecked reads to accommodate the head and tail, for which we'll
+	 * compensate with an explicit check up-front.
+	 */
+	kasan_check_read(buff, len);
+	ptr = (u64 *)(buff - offset);
+	len = len + offset - 8;
+
+	/*
+	 * Head: zero out any excess leading bytes. Shifting back by the same
+	 * amount should be at least as fast as any other way of handling the
+	 * odd/even alignment, and means we can ignore it until the very end.
+	 */
+	shift = offset * 8;
+	data = *ptr++;
+	data = (data >> shift) << shift;
+
+	/*
+	 * Body: straightforward aligned loads from here on (the paired loads
+	 * underlying the quadword type still only need dword alignment). The
+	 * main loop strictly excludes the tail, so the second loop will always
+	 * run at least once.
+	 */
+	while (unlikely(len > 64)) {
+		__uint128_t tmp1, tmp2, tmp3, tmp4;
+
+		tmp1 = *(__uint128_t *)ptr;
+		tmp2 = *(__uint128_t *)(ptr + 2);
+		tmp3 = *(__uint128_t *)(ptr + 4);
+		tmp4 = *(__uint128_t *)(ptr + 6);
+
+		len -= 64;
+		ptr += 8;
+
+		/* This is the "don't dump the carry flag into a GPR" idiom */
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | sum64;
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		sum64 = tmp1 >> 64;
+	}
+	while (len > 8) {
+		__uint128_t tmp;
+
+		sum64 = accumulate(sum64, data);
+		tmp = *(__uint128_t *)ptr;
+
+		len -= 16;
+		ptr += 2;
+
+		data = tmp >> 64;
+		sum64 = accumulate(sum64, tmp);
+	}
+	if (len > 0) {
+		sum64 = accumulate(sum64, data);
+		data = *ptr;
+		len -= 8;
+	}
+	/*
+	 * Tail: zero any over-read bytes similarly to the head, again
+	 * preserving odd/even alignment.
+	 */
+	shift = len * -8;
+	data = (data << shift) >> shift;
+	sum64 = accumulate(sum64, data);
+
+	/* Finally, folding */
+	sum64 += (sum64 >> 32) | (sum64 << 32);
+	sum = sum64 >> 32;
+	sum += (sum >> 16) | (sum << 16);
+	if (offset & 1)
+		return (u16)swab32(sum);
+
+	return sum >> 16;
+}
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	__uint128_t src, dst;
+	u64 sum = (__force u64)csum;
+
+	src = *(const __uint128_t *)saddr->s6_addr;
+	dst = *(const __uint128_t *)daddr->s6_addr;
+
+	sum += (__force u32)htonl(len);
+	sum += (u32)proto << 24;
+	src += (src >> 64) | (src << 64);
+	dst += (dst >> 64) | (dst << 64);
+
+	sum = accumulate(sum, src >> 64);
+	sum = accumulate(sum, dst >> 64);
+
+	sum += ((sum >> 32) | (sum << 32));
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);