x86/APX: optimize MOVBE

Message ID c04b9d63-5bea-44fa-95b2-ec5a40e997b7@suse.com
State Unresolved
Headers
Series x86/APX: optimize MOVBE |

Checks

Context Check Description
snail/binutils-gdb-check warning Git am fail log

Commit Message

Jan Beulich Jan. 12, 2024, noon UTC
  With identical source and destination, MOVBE can be covered by the NDD-to-
legacy conversion logic as well, even though in this case the original insn
doesn't use an NDD encoding. The size savings are even better here, because
the replacement (BSWAP) has no ModR/M byte.
  

Comments

Jan Beulich Jan. 12, 2024, 12:28 p.m. UTC | #1
On 12.01.2024 13:00, Jan Beulich wrote:
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -210,6 +210,9 @@ mov, 0xf24, i386&No64, D|RegMem|IgnoreSi
>  // Move after swapping the bytes
>  movbe, 0x0f38f0, Movbe, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  movbe, 0x60, Movbe&APX_F, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
> +// This really is BSWAP, but wants "duplicating" here for easy EVEX -> REX2
> +// conversion, restarting with the next sequential template.
> +movbe, 0xfc8, Movbe&APX_F, No_bSuf|No_wSuf|No_sSuf, { Reg32|Reg64 }

Thinking of it - while not the end of the world, it would certainly be nice
if this new template wasn't available for "ordinary" use. I'll see if I can
come up with something.

Jan
  
Hu, Lin1 Jan. 16, 2024, 2:01 a.m. UTC | #2
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Friday, January 12, 2024 8:28 PM
> To: Binutils <binutils@sourceware.org>
> Cc: H.J. Lu <hjl.tools@gmail.com>; Hu, Lin1 <lin1.hu@intel.com>
> Subject: Re: [PATCH] x86/APX: optimize MOVBE
> 
> On 12.01.2024 13:00, Jan Beulich wrote:
> > --- a/opcodes/i386-opc.tbl
> > +++ b/opcodes/i386-opc.tbl
> > @@ -210,6 +210,9 @@ mov, 0xf24, i386&No64, D|RegMem|IgnoreSi
> >  // Move after swapping the bytes
> >  movbe, 0x0f38f0, Movbe, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
> >  movbe, 0x60, Movbe&APX_F, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
> > +// This really is BSWAP, but wants "duplicating" here for easy EVEX -> REX2
> > +// conversion, restarting with the next sequential template.
> > +movbe, 0xfc8, Movbe&APX_F, No_bSuf|No_wSuf|No_sSuf, { Reg32|Reg64 }
> 
> Thinking of it - while not the end of the world, it would certainly be nice if this
> new template wasn't available for "ordinary" use. I'll see if I can come up with
> something.
> 

Maybe adding a flag to _i386_insn and a new attribute in the .tbl file would solve the problem. Alternatively, we could give the template an already existing attribute that prevents its "ordinary" use, and then change some value after optimization so that the template matches. The rest of the patch looks OK to me.

BRs,
Lin
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -7872,7 +7872,8 @@  match_template (char mnem_suffix)
 	  && t + 1 < current_templates.end
 	  && !t[1].opcode_modifier.evex
 	  && t[1].opcode_space <= SPACE_0F38
-	  && t->opcode_modifier.vexvvvv == VexVVVV_DST
+	  && (t->opcode_modifier.vexvvvv == VexVVVV_DST
+	      || t->mnem_off == MN_movbe)
 	  && (i.types[i.operands - 1].bitfield.dword
 	      || i.types[i.operands - 1].bitfield.qword))
 	{
--- a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
@@ -118,6 +118,9 @@  Disassembly of section .text:
 \s*[a-f0-9]+:\s*67 0f 4d 90 90 90 90 90 	cmovge -0x6f6f6f70\(%eax\),%edx
 \s*[a-f0-9]+:\s*67 0f 4e 90 90 90 90 90 	cmovle -0x6f6f6f70\(%eax\),%edx
 \s*[a-f0-9]+:\s*67 0f 4f 90 90 90 90 90 	cmovg  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*62 f4 7d 08 60 c0    	movbe  %ax,%ax
+\s*[a-f0-9]+:\s*49 0f c8             	bswap  %r8
+\s*[a-f0-9]+:\s*d5 98 c8             	bswap  %r16
 \s*[a-f0-9]+:\s*66 0f 38 f6 c3       	adcx   %ebx,%eax
 \s*[a-f0-9]+:\s*66 0f 38 f6 c3       	adcx   %ebx,%eax
 \s*[a-f0-9]+:\s*62 f4 fd 18 66 c3    	adcx   %rbx,%rax,%rax
--- a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
@@ -111,6 +111,9 @@  cmovl  0x90909090(%eax),%edx,%edx
 cmovge 0x90909090(%eax),%edx,%edx
 cmovle 0x90909090(%eax),%edx,%edx
 cmovg  0x90909090(%eax),%edx,%edx
+movbe  %ax,%ax
+movbe  %r8,%r8
+movbe  %r16,%r16
 adcx   %ebx,%eax,%eax
 adcx   %eax,%ebx,%eax
 adcx   %rbx,%rax,%rax
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -210,6 +210,9 @@  mov, 0xf24, i386&No64, D|RegMem|IgnoreSi
 // Move after swapping the bytes
 movbe, 0x0f38f0, Movbe, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 movbe, 0x60, Movbe&APX_F, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
+// This really is BSWAP, but wants "duplicating" here for easy EVEX -> REX2
+// conversion, restarting with the next sequential template.
+movbe, 0xfc8, Movbe&APX_F, No_bSuf|No_wSuf|No_sSuf, { Reg32|Reg64 }
 
 // Move with sign extend.
 movsb, 0xfbe, i386, Modrm|No_bSuf|No_sSuf, { Reg8|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }