diff mbox series

[v3,8/9] Support APX NDD optimized encoding.

Message ID	20231124070213.3886483-8-lili.cui@intel.com
State	Unresolved
Headers	Received-SPF: pass (google.com: domain of binutils-bounces+ouuuleilei=gmail.com@sourceware.org designates 8.43.85.97 as permitted sender) client-ip=8.43.85.97; DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 7B1D8385829E From: "Cui, Lili" <lili.cui@intel.com> To: binutils@sourceware.org Cc: jbeulich@suse.com, hongjiu.lu@intel.com, "Hu, Lin1" <lin1.hu@intel.com> Subject: [PATCH v3 8/9] Support APX NDD optimized encoding. Date: Fri, 24 Nov 2023 07:02:12 +0000 Message-Id: <20231124070213.3886483-8-lili.cui@intel.com> In-Reply-To: <20231124070213.3886483-1-lili.cui@intel.com> References: <20231124070213.3886483-1-lili.cui@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: list Errors-To: binutils-bounces+ouuuleilei=gmail.com@sourceware.org X-getmail-retrieved-from-mailbox: INBOX
Series	[1/9] Make const_1_mode print $1 in AT&T syntax \| [1/9] Make const_1_mode print $1 in AT&T syntax [v3,2/9] Support APX GPR32 with rex2 prefix [v3,4/9] Support APX GPR32 with extend evex prefix [v3,5/9] Add tests for APX GPR32 with extend evex prefix [v3,6/9] Support APX NDD [v3,7/9] Support APX Push2/Pop2 [v3,8/9] Support APX NDD optimized encoding. [v3,9/9] Support APX JMPABS for disassembler

Checks

Context	Check	Description
snail/binutils-gdb-check	warning	Git am fail log

Commit Message

Cui, Lili Nov. 24, 2023, 7:02 a.m. UTC

  From: "Hu, Lin1" <lin1.hu@intel.com>

This patch aims to optimize:

add %r16, %r15, %r15 -> add %r16, %r15

gas/ChangeLog:

	* config/tc-i386.c (check_RexOperands): New function.
	(can_convert_NDD_to_legacy): Ditto.
	(match_template): If we can optimzie APX NDD insns, so rematch
	template.
	* testsuite/gas/i386/x86-64.exp: Add test.
	* testsuite/gas/i386/x86-64-apx-ndd-optimize.d: New test.
	* testsuite/gas/i386/x86-64-apx-ndd-optimize.s: Ditto.
---
 gas/config/tc-i386.c                          | 107 ++++++++++++++
 .../gas/i386/x86-64-apx-ndd-optimize.d        | 130 ++++++++++++++++++
 .../gas/i386/x86-64-apx-ndd-optimize.s        | 123 +++++++++++++++++
 gas/testsuite/gas/i386/x86-64.exp             |   1 +
 4 files changed, 361 insertions(+)
 create mode 100644 gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
 create mode 100644 gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s

Comments

Jan Beulich Dec. 11, 2023, 12:27 p.m. UTC | #1

On 24.11.2023 08:02, Cui, Lili wrote:
> --- a/gas/config/tc-i386.c
> +++ b/gas/config/tc-i386.c
> @@ -7148,6 +7148,58 @@ check_APX_operands (const insn_template *t)
>    return 0;
>  }
>  
> +/* Check if the instruction use the REX registers.  */
> +static bool
> +check_RexOperands ()
> +{
> +  for (unsigned int op = 0; op < i.operands; op++)
> +    {
> +      if (i.types[op].bitfield.class != Reg)
> +	continue;
> +
> +      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
> +	return true;
> +    }
> +
> +  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
> +      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
> +    return true;
> +
> +  /* Check pseudo prefix {rex} are valid.  */
> +  return i.rex_encoding;

Can this actually happen, when we're converting from EVEX to legacy?
(Initially I wanted to ask about "rex" and alike prefixes, i.e. the non-
pseudo ones.)

> +}
> +
> +/* Optimize APX NDD insns to legacy insns.  */
> +static unsigned int
> +can_convert_NDD_to_legacy (const insn_template *t)
> +{
> +  unsigned int match_dest_op = ~0;
> +
> +  if (t->opcode_modifier.vexvvvv == VexVVVV_DST

No new callers are expected to appear (any time soon) and the sole caller
has checked this already.

Also with this check, ...

> +      && t->opcode_space == SPACE_EVEXMAP4

... what (further) effect is this one intended to have?

> +      && !i.has_nf
> +      && i.reg_operands >= 2)
> +    {
> +      unsigned int dest = i.operands - 1;
> +      unsigned int src1 = i.operands - 2;
> +      unsigned int src2 = (i.operands > 3) ? i.operands - 3 : 0;
> +
> +      if (i.types[src1].bitfield.class == Reg
> +	  && i.op[src1].regs == i.op[dest].regs)
> +	match_dest_op = src1;
> +      /* If the first operand is the same as the third operand,
> +	 these instructions need to support the ability to commutative
> +	 the first two operands and still not change the semantics in order
> +	 to be optimized.  */
> +      else if (i.types[src2].bitfield.class == Reg
> +	       && i.op[src2].regs == i.op[dest].regs
> +	       && optimize > 1
> +	       && t->opcode_modifier.commutative)

Based on the "cheap conditions first" principle and to also be better in
line with the comment, may I suggest

+      else if (optimize > 1
+	       && t->opcode_modifier.commutative
+	       && i.types[src2].bitfield.class == Reg
+	       && i.op[src2].regs == i.op[dest].regs)

?

> +	match_dest_op = src2;
> +    }
> +  return match_dest_op;
> +}
> +
>  /* Helper function for the progress() macro in match_template().  */
>  static INLINE enum i386_error progress (enum i386_error new,
>  					enum i386_error last,
> @@ -7675,6 +7727,61 @@ match_template (char mnem_suffix)
>  	  i.memshift = memshift;
>  	}
>  
> +      /* If we can optimize a NDD insn to legacy insn, like
> +	 add %r16, %r8, %r8 -> add %r16, %r8,
> +	 add  %r8, %r16, %r8 -> add %r16, %r8, then rematch template.
> +	 Note that the semantics have not been changed.  */
> +      if (optimize
> +	  && !i.no_optimize
> +	  && i.vec_encoding != vex_encoding_evex
> +	  && t + 1 < current_templates->end
> +	  && !t[1].opcode_modifier.evex
> +	  && t[1].opcode_space <= SPACE_0F38
> +	  && t->opcode_modifier.vexvvvv == VexVVVV_DST)
> +	{
> +	  unsigned int match_dest_op = can_convert_NDD_to_legacy (t);
> +	  size_match = true;

This would perhaps better ...

> +	  if (match_dest_op != (unsigned int) ~0)
> +	    {

... live here

> +	      /* We ensure that the next template has the same input
> +		 operands as the original matching template by the first
> +		 opernd (ATT), thus avoiding the error caused by the wrong order
> +		 of insns in i386.tbl.  */

I'm sorry, but I (still) can't make sense of this last part of the comment,
after the comma.

> +	      overlap0 = operand_type_and (i.types[0],
> +					   t[1].operand_types[0]);
> +	      if (t->opcode_modifier.d)
> +		overlap1 = operand_type_and (i.types[0],
> +					     t[1].operand_types[1]);
> +	      if (!operand_type_match (overlap0, i.types[0])
> +		  && (!t->opcode_modifier.d
> +		      || (t->opcode_modifier.d
> +			  && !operand_type_match (overlap1, i.types[0]))))

What's wrong with the simpler

		  && (!t->opcode_modifier.d
		      || !operand_type_match (overlap1, i.types[0])))

?

> +		size_match = false;

Yet still, and despite the improved comment, I don't really see what all of
this is about. What cases would be mis-handled if this wasn't there?

> +	      if (size_match
> +		  /* Optimizing some non-legacy-map0/1 without REX/REX2 prefix will be valuable.  */
> +		  && (t[1].opcode_space <= SPACE_0F

Where a comment is placed is meaningful to understanding what it's about. The
wayy you have it, is says "non-legacy-map0/1" on a check that the (next)
encoding is map0 or map1. I think this wants moving down by a line, and even
then also re-wording: If I didn't (vaguely) recall context, I don't think I
could derive what is meant. Iirc this is about legacy encodings being one
byte shorter for certain 0f38 space insns when they don't require a REX
prefix to encode. How about something like "Some non-legacy-map0/1 insns can
be shorter when legacy-encoded and when no REX prefix is required"?

> +		      || (!check_EgprOperands (t + 1)
> +			  && !check_RexOperands ()

I'm not going to insist that you adjust this, but these two calls side by
side demonstrate a curious inconsistency: The former requires t to be passed
in. If you keep it like that, I may change this down the road, the more that
the t-related aspect isn't relevant here at all (and could hence be moved
out of the function to the single place where it is needed).

> +			  && !i.op[i.operands - 1].regs->reg_type.bitfield.qword)))
> +		{
> +		  unsigned int src1 = i.operands - 2;
> +		  unsigned int src2 = (i.operands > 3) ? i.operands - 3 : 0;
> +
> +		  if (match_dest_op == src2)
> +		    swap_2_operands (match_dest_op, src1);

Isn't it wrong (albeit benign) to swap when i.operands == 2? IOW wouldn't

		  if (i.reg_operands > 2 && match_dest_op == i.operands - 3)
		    swap_2_operands (match_dest_op, i.operands - 2);

be more in line with what's actually wanted?

> --- /dev/null
> +++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
> @@ -0,0 +1,123 @@
> +# Check 64bit APX NDD instructions with optimized encoding
> +
> +	.text
> +_start:
> +add    %r31,%r8,%r8
> +addb   %r31b,%r8b,%r8b
> +{store} add    %r31,%r8,%r8
> +{load}  add    %r31,%r8,%r8
> +add    %r31,(%r8),%r31
> +add    (%r31),%r8,%r8
> +add    $0x12344433,%r15,%r15
> +add    $0xfffffffff4332211,%r8,%r8
> +inc    %r31,%r31
> +incb   %r31b,%r31b
> +sub    %r15,%r17,%r17
> +subb   %r15b,%r17b,%r17b
> +sub    %r15,(%r8),%r15
> +sub    (%r15,%rax,1),%r16,%r16
> +sub    $0x1234,%r30,%r30
> +dec    %r17,%r17
> +decb   %r17b,%r17b
> +sbb    %r15,%r17,%r17
> +sbbb   %r15b,%r17b,%r17b
> +sbb    %r15,(%r8),%r15
> +sbb    (%r15,%rax,1),%r16,%r16
> +sbb    $0x1234,%r30,%r30
> +and    %r15,%r17,%r17
> +andb   %r15b,%r17b,%r17b
> +and    %r15,(%r8),%r15
> +and    (%r15,%rax,1),%r16,%r16
> +and    $0x1234,%r30,%r30
> +or     %r15,%r17,%r17
> +orb    %r15b,%r17b,%r17b
> +or     %r15,(%r8),%r15
> +or     (%r15,%rax,1),%r16,%r16
> +or     $0x1234,%r30,%r30
> +xor    %r15,%r17,%r17
> +xorb   %r15b,%r17b,%r17b
> +xor    %r15,(%r8),%r15
> +xor    (%r15,%rax,1),%r16,%r16
> +xor    $0x1234,%r30,%r30
> +adc    %r15,%r17,%r17
> +adcb   %r15b,%r17b,%r17b
> +adc    %r15,(%r8),%r15
> +adc    (%r15,%rax,1),%r16,%r16
> +adc    $0x1234,%r30,%r30
> +neg    %r17,%r17
> +negb   %r17b,%r17b
> +not    %r17,%r17
> +notb   %r17b,%r17b
> +imul   0x90909(%eax),%edx,%edx
> +imul   0x909(%rax,%r31,8),%rdx,%rdx
> +imul   %rdx,%rax,%rdx
> +rol    %r31,%r31
> +rolb   %r31b,%r31b
> +rol    $0x2,%r12,%r12
> +rolb   $0x2,%r12b,%r12b
> +ror    %r31,%r31
> +rorb   %r31b,%r31b
> +ror    $0x2,%r12,%r12
> +rorb   $0x2,%r12b,%r12b
> +rcl    %r31,%r31
> +rclb   %r31b,%r31b
> +rcl    $0x2,%r12,%r12
> +rclb   $0x2,%r12b,%r12b
> +rcr    %r31,%r31
> +rcrb   %r31b,%r31b
> +rcr    $0x2,%r12,%r12
> +rcrb   $0x2,%r12b,%r12b
> +sal    %r31,%r31
> +salb   %r31b,%r31b
> +sal    $0x2,%r12,%r12
> +salb   $0x2,%r12b,%r12b
> +shl    %r31,%r31
> +shlb   %r31b,%r31b
> +shl    $0x2,%r12,%r12
> +shlb   $0x2,%r12b,%r12b
> +shr    %r31,%r31
> +shrb   %r31b,%r31b
> +shr    $0x2,%r12,%r12
> +shrb   $0x2,%r12b,%r12b
> +sar    %r31,%r31
> +sarb   %r31b,%r31b
> +sar    $0x2,%r12,%r12
> +sarb   $0x2,%r12b,%r12b
> +shld   $0x1,%r12,(%rax),%r12
> +shld   $0x2,%r8,%r12,%r12
> +shld   $0x2,%r8,%r12,%r8
> +shld   %cl,%r9,(%rax),%r9
> +shld   %cl,%r12,%r16,%r16
> +shld   %cl,%r12,%r16,%r12
> +shrd   $0x1,%r12,(%rax),%r12
> +shrd   $0x1,%r13,%r12,%r12
> +shrd   $0x1,%r13,%r12,%r13
> +shrd   %cl,%r9,(%rax),%r9
> +shrd   %cl,%r12,%r16,%r16
> +shrd   %cl,%r12,%r16,%r12
> +cmovo  0x90909090(%eax),%edx,%edx
> +cmovno 0x90909090(%eax),%edx,%edx
> +cmovb  0x90909090(%eax),%edx,%edx
> +cmovae 0x90909090(%eax),%edx,%edx
> +cmove  0x90909090(%eax),%edx,%edx
> +cmovne 0x90909090(%eax),%edx,%edx
> +cmovbe 0x90909090(%eax),%edx,%edx
> +cmova  0x90909090(%eax),%edx,%edx
> +cmovs  0x90909090(%eax),%edx,%edx
> +cmovns 0x90909090(%eax),%edx,%edx
> +cmovp  0x90909090(%eax),%edx,%edx
> +cmovnp 0x90909090(%eax),%edx,%edx
> +cmovl  0x90909090(%eax),%edx,%edx
> +cmovge 0x90909090(%eax),%edx,%edx
> +cmovle 0x90909090(%eax),%edx,%edx
> +cmovg  0x90909090(%eax),%edx,%edx
> +adcx   %ebx,%eax,%eax
> +adcx   %eax,%ebx,%eax
> +adcx   %rbx,%rax,%rax
> +adcx   %r15,%r8,%r8

Might this better be

adcx   %r15d,%r8d,%r8d

to avoid having two exclusion criteria (REX register use and REX.W set)?
Or maybe even split to further separate source and destination:

adcx   %eax,%r8d,%r8d
adcx   %r15d,%eax,%eax

?

> +adcx   (%edx,%ecx,1),%eax,%eax
> +adox   %ebx,%eax,%eax
> +adox   %eax,%ebx,%eax
> +adox   %rbx,%rax,%rax
> +adox   %r15,%r8,%r8
> +adox   (%edx,%ecx,1),%eax,%eax

Same here then.

Jan

Hu, Lin1 Dec. 12, 2023, 3:18 a.m. UTC | #2

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Monday, December 11, 2023 8:28 PM
> To: Cui, Lili <lili.cui@intel.com>
> Cc: Lu, Hongjiu <hongjiu.lu@intel.com>; Hu, Lin1 <lin1.hu@intel.com>;
> binutils@sourceware.org
> Subject: Re: [PATCH v3 8/9] Support APX NDD optimized encoding.
> 
> On 24.11.2023 08:02, Cui, Lili wrote:
> > --- a/gas/config/tc-i386.c
> > +++ b/gas/config/tc-i386.c
> > @@ -7148,6 +7148,58 @@ check_APX_operands (const insn_template *t)
> >    return 0;
> >  }
> >
> > +/* Check if the instruction use the REX registers.  */ static bool
> > +check_RexOperands () {
> > +  for (unsigned int op = 0; op < i.operands; op++)
> > +    {
> > +      if (i.types[op].bitfield.class != Reg)
> > +	continue;
> > +
> > +      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
> > +	return true;
> > +    }
> > +
> > +  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
> > +      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
> > +    return true;
> > +
> > +  /* Check pseudo prefix {rex} are valid.  */  return i.rex_encoding;
> 
> Can this actually happen, when we're converting from EVEX to legacy?
> (Initially I wanted to ask about "rex" and alike prefixes, i.e. the non- pseudo
> ones.)
>

This is to align with check_EgprOperands. I hope the function be more general. Not just for this optimization problem.
 
>
> > +}
> > +
> > +/* Optimize APX NDD insns to legacy insns.  */ static unsigned int
> > +can_convert_NDD_to_legacy (const insn_template *t) {
> > +  unsigned int match_dest_op = ~0;
> > +
> > +  if (t->opcode_modifier.vexvvvv == VexVVVV_DST
> 
> No new callers are expected to appear (any time soon) and the sole caller has
> checked this already.
>

I will remove the condition.
 
>
> Also with this check, ...
> 
> > +      && t->opcode_space == SPACE_EVEXMAP4
> 
> ... what (further) effect is this one intended to have?
>

At first it was because of map4+vexvvvvv in order to locate the instructions that needed to be optimized. It seems that it's useless now. I will remove it.
 
>
> > +      && !i.has_nf
> > +      && i.reg_operands >= 2)
> > +    {
> > +      unsigned int dest = i.operands - 1;
> > +      unsigned int src1 = i.operands - 2;
> > +      unsigned int src2 = (i.operands > 3) ? i.operands - 3 : 0;
> > +
> > +      if (i.types[src1].bitfield.class == Reg
> > +	  && i.op[src1].regs == i.op[dest].regs)
> > +	match_dest_op = src1;
> > +      /* If the first operand is the same as the third operand,
> > +	 these instructions need to support the ability to commutative
> > +	 the first two operands and still not change the semantics in order
> > +	 to be optimized.  */
> > +      else if (i.types[src2].bitfield.class == Reg
> > +	       && i.op[src2].regs == i.op[dest].regs
> > +	       && optimize > 1
> > +	       && t->opcode_modifier.commutative)
> 
> Based on the "cheap conditions first" principle and to also be better in line with
> the comment, may I suggest
> 
> +      else if (optimize > 1
> +	       && t->opcode_modifier.commutative
> +	       && i.types[src2].bitfield.class == Reg
> +	       && i.op[src2].regs == i.op[dest].regs)
> 
> ?
>

Thanks your advice, I have modified.

> 
> > +	match_dest_op = src2;
> > +    }
> > +  return match_dest_op;
> > +}
> > +
> >  /* Helper function for the progress() macro in match_template().  */
> > static INLINE enum i386_error progress (enum i386_error new,
> >  					enum i386_error last,
> > @@ -7675,6 +7727,61 @@ match_template (char mnem_suffix)
> >  	  i.memshift = memshift;
> >  	}
> >
> > +      /* If we can optimize a NDD insn to legacy insn, like
> > +	 add %r16, %r8, %r8 -> add %r16, %r8,
> > +	 add  %r8, %r16, %r8 -> add %r16, %r8, then rematch template.
> > +	 Note that the semantics have not been changed.  */
> > +      if (optimize
> > +	  && !i.no_optimize
> > +	  && i.vec_encoding != vex_encoding_evex
> > +	  && t + 1 < current_templates->end
> > +	  && !t[1].opcode_modifier.evex
> > +	  && t[1].opcode_space <= SPACE_0F38
> > +	  && t->opcode_modifier.vexvvvv == VexVVVV_DST)
> > +	{
> > +	  unsigned int match_dest_op = can_convert_NDD_to_legacy (t);
> > +	  size_match = true;
> 
> This would perhaps better ...
> 
> > +	  if (match_dest_op != (unsigned int) ~0)
> > +	    {
> 
> ... live here
>

OK.
 
>
> > +	      /* We ensure that the next template has the same input
> > +		 operands as the original matching template by the first
> > +		 opernd (ATT), thus avoiding the error caused by the wrong
> order
> > +		 of insns in i386.tbl.  */
> 
> I'm sorry, but I (still) can't make sense of this last part of the comment, after the
> comma.
>

I mean if someone support new NDD insns and put it in the wrong position, so the part will try to avoid to optimize the insn.
 
>
> > +	      overlap0 = operand_type_and (i.types[0],
> > +					   t[1].operand_types[0]);
> > +	      if (t->opcode_modifier.d)
> > +		overlap1 = operand_type_and (i.types[0],
> > +					     t[1].operand_types[1]);
> > +	      if (!operand_type_match (overlap0, i.types[0])
> > +		  && (!t->opcode_modifier.d
> > +		      || (t->opcode_modifier.d
> > +			  && !operand_type_match (overlap1, i.types[0]))))
> 
> What's wrong with the simpler
> 
> 		  && (!t->opcode_modifier.d
> 		      || !operand_type_match (overlap1, i.types[0])))
> 
> ?
>

Yes, the simpler is useful, I have modified these conditions.
 
>
> > +		size_match = false;
> 
> Yet still, and despite the improved comment, I don't really see what all of this is
> about. What cases would be mis-handled if this wasn't there?
>

Not currently. It's for someone support new NDD instructions with wrong order. My idea is to prefer not to optimize, but try to avoid reporting error.
 
>
> > +	      if (size_match
> > +		  /* Optimizing some non-legacy-map0/1 without REX/REX2
> prefix will be valuable.  */
> > +		  && (t[1].opcode_space <= SPACE_0F
> 
> Where a comment is placed is meaningful to understanding what it's about. The
> wayy you have it, is says "non-legacy-map0/1" on a check that the (next)
> encoding is map0 or map1. I think this wants moving down by a line, and even
> then also re-wording: If I didn't (vaguely) recall context, I don't think I could
> derive what is meant. Iirc this is about legacy encodings being one byte shorter
> for certain 0f38 space insns when they don't require a REX prefix to encode.
> How about something like "Some non-legacy-map0/1 insns can be shorter when
> legacy-encoded and when no REX prefix is required"?
>

OK, I have modified. 
 
>
> > +		      || (!check_EgprOperands (t + 1)
> > +			  && !check_RexOperands ()
> 
> I'm not going to insist that you adjust this, but these two calls side by side
> demonstrate a curious inconsistency: The former requires t to be passed in. If
> you keep it like that, I may change this down the road, the more that the t-
> related aspect isn't relevant here at all (and could hence be moved out of the
> function to the single place where it is needed).
>

I will keep the part of code. I prefer to modified check_EgprOperands in another patch.
 
>
> > +			  && !i.op[i.operands - 1].regs-
> >reg_type.bitfield.qword)))
> > +		{
> > +		  unsigned int src1 = i.operands - 2;
> > +		  unsigned int src2 = (i.operands > 3) ? i.operands - 3 : 0;
> > +
> > +		  if (match_dest_op == src2)
> > +		    swap_2_operands (match_dest_op, src1);
> 
> Isn't it wrong (albeit benign) to swap when i.operands == 2? IOW wouldn't
> 
> 		  if (i.reg_operands > 2 && match_dest_op == i.operands - 3)
> 		    swap_2_operands (match_dest_op, i.operands - 2);
> 
> be more in line with what's actually wanted?
> 

Because some insn use memory (like add %r8, (%rsp), %r8), so I change the code be
		  if (i.operands > 2 && match_dest_op == i.operands - 3)
		    swap_2_operands (match_dest_op, i.operands - 2);

>
> > --- /dev/null
> > +++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
> > @@ -0,0 +1,123 @@
> > +# Check 64bit APX NDD instructions with optimized encoding
> > +
> > +	.text
> > +_start:
> > +add    %r31,%r8,%r8
> > +addb   %r31b,%r8b,%r8b
> > +{store} add    %r31,%r8,%r8
> > +{load}  add    %r31,%r8,%r8
> > +add    %r31,(%r8),%r31
> > +add    (%r31),%r8,%r8
> > +add    $0x12344433,%r15,%r15
> > +add    $0xfffffffff4332211,%r8,%r8
> > +inc    %r31,%r31
> > +incb   %r31b,%r31b
> > +sub    %r15,%r17,%r17
> > +subb   %r15b,%r17b,%r17b
> > +sub    %r15,(%r8),%r15
> > +sub    (%r15,%rax,1),%r16,%r16
> > +sub    $0x1234,%r30,%r30
> > +dec    %r17,%r17
> > +decb   %r17b,%r17b
> > +sbb    %r15,%r17,%r17
> > +sbbb   %r15b,%r17b,%r17b
> > +sbb    %r15,(%r8),%r15
> > +sbb    (%r15,%rax,1),%r16,%r16
> > +sbb    $0x1234,%r30,%r30
> > +and    %r15,%r17,%r17
> > +andb   %r15b,%r17b,%r17b
> > +and    %r15,(%r8),%r15
> > +and    (%r15,%rax,1),%r16,%r16
> > +and    $0x1234,%r30,%r30
> > +or     %r15,%r17,%r17
> > +orb    %r15b,%r17b,%r17b
> > +or     %r15,(%r8),%r15
> > +or     (%r15,%rax,1),%r16,%r16
> > +or     $0x1234,%r30,%r30
> > +xor    %r15,%r17,%r17
> > +xorb   %r15b,%r17b,%r17b
> > +xor    %r15,(%r8),%r15
> > +xor    (%r15,%rax,1),%r16,%r16
> > +xor    $0x1234,%r30,%r30
> > +adc    %r15,%r17,%r17
> > +adcb   %r15b,%r17b,%r17b
> > +adc    %r15,(%r8),%r15
> > +adc    (%r15,%rax,1),%r16,%r16
> > +adc    $0x1234,%r30,%r30
> > +neg    %r17,%r17
> > +negb   %r17b,%r17b
> > +not    %r17,%r17
> > +notb   %r17b,%r17b
> > +imul   0x90909(%eax),%edx,%edx
> > +imul   0x909(%rax,%r31,8),%rdx,%rdx
> > +imul   %rdx,%rax,%rdx
> > +rol    %r31,%r31
> > +rolb   %r31b,%r31b
> > +rol    $0x2,%r12,%r12
> > +rolb   $0x2,%r12b,%r12b
> > +ror    %r31,%r31
> > +rorb   %r31b,%r31b
> > +ror    $0x2,%r12,%r12
> > +rorb   $0x2,%r12b,%r12b
> > +rcl    %r31,%r31
> > +rclb   %r31b,%r31b
> > +rcl    $0x2,%r12,%r12
> > +rclb   $0x2,%r12b,%r12b
> > +rcr    %r31,%r31
> > +rcrb   %r31b,%r31b
> > +rcr    $0x2,%r12,%r12
> > +rcrb   $0x2,%r12b,%r12b
> > +sal    %r31,%r31
> > +salb   %r31b,%r31b
> > +sal    $0x2,%r12,%r12
> > +salb   $0x2,%r12b,%r12b
> > +shl    %r31,%r31
> > +shlb   %r31b,%r31b
> > +shl    $0x2,%r12,%r12
> > +shlb   $0x2,%r12b,%r12b
> > +shr    %r31,%r31
> > +shrb   %r31b,%r31b
> > +shr    $0x2,%r12,%r12
> > +shrb   $0x2,%r12b,%r12b
> > +sar    %r31,%r31
> > +sarb   %r31b,%r31b
> > +sar    $0x2,%r12,%r12
> > +sarb   $0x2,%r12b,%r12b
> > +shld   $0x1,%r12,(%rax),%r12
> > +shld   $0x2,%r8,%r12,%r12
> > +shld   $0x2,%r8,%r12,%r8
> > +shld   %cl,%r9,(%rax),%r9
> > +shld   %cl,%r12,%r16,%r16
> > +shld   %cl,%r12,%r16,%r12
> > +shrd   $0x1,%r12,(%rax),%r12
> > +shrd   $0x1,%r13,%r12,%r12
> > +shrd   $0x1,%r13,%r12,%r13
> > +shrd   %cl,%r9,(%rax),%r9
> > +shrd   %cl,%r12,%r16,%r16
> > +shrd   %cl,%r12,%r16,%r12
> > +cmovo  0x90909090(%eax),%edx,%edx
> > +cmovno 0x90909090(%eax),%edx,%edx
> > +cmovb  0x90909090(%eax),%edx,%edx
> > +cmovae 0x90909090(%eax),%edx,%edx
> > +cmove  0x90909090(%eax),%edx,%edx
> > +cmovne 0x90909090(%eax),%edx,%edx
> > +cmovbe 0x90909090(%eax),%edx,%edx
> > +cmova  0x90909090(%eax),%edx,%edx
> > +cmovs  0x90909090(%eax),%edx,%edx
> > +cmovns 0x90909090(%eax),%edx,%edx
> > +cmovp  0x90909090(%eax),%edx,%edx
> > +cmovnp 0x90909090(%eax),%edx,%edx
> > +cmovl  0x90909090(%eax),%edx,%edx
> > +cmovge 0x90909090(%eax),%edx,%edx
> > +cmovle 0x90909090(%eax),%edx,%edx
> > +cmovg  0x90909090(%eax),%edx,%edx
> > +adcx   %ebx,%eax,%eax
> > +adcx   %eax,%ebx,%eax
> > +adcx   %rbx,%rax,%rax
> > +adcx   %r15,%r8,%r8
> 
> Might this better be
> 
> adcx   %r15d,%r8d,%r8d
> 
> to avoid having two exclusion criteria (REX register use and REX.W set)?
> Or maybe even split to further separate source and destination:
> 
> adcx   %eax,%r8d,%r8d
> adcx   %r15d,%eax,%eax
> 
> ?
>

OK, I have split adcx and adox.
 
BR,
Lin

Jan Beulich Dec. 12, 2023, 8:41 a.m. UTC | #3

On 12.12.2023 04:18, Hu, Lin1 wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Monday, December 11, 2023 8:28 PM
>>
>> On 24.11.2023 08:02, Cui, Lili wrote:
>>> @@ -7675,6 +7727,61 @@ match_template (char mnem_suffix)
>>>  	  i.memshift = memshift;
>>>  	}
>>>
>>> +      /* If we can optimize a NDD insn to legacy insn, like
>>> +	 add %r16, %r8, %r8 -> add %r16, %r8,
>>> +	 add  %r8, %r16, %r8 -> add %r16, %r8, then rematch template.
>>> +	 Note that the semantics have not been changed.  */
>>> +      if (optimize
>>> +	  && !i.no_optimize
>>> +	  && i.vec_encoding != vex_encoding_evex
>>> +	  && t + 1 < current_templates->end
>>> +	  && !t[1].opcode_modifier.evex
>>> +	  && t[1].opcode_space <= SPACE_0F38
>>> +	  && t->opcode_modifier.vexvvvv == VexVVVV_DST)
>>> +	{
>>> +	  unsigned int match_dest_op = can_convert_NDD_to_legacy (t);
>>> +	  size_match = true;
>>
>> This would perhaps better ...
>>
>>> +	  if (match_dest_op != (unsigned int) ~0)
>>> +	    {
>>
>> ... live here
>>
> 
> OK.
>  
>>
>>> +	      /* We ensure that the next template has the same input
>>> +		 operands as the original matching template by the first
>>> +		 opernd (ATT), thus avoiding the error caused by the wrong
>> order
>>> +		 of insns in i386.tbl.  */
>>
>> I'm sorry, but I (still) can't make sense of this last part of the comment, after the
>> comma.
>>
> 
> I mean if someone support new NDD insns and put it in the wrong position, so the part will try to avoid to optimize the insn.

If this is about hypothetical new templates, that would want saying so in
the comment. Thus clarifying that there's no functional effect right now.
I wonder what H.J.'s view on such effectively dead code is.

However, there's a bigger problem with this patch as I realized only a
few minutes ago when looking into Lili's reply on the NDD patch thread:
NDD insns are implicitly zero-upper. Hence converting NDD to legacy insns
needs to be limited to 32- and 64-bit operand size. For 8- and 16-bit
operand size the results would differ, which isn't acceptable under any
-O<n>. It may be okay to do such a conversion even for the smaller sizes,
but then under a separate option explicitly permitting such a functional
difference.

Jan

Jan Beulich Dec. 12, 2023, 8:45 a.m. UTC | #4

On 12.12.2023 04:18, Hu, Lin1 wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Monday, December 11, 2023 8:28 PM
>>
>> On 24.11.2023 08:02, Cui, Lili wrote:
>>> --- a/gas/config/tc-i386.c
>>> +++ b/gas/config/tc-i386.c
>>> @@ -7148,6 +7148,58 @@ check_APX_operands (const insn_template *t)
>>>    return 0;
>>>  }
>>>
>>> +/* Check if the instruction use the REX registers.  */ static bool
>>> +check_RexOperands () {
>>> +  for (unsigned int op = 0; op < i.operands; op++)
>>> +    {
>>> +      if (i.types[op].bitfield.class != Reg)
>>> +	continue;
>>> +
>>> +      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
>>> +	return true;
>>> +    }
>>> +
>>> +  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
>>> +      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
>>> +    return true;
>>> +
>>> +  /* Check pseudo prefix {rex} are valid.  */  return i.rex_encoding;
>>
>> Can this actually happen, when we're converting from EVEX to legacy?
>> (Initially I wanted to ask about "rex" and alike prefixes, i.e. the non- pseudo
>> ones.)
>>
> 
> This is to align with check_EgprOperands. I hope the function be more general. Not just for this optimization problem.

But then the comment shouldn't say "REX registers", and "Operands" in
its name isn't quite right either.

Also you want to make the function be a proper modern declaration, by
adding "void" between the parentheses.

Jan

Hu, Lin1 Dec. 13, 2023, 5:31 a.m. UTC | #5

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, December 12, 2023 4:42 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: Lu, Hongjiu <hongjiu.lu@intel.com>; binutils@sourceware.org; Cui, Lili
> <lili.cui@intel.com>
> Subject: Re: [PATCH v3 8/9] Support APX NDD optimized encoding.
> 
> On 12.12.2023 04:18, Hu, Lin1 wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: Monday, December 11, 2023 8:28 PM
> >>
> >> On 24.11.2023 08:02, Cui, Lili wrote:
> >>> @@ -7675,6 +7727,61 @@ match_template (char mnem_suffix)
> >>>  	  i.memshift = memshift;
> >>>  	}
> >>>
> >>> +      /* If we can optimize a NDD insn to legacy insn, like
> >>> +	 add %r16, %r8, %r8 -> add %r16, %r8,
> >>> +	 add  %r8, %r16, %r8 -> add %r16, %r8, then rematch template.
> >>> +	 Note that the semantics have not been changed.  */
> >>> +      if (optimize
> >>> +	  && !i.no_optimize
> >>> +	  && i.vec_encoding != vex_encoding_evex
> >>> +	  && t + 1 < current_templates->end
> >>> +	  && !t[1].opcode_modifier.evex
> >>> +	  && t[1].opcode_space <= SPACE_0F38
> >>> +	  && t->opcode_modifier.vexvvvv == VexVVVV_DST)
> >>> +	{
> >>> +	  unsigned int match_dest_op = can_convert_NDD_to_legacy (t);
> >>> +	  size_match = true;
> >>
> >> This would perhaps better ...
> >>
> >>> +	  if (match_dest_op != (unsigned int) ~0)
> >>> +	    {
> >>
> >> ... live here
> >>
> >
> > OK.
> >
> >>
> >>> +	      /* We ensure that the next template has the same input
> >>> +		 operands as the original matching template by the first
> >>> +		 opernd (ATT), thus avoiding the error caused by the wrong
> >> order
> >>> +		 of insns in i386.tbl.  */
> >>
> >> I'm sorry, but I (still) can't make sense of this last part of the
> >> comment, after the comma.
> >>
> >
> > I mean if someone support new NDD insns and put it in the wrong position, so
> the part will try to avoid to optimize the insn.
> 
> If this is about hypothetical new templates, that would want saying so in the
> comment. Thus clarifying that there's no functional effect right now.
> I wonder what H.J.'s view on such effectively dead code is.
> 
> However, there's a bigger problem with this patch as I realized only a few
> minutes ago when looking into Lili's reply on the NDD patch thread:
> NDD insns are implicitly zero-upper. Hence converting NDD to legacy insns needs
> to be limited to 32- and 64-bit operand size. For 8- and 16-bit operand size the
> results would differ, which isn't acceptable under any -O<n>. It may be okay to
> do such a conversion even for the smaller sizes, but then under a separate
> option explicitly permitting such a functional difference.
> 

We can constraint the optimization by add some conditions, like i.types[i.operands - 1].bitfield.dword || i.types[i.operands - 1].bitfield.qword.

BRs,
Lin

Hu, Lin1 Dec. 13, 2023, 6:06 a.m. UTC | #6

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, December 12, 2023 4:46 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: Lu, Hongjiu <hongjiu.lu@intel.com>; binutils@sourceware.org; Cui, Lili
> <lili.cui@intel.com>
> Subject: Re: [PATCH v3 8/9] Support APX NDD optimized encoding.
> 
> On 12.12.2023 04:18, Hu, Lin1 wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: Monday, December 11, 2023 8:28 PM
> >>
> >> On 24.11.2023 08:02, Cui, Lili wrote:
> >>> --- a/gas/config/tc-i386.c
> >>> +++ b/gas/config/tc-i386.c
> >>> @@ -7148,6 +7148,58 @@ check_APX_operands (const insn_template *t)
> >>>    return 0;
> >>>  }
> >>>
> >>> +/* Check if the instruction use the REX registers.  */ static bool
> >>> +check_RexOperands () {
> >>> +  for (unsigned int op = 0; op < i.operands; op++)
> >>> +    {
> >>> +      if (i.types[op].bitfield.class != Reg)
> >>> +	continue;
> >>> +
> >>> +      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
> >>> +	return true;
> >>> +    }
> >>> +
> >>> +  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
> >>> +      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
> >>> +    return true;
> >>> +
> >>> +  /* Check pseudo prefix {rex} are valid.  */  return
> >>> + i.rex_encoding;
> >>
> >> Can this actually happen, when we're converting from EVEX to legacy?
> >> (Initially I wanted to ask about "rex" and alike prefixes, i.e. the
> >> non- pseudo
> >> ones.)
> >>
> >
> > This is to align with check_EgprOperands. I hope the function be more general.
> Not just for this optimization problem.
> 
> But then the comment shouldn't say "REX registers", and "Operands" in its name
> isn't quite right either.
> 
> Also you want to make the function be a proper modern declaration, by adding
> "void" between the parentheses.
> 

I have modified the comment like " Check if the instruction use the REX registers or REX prefix." And function name is check_RexOperands_or_RexPrefix.

BRs,
Lin

Jan Beulich Dec. 13, 2023, 8:19 a.m. UTC | #7

On 13.12.2023 07:06, Hu, Lin1 wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Tuesday, December 12, 2023 4:46 PM
>> To: Hu, Lin1 <lin1.hu@intel.com>
>> Cc: Lu, Hongjiu <hongjiu.lu@intel.com>; binutils@sourceware.org; Cui, Lili
>> <lili.cui@intel.com>
>> Subject: Re: [PATCH v3 8/9] Support APX NDD optimized encoding.
>>
>> On 12.12.2023 04:18, Hu, Lin1 wrote:
>>>> -----Original Message-----
>>>> From: Jan Beulich <jbeulich@suse.com>
>>>> Sent: Monday, December 11, 2023 8:28 PM
>>>>
>>>> On 24.11.2023 08:02, Cui, Lili wrote:
>>>>> --- a/gas/config/tc-i386.c
>>>>> +++ b/gas/config/tc-i386.c
>>>>> @@ -7148,6 +7148,58 @@ check_APX_operands (const insn_template *t)
>>>>>    return 0;
>>>>>  }
>>>>>
>>>>> +/* Check if the instruction use the REX registers.  */ static bool
>>>>> +check_RexOperands () {
>>>>> +  for (unsigned int op = 0; op < i.operands; op++)
>>>>> +    {
>>>>> +      if (i.types[op].bitfield.class != Reg)
>>>>> +	continue;
>>>>> +
>>>>> +      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
>>>>> +	return true;
>>>>> +    }
>>>>> +
>>>>> +  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
>>>>> +      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
>>>>> +    return true;
>>>>> +
>>>>> +  /* Check pseudo prefix {rex} are valid.  */  return
>>>>> + i.rex_encoding;
>>>>
>>>> Can this actually happen, when we're converting from EVEX to legacy?
>>>> (Initially I wanted to ask about "rex" and alike prefixes, i.e. the
>>>> non- pseudo
>>>> ones.)
>>>>
>>>
>>> This is to align with check_EgprOperands. I hope the function be more general.
>> Not just for this optimization problem.
>>
>> But then the comment shouldn't say "REX registers", and "Operands" in its name
>> isn't quite right either.
>>
>> Also you want to make the function be a proper modern declaration, by adding
>> "void" between the parentheses.
>>
> 
> I have modified the comment like " Check if the instruction use the REX registers or REX prefix." And function name is check_RexOperands_or_RexPrefix.

How about check_Rex_required() or is_rex_required() or some such? No need
to have two "Rex" in a single name.

Jan

Hu, Lin1 Dec. 13, 2023, 8:34 a.m. UTC | #8

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Wednesday, December 13, 2023 4:19 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: Lu, Hongjiu <hongjiu.lu@intel.com>; binutils@sourceware.org; Cui, Lili
> <lili.cui@intel.com>
> Subject: Re: [PATCH v3 8/9] Support APX NDD optimized encoding.
> 
> On 13.12.2023 07:06, Hu, Lin1 wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: Tuesday, December 12, 2023 4:46 PM
> >> To: Hu, Lin1 <lin1.hu@intel.com>
> >> Cc: Lu, Hongjiu <hongjiu.lu@intel.com>; binutils@sourceware.org; Cui,
> >> Lili <lili.cui@intel.com>
> >> Subject: Re: [PATCH v3 8/9] Support APX NDD optimized encoding.
> >>
> >> On 12.12.2023 04:18, Hu, Lin1 wrote:
> >>>> -----Original Message-----
> >>>> From: Jan Beulich <jbeulich@suse.com>
> >>>> Sent: Monday, December 11, 2023 8:28 PM
> >>>>
> >>>> On 24.11.2023 08:02, Cui, Lili wrote:
> >>>>> --- a/gas/config/tc-i386.c
> >>>>> +++ b/gas/config/tc-i386.c
> >>>>> @@ -7148,6 +7148,58 @@ check_APX_operands (const insn_template *t)
> >>>>>    return 0;
> >>>>>  }
> >>>>>
> >>>>> +/* Check if the instruction use the REX registers.  */ static
> >>>>> +bool check_RexOperands () {
> >>>>> +  for (unsigned int op = 0; op < i.operands; op++)
> >>>>> +    {
> >>>>> +      if (i.types[op].bitfield.class != Reg)
> >>>>> +	continue;
> >>>>> +
> >>>>> +      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
> >>>>> +	return true;
> >>>>> +    }
> >>>>> +
> >>>>> +  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
> >>>>> +      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
> >>>>> +    return true;
> >>>>> +
> >>>>> +  /* Check pseudo prefix {rex} are valid.  */  return
> >>>>> + i.rex_encoding;
> >>>>
> >>>> Can this actually happen, when we're converting from EVEX to legacy?
> >>>> (Initially I wanted to ask about "rex" and alike prefixes, i.e. the
> >>>> non- pseudo
> >>>> ones.)
> >>>>
> >>>
> >>> This is to align with check_EgprOperands. I hope the function be more
> general.
> >> Not just for this optimization problem.
> >>
> >> But then the comment shouldn't say "REX registers", and "Operands" in
> >> its name isn't quite right either.
> >>
> >> Also you want to make the function be a proper modern declaration, by
> >> adding "void" between the parentheses.
> >>
> >
> > I have modified the comment like " Check if the instruction use the REX
> registers or REX prefix." And function name is check_RexOperands_or_RexPrefix.
> 
> How about check_Rex_required() or is_rex_required() or some such? No need to
> have two "Rex" in a single name.
> 

OK, thanks for your advice, Have modified to check_Rex_required().

BRs,
Lin

diff mbox series

Patch

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index e7e104dba07..aa66f704c48 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -7148,6 +7148,58 @@  check_APX_operands (const insn_template *t)
   return 0;
 }
 
+/* Check if the instruction use the REX registers.  */
+static bool
+check_RexOperands ()
+{
+  for (unsigned int op = 0; op < i.operands; op++)
+    {
+      if (i.types[op].bitfield.class != Reg)
+	continue;
+
+      if (i.op[op].regs->reg_flags & (RegRex | RegRex64))
+	return true;
+    }
+
+  if ((i.index_reg && (i.index_reg->reg_flags & (RegRex | RegRex64)))
+      || (i.base_reg && (i.base_reg->reg_flags & (RegRex | RegRex64))))
+    return true;
+
+  /* Check pseudo prefix {rex} are valid.  */
+  return i.rex_encoding;
+}
+
+/* Optimize APX NDD insns to legacy insns.  */
+static unsigned int
+can_convert_NDD_to_legacy (const insn_template *t)
+{
+  unsigned int match_dest_op = ~0;
+
+  if (t->opcode_modifier.vexvvvv == VexVVVV_DST
+      && t->opcode_space == SPACE_EVEXMAP4
+      && !i.has_nf
+      && i.reg_operands >= 2)
+    {
+      unsigned int dest = i.operands - 1;
+      unsigned int src1 = i.operands - 2;
+      unsigned int src2 = (i.operands > 3) ? i.operands - 3 : 0;
+
+      if (i.types[src1].bitfield.class == Reg
+	  && i.op[src1].regs == i.op[dest].regs)
+	match_dest_op = src1;
+      /* If the first operand is the same as the third operand,
+	 these instructions need to support the ability to commutative
+	 the first two operands and still not change the semantics in order
+	 to be optimized.  */
+      else if (i.types[src2].bitfield.class == Reg
+	       && i.op[src2].regs == i.op[dest].regs
+	       && optimize > 1
+	       && t->opcode_modifier.commutative)
+	match_dest_op = src2;
+    }
+  return match_dest_op;
+}
+
 /* Helper function for the progress() macro in match_template().  */
 static INLINE enum i386_error progress (enum i386_error new,
 					enum i386_error last,
@@ -7675,6 +7727,61 @@  match_template (char mnem_suffix)
 	  i.memshift = memshift;
 	}
 
+      /* If we can optimize a NDD insn to legacy insn, like
+	 add %r16, %r8, %r8 -> add %r16, %r8,
+	 add  %r8, %r16, %r8 -> add %r16, %r8, then rematch template.
+	 Note that the semantics have not been changed.  */
+      if (optimize
+	  && !i.no_optimize
+	  && i.vec_encoding != vex_encoding_evex
+	  && t + 1 < current_templates->end
+	  && !t[1].opcode_modifier.evex
+	  && t[1].opcode_space <= SPACE_0F38
+	  && t->opcode_modifier.vexvvvv == VexVVVV_DST)
+	{
+	  unsigned int match_dest_op = can_convert_NDD_to_legacy (t);
+	  size_match = true;
+
+	  if (match_dest_op != (unsigned int) ~0)
+	    {
+	      /* We ensure that the next template has the same input
+		 operands as the original matching template by the first
+		 opernd (ATT), thus avoiding the error caused by the wrong order
+		 of insns in i386.tbl.  */
+	      overlap0 = operand_type_and (i.types[0],
+					   t[1].operand_types[0]);
+	      if (t->opcode_modifier.d)
+		overlap1 = operand_type_and (i.types[0],
+					     t[1].operand_types[1]);
+	      if (!operand_type_match (overlap0, i.types[0])
+		  && (!t->opcode_modifier.d
+		      || (t->opcode_modifier.d
+			  && !operand_type_match (overlap1, i.types[0]))))
+		size_match = false;
+
+	      if (size_match
+		  /* Optimizing some non-legacy-map0/1 without REX/REX2 prefix will be valuable.  */
+		  && (t[1].opcode_space <= SPACE_0F
+		      || (!check_EgprOperands (t + 1)
+			  && !check_RexOperands ()
+			  && !i.op[i.operands - 1].regs->reg_type.bitfield.qword)))
+		{
+		  unsigned int src1 = i.operands - 2;
+		  unsigned int src2 = (i.operands > 3) ? i.operands - 3 : 0;
+
+		  if (match_dest_op == src2)
+		    swap_2_operands (match_dest_op, src1);
+
+		  --i.operands;
+		  --i.reg_operands;
+
+		  specific_error = progress (internal_error);
+		  continue;
+		}
+
+	    }
+	}
+
       /* We've found a match; break out of loop.  */
       break;
     }
diff --git a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
new file mode 100644
index 00000000000..6f841a807a9
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
@@ -0,0 +1,130 @@ 
+#as: -Os
+#objdump: -drw
+#name: x86-64 APX NDD optimized encoding
+#source: x86-64-apx-ndd-optimize.s
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+\s*[a-f0-9]+:\s*d5 4d 01 f8          	add    %r31,%r8
+\s*[a-f0-9]+:\s*d5 45 00 f8          	add    %r31b,%r8b
+\s*[a-f0-9]+:\s*d5 4d 01 f8          	add    %r31,%r8
+\s*[a-f0-9]+:\s*d5 1d 03 c7          	add    %r31,%r8
+\s*[a-f0-9]+:\s*d5 4d 03 38          	add    \(%r8\),%r31
+\s*[a-f0-9]+:\s*d5 1d 03 07          	add    \(%r31\),%r8
+\s*[a-f0-9]+:\s*49 81 c7 33 44 34 12 	add    \$0x12344433,%r15
+\s*[a-f0-9]+:\s*49 81 c0 11 22 33 f4 	add    \$0xfffffffff4332211,%r8
+\s*[a-f0-9]+:\s*d5 19 ff c7          	inc    %r31
+\s*[a-f0-9]+:\s*d5 11 fe c7          	inc    %r31b
+\s*[a-f0-9]+:\s*d5 1c 29 f9          	sub    %r15,%r17
+\s*[a-f0-9]+:\s*d5 14 28 f9          	sub    %r15b,%r17b
+\s*[a-f0-9]+:\s*62 54 84 18 29 38    	sub    %r15,\(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 2b 04 07       	sub    \(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 ee 34 12 00 00 	sub    \$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 18 ff c9          	dec    %r17
+\s*[a-f0-9]+:\s*d5 10 fe c9          	dec    %r17b
+\s*[a-f0-9]+:\s*d5 1c 19 f9          	sbb    %r15,%r17
+\s*[a-f0-9]+:\s*d5 14 18 f9          	sbb    %r15b,%r17b
+\s*[a-f0-9]+:\s*62 54 84 18 19 38    	sbb    %r15,\(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 1b 04 07       	sbb    \(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 de 34 12 00 00 	sbb    \$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 21 f9          	and    %r15,%r17
+\s*[a-f0-9]+:\s*d5 14 20 f9          	and    %r15b,%r17b
+\s*[a-f0-9]+:\s*4d 23 38             	and    \(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 23 04 07       	and    \(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 11 81 e6 34 12 00 00 	and    \$0x1234,%r30d
+\s*[a-f0-9]+:\s*d5 1c 09 f9          	or     %r15,%r17
+\s*[a-f0-9]+:\s*d5 14 08 f9          	or     %r15b,%r17b
+\s*[a-f0-9]+:\s*4d 0b 38             	or     \(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 0b 04 07       	or     \(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 ce 34 12 00 00 	or     \$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 31 f9          	xor    %r15,%r17
+\s*[a-f0-9]+:\s*d5 14 30 f9          	xor    %r15b,%r17b
+\s*[a-f0-9]+:\s*4d 33 38             	xor    \(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 33 04 07       	xor    \(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 f6 34 12 00 00 	xor    \$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 11 f9          	adc    %r15,%r17
+\s*[a-f0-9]+:\s*d5 14 10 f9          	adc    %r15b,%r17b
+\s*[a-f0-9]+:\s*4d 13 38             	adc    \(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 13 04 07       	adc    \(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 d6 34 12 00 00 	adc    \$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 18 f7 d9          	neg    %r17
+\s*[a-f0-9]+:\s*d5 10 f6 d9          	neg    %r17b
+\s*[a-f0-9]+:\s*d5 18 f7 d1          	not    %r17
+\s*[a-f0-9]+:\s*d5 10 f6 d1          	not    %r17b
+\s*[a-f0-9]+:\s*67 0f af 90 09 09 09 00 	imul   0x90909\(%eax\),%edx
+\s*[a-f0-9]+:\s*d5 aa af 94 f8 09 09 00 00 	imul   0x909\(%rax,%r31,8\),%rdx
+\s*[a-f0-9]+:\s*48 0f af d0          	imul   %rax,%rdx
+\s*[a-f0-9]+:\s*d5 19 d1 c7          	rol    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 c7          	rol    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 c4 02          	rol    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 c4 02          	rol    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 cf          	ror    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 cf          	ror    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 cc 02          	ror    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 cc 02          	ror    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 d7          	rcl    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 d7          	rcl    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 d4 02          	rcl    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 d4 02          	rcl    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 df          	rcr    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 df          	rcr    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 dc 02          	rcr    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 dc 02          	rcr    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 e7          	shl    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 e7          	shl    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 e4 02          	shl    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 e4 02          	shl    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 e7          	shl    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 e7          	shl    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 e4 02          	shl    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 e4 02          	shl    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 ef          	shr    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 ef          	shr    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 ec 02          	shr    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 ec 02          	shr    \$0x2,%r12b
+\s*[a-f0-9]+:\s*d5 19 d1 ff          	sar    \$1,%r31
+\s*[a-f0-9]+:\s*d5 11 d0 ff          	sar    \$1,%r31b
+\s*[a-f0-9]+:\s*49 c1 fc 02          	sar    \$0x2,%r12
+\s*[a-f0-9]+:\s*41 c0 fc 02          	sar    \$0x2,%r12b
+\s*[a-f0-9]+:\s*62 74 9c 18 24 20 01 	shld   \$0x1,%r12,\(%rax\),%r12
+\s*[a-f0-9]+:\s*4d 0f a4 c4 02       	shld   \$0x2,%r8,%r12
+\s*[a-f0-9]+:\s*62 54 bc 18 24 c4 02 	shld   \$0x2,%r8,%r12,%r8
+\s*[a-f0-9]+:\s*62 74 b4 18 a5 08    	shld   %cl,%r9,\(%rax\),%r9
+\s*[a-f0-9]+:\s*d5 9c a5 e0          	shld   %cl,%r12,%r16
+\s*[a-f0-9]+:\s*62 7c 9c 18 a5 e0    	shld   %cl,%r12,%r16,%r12
+\s*[a-f0-9]+:\s*62 74 9c 18 2c 20 01 	shrd   \$0x1,%r12,\(%rax\),%r12
+\s*[a-f0-9]+:\s*4d 0f ac ec 01       	shrd   \$0x1,%r13,%r12
+\s*[a-f0-9]+:\s*62 54 94 18 2c ec 01 	shrd   \$0x1,%r13,%r12,%r13
+\s*[a-f0-9]+:\s*62 74 b4 18 ad 08    	shrd   %cl,%r9,\(%rax\),%r9
+\s*[a-f0-9]+:\s*d5 9c ad e0          	shrd   %cl,%r12,%r16
+\s*[a-f0-9]+:\s*62 7c 9c 18 ad e0    	shrd   %cl,%r12,%r16,%r12
+\s*[a-f0-9]+:\s*67 0f 40 90 90 90 90 90 	cmovo  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 41 90 90 90 90 90 	cmovno -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 42 90 90 90 90 90 	cmovb  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 43 90 90 90 90 90 	cmovae -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 44 90 90 90 90 90 	cmove  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 45 90 90 90 90 90 	cmovne -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 46 90 90 90 90 90 	cmovbe -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 47 90 90 90 90 90 	cmova  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 48 90 90 90 90 90 	cmovs  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 49 90 90 90 90 90 	cmovns -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4a 90 90 90 90 90 	cmovp  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4b 90 90 90 90 90 	cmovnp -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4c 90 90 90 90 90 	cmovl  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4d 90 90 90 90 90 	cmovge -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4e 90 90 90 90 90 	cmovle -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4f 90 90 90 90 90 	cmovg  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*66 0f 38 f6 c3       	adcx   %ebx,%eax
+\s*[a-f0-9]+:\s*66 0f 38 f6 c3       	adcx   %ebx,%eax
+\s*[a-f0-9]+:\s*62 f4 fd 18 66 c3    	adcx   %rbx,%rax,%rax
+\s*[a-f0-9]+:\s*62 54 bd 18 66 c7    	adcx   %r15,%r8,%r8
+\s*[a-f0-9]+:\s*67 66 0f 38 f6 04 0a 	adcx   \(%edx,%ecx,1\),%eax
+\s*[a-f0-9]+:\s*f3 0f 38 f6 c3       	adox   %ebx,%eax
+\s*[a-f0-9]+:\s*f3 0f 38 f6 c3       	adox   %ebx,%eax
+\s*[a-f0-9]+:\s*62 f4 fe 18 66 c3    	adox   %rbx,%rax,%rax
+\s*[a-f0-9]+:\s*62 54 be 18 66 c7    	adox   %r15,%r8,%r8
+\s*[a-f0-9]+:\s*67 f3 0f 38 f6 04 0a 	adox   \(%edx,%ecx,1\),%eax
diff --git a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
new file mode 100644
index 00000000000..4335ee6d7ae
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
@@ -0,0 +1,123 @@ 
+# Check 64bit APX NDD instructions with optimized encoding
+
+	.text
+_start:
+add    %r31,%r8,%r8
+addb   %r31b,%r8b,%r8b
+{store} add    %r31,%r8,%r8
+{load}  add    %r31,%r8,%r8
+add    %r31,(%r8),%r31
+add    (%r31),%r8,%r8
+add    $0x12344433,%r15,%r15
+add    $0xfffffffff4332211,%r8,%r8
+inc    %r31,%r31
+incb   %r31b,%r31b
+sub    %r15,%r17,%r17
+subb   %r15b,%r17b,%r17b
+sub    %r15,(%r8),%r15
+sub    (%r15,%rax,1),%r16,%r16
+sub    $0x1234,%r30,%r30
+dec    %r17,%r17
+decb   %r17b,%r17b
+sbb    %r15,%r17,%r17
+sbbb   %r15b,%r17b,%r17b
+sbb    %r15,(%r8),%r15
+sbb    (%r15,%rax,1),%r16,%r16
+sbb    $0x1234,%r30,%r30
+and    %r15,%r17,%r17
+andb   %r15b,%r17b,%r17b
+and    %r15,(%r8),%r15
+and    (%r15,%rax,1),%r16,%r16
+and    $0x1234,%r30,%r30
+or     %r15,%r17,%r17
+orb    %r15b,%r17b,%r17b
+or     %r15,(%r8),%r15
+or     (%r15,%rax,1),%r16,%r16
+or     $0x1234,%r30,%r30
+xor    %r15,%r17,%r17
+xorb   %r15b,%r17b,%r17b
+xor    %r15,(%r8),%r15
+xor    (%r15,%rax,1),%r16,%r16
+xor    $0x1234,%r30,%r30
+adc    %r15,%r17,%r17
+adcb   %r15b,%r17b,%r17b
+adc    %r15,(%r8),%r15
+adc    (%r15,%rax,1),%r16,%r16
+adc    $0x1234,%r30,%r30
+neg    %r17,%r17
+negb   %r17b,%r17b
+not    %r17,%r17
+notb   %r17b,%r17b
+imul   0x90909(%eax),%edx,%edx
+imul   0x909(%rax,%r31,8),%rdx,%rdx
+imul   %rdx,%rax,%rdx
+rol    %r31,%r31
+rolb   %r31b,%r31b
+rol    $0x2,%r12,%r12
+rolb   $0x2,%r12b,%r12b
+ror    %r31,%r31
+rorb   %r31b,%r31b
+ror    $0x2,%r12,%r12
+rorb   $0x2,%r12b,%r12b
+rcl    %r31,%r31
+rclb   %r31b,%r31b
+rcl    $0x2,%r12,%r12
+rclb   $0x2,%r12b,%r12b
+rcr    %r31,%r31
+rcrb   %r31b,%r31b
+rcr    $0x2,%r12,%r12
+rcrb   $0x2,%r12b,%r12b
+sal    %r31,%r31
+salb   %r31b,%r31b
+sal    $0x2,%r12,%r12
+salb   $0x2,%r12b,%r12b
+shl    %r31,%r31
+shlb   %r31b,%r31b
+shl    $0x2,%r12,%r12
+shlb   $0x2,%r12b,%r12b
+shr    %r31,%r31
+shrb   %r31b,%r31b
+shr    $0x2,%r12,%r12
+shrb   $0x2,%r12b,%r12b
+sar    %r31,%r31
+sarb   %r31b,%r31b
+sar    $0x2,%r12,%r12
+sarb   $0x2,%r12b,%r12b
+shld   $0x1,%r12,(%rax),%r12
+shld   $0x2,%r8,%r12,%r12
+shld   $0x2,%r8,%r12,%r8
+shld   %cl,%r9,(%rax),%r9
+shld   %cl,%r12,%r16,%r16
+shld   %cl,%r12,%r16,%r12
+shrd   $0x1,%r12,(%rax),%r12
+shrd   $0x1,%r13,%r12,%r12
+shrd   $0x1,%r13,%r12,%r13
+shrd   %cl,%r9,(%rax),%r9
+shrd   %cl,%r12,%r16,%r16
+shrd   %cl,%r12,%r16,%r12
+cmovo  0x90909090(%eax),%edx,%edx
+cmovno 0x90909090(%eax),%edx,%edx
+cmovb  0x90909090(%eax),%edx,%edx
+cmovae 0x90909090(%eax),%edx,%edx
+cmove  0x90909090(%eax),%edx,%edx
+cmovne 0x90909090(%eax),%edx,%edx
+cmovbe 0x90909090(%eax),%edx,%edx
+cmova  0x90909090(%eax),%edx,%edx
+cmovs  0x90909090(%eax),%edx,%edx
+cmovns 0x90909090(%eax),%edx,%edx
+cmovp  0x90909090(%eax),%edx,%edx
+cmovnp 0x90909090(%eax),%edx,%edx
+cmovl  0x90909090(%eax),%edx,%edx
+cmovge 0x90909090(%eax),%edx,%edx
+cmovle 0x90909090(%eax),%edx,%edx
+cmovg  0x90909090(%eax),%edx,%edx
+adcx   %ebx,%eax,%eax
+adcx   %eax,%ebx,%eax
+adcx   %rbx,%rax,%rax
+adcx   %r15,%r8,%r8
+adcx   (%edx,%ecx,1),%eax,%eax
+adox   %ebx,%eax,%eax
+adox   %eax,%ebx,%eax
+adox   %rbx,%rax,%rax
+adox   %r15,%r8,%r8
+adox   (%edx,%ecx,1),%eax,%eax
diff --git a/gas/testsuite/gas/i386/x86-64.exp b/gas/testsuite/gas/i386/x86-64.exp
index b834379a491..034fc49b180 100644
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -558,6 +558,7 @@  run_dump_test "x86-64-optimize-6"
 run_list_test "x86-64-optimize-7a" "-I${srcdir}/$subdir -march=+noavx -al"
 run_dump_test "x86-64-optimize-7b"
 run_list_test "x86-64-optimize-8" "-I${srcdir}/$subdir -march=+noavx2 -al"
+run_dump_test "x86-64-apx-ndd-optimize"
 run_dump_test "x86-64-align-branch-1a"
 run_dump_test "x86-64-align-branch-1b"
 run_dump_test "x86-64-align-branch-1c"