[v3,6/9] Support APX NDD

Message ID 20231124070213.3886483-6-lili.cui@intel.com
State Unresolved
Headers
Series [1/9] Make const_1_mode print $1 in AT&T syntax |

Checks

Context Check Description
snail/binutils-gdb-check warning Git am fail log

Commit Message

Cui, Lili Nov. 24, 2023, 7:02 a.m. UTC
  From: konglin1 <lingling.kong@intel.com>

opcodes/ChangeLog:

	* opcodes/i386-dis-evex-prefix.h: Add NDD decode for adox/adcx.
	* opcodes/i386-dis-evex-reg.h: Handle for REG_EVEX_MAP4_80,
	REG_EVEX_MAP4_81, REG_EVEX_MAP4_83,  REG_EVEX_MAP4_F6,
	REG_EVEX_MAP4_F7, REG_EVEX_MAP4_FE, REG_EVEX_MAP4_FF.
	* opcodes/i386-dis-evex.h: Add NDD insn.
	* opcodes/i386-dis.c (VexGb): Add new define.
	(VexGv): Ditto.
	(get_valid_dis386): Change for NDD decode.
	(print_insn): Ditto.
	(print_register): Ditto.
	(intel_operand_size): Ditto.
	(OP_E_memory): Ditto.
	(OP_VEX): Ditto.
	* opcodes/i386-opc.h (VexVVVV_SRC): New.
	(VexVVVV_DST): Ditto.
	* opcodes/i386-opc.tbl: Add APX NDD instructions and adjust VexVVVV.
	* opcodes/i386-tbl.h: Regenerated.

gas/ChangeLog:

	* gas/config/tc-i386.c (is_any_apx_evex_encoding): Add legacy insn
	promote to SPACE_EVEXMAP4.
	(md_assemble): Change for ndd encode.
	(process_operands): Ditto.
	(build_modrm_byte): Ditto.
	(operand_size_match):
	Support APX NDD that the number of operands is 3.
	(match_template): Support swap the first two operands for
	APX NDD.
	reg_table
	* testsuite/gas/i386/x86-64.exp: Add x86-64-apx-ndd.
	* testsuite/gas/i386/x86-64-apx-ndd.d: New test.
	* testsuite/gas/i386/x86-64-apx-ndd.s: Ditto.
	* testsuite/gas/i386/x86-64-pseudos.d: Add test.
	* testsuite/gas/i386/x86-64-pseudos.s: Ditto.
	* testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d : Ditto.
	* testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s : Ditto.
---
 gas/config/tc-i386.c                          |  82 ++++++---
 .../gas/i386/x86-64-apx-evex-promoted-bad.d   |   2 +
 .../gas/i386/x86-64-apx-evex-promoted-bad.s   |   2 +
 gas/testsuite/gas/i386/x86-64-apx-ndd.d       | 160 +++++++++++++++++
 gas/testsuite/gas/i386/x86-64-apx-ndd.s       | 155 ++++++++++++++++
 gas/testsuite/gas/i386/x86-64-pseudos.d       |  42 +++++
 gas/testsuite/gas/i386/x86-64-pseudos.s       |  43 +++++
 gas/testsuite/gas/i386/x86-64.exp             |   1 +
 opcodes/i386-dis-evex-reg.h                   |  55 ++++++
 opcodes/i386-dis-evex.h                       | 124 ++++++-------
 opcodes/i386-dis.c                            | 169 +++++++++++-------
 opcodes/i386-opc.h                            |   6 +-
 opcodes/i386-opc.tbl                          |  89 +++++++++
 13 files changed, 775 insertions(+), 155 deletions(-)
 create mode 100644 gas/testsuite/gas/i386/x86-64-apx-ndd.d
 create mode 100644 gas/testsuite/gas/i386/x86-64-apx-ndd.s
  

Comments

Jan Beulich Dec. 8, 2023, 2:12 p.m. UTC | #1
On 24.11.2023 08:02, Cui, Lili wrote:
> @@ -8870,25 +8890,33 @@ build_modrm_byte (void)
>  				     || i.vec_encoding == vex_encoding_evex));
>      }
>  
> -  for (v = source + 1; v < dest; ++v)
> -    if (v != reg_slot)
> -      break;
> -  if (v >= dest)
> -    v = ~0;
> -  if (i.tm.extension_opcode != None)
> +  if (i.tm.opcode_modifier.vexvvvv == VexVVVV_DST)
>      {
> -      if (dest != source)
> -	v = dest;
> -      dest = ~0;
> +      v = dest;
> +      dest-- ;

Nit: Stray blank.

>      }
> -  gas_assert (source < dest);

Starting from this line, do you really need to move that into the "else"
branch? It looks to me as if it could stay here. (Maybe I'm wrong with
the assertion itself, but ...

> -  if (i.tm.opcode_modifier.operandconstraint == SWAP_SOURCES
> -      && source != op)

... this entire if() pretty surely can stay as is, as there are no
templates with both DstVVVV and SwapSources afaict. (Thing is - as
before - that it isn't easy to see that what is happening here is
really just re-indentation. Iirc in an earlier version there actually
were hidden changes.) If you want this moved as an optimization,
please do so in a separate patch.

> --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
> +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
> @@ -27,4 +27,6 @@ Disassembly of section .text:
>  [ 	]*[a-f0-9]+:[ 	]+c8 ff ff ff[ 	]+enter  \$0xffff,\$0xff
>  [ 	]*[a-f0-9]+:[ 	]+67 62 f2 7c 18 f5[ 	]+addr32 \(bad\)
>  [ 	]*[a-f0-9]+:[ 	]+0b ff[ 	]+or     %edi,%edi
> +[ 	]*[a-f0-9]+:[ 	]+62 f4 fc 08 ff[ 	]+\(bad\)
> +[ 	]*[a-f0-9]+:[ 	]+d8[ 	]+.byte 0xd8
>  #pass
> --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
> +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
> @@ -26,3 +26,5 @@ _start:
>  	#EVEX from VEX bzhi %ebx,%eax,%ecx EVEX.P[20](EVEX.b) == 1 (illegal value).
>  	.insn EVEX.L0.NP.0f38.W0 0xf5, %eax ,(%ebx){1to8}, %ecx
>  	.byte 0xff
> +	#{evex} inc %rax %rbx EVEX.vvvv' != 1111 && EVEX.ND = 0.
> +	.insn EVEX.L0.NP.M4.W1 0xff, %rax, %rbx

I don't think this does what you want. In the .d file the 4 bits are
all set. I think you mean something like

	.insn EVEX.L0.NP.M4.W1 0xff/0, %rcx, %rbx

(i.e. ModR/M.reg specified as opcode extension _and_ the first operand
not the accumulator). The reason disassembly fails for what you've used
looks to be ModR/M.reg == 0b011 (resulting from the use of %rbx).

(Also, nit: What's EVEX.vvvv' ? I.e. what's the ' there about?)

> --- /dev/null
> +++ b/gas/testsuite/gas/i386/x86-64-apx-ndd.s
> @@ -0,0 +1,155 @@
> +# Check 64bit APX NDD instructions with evex prefix encoding
> +
> +	.allow_index_reg
> +	.text
> +_start:
> +	adc    $0x1234,%ax,%r30w
> +	adc    %r15b,%r17b,%r18b
> +	adc    %r15d,(%r8),%r18d
> +	adc    (%r15,%rax,1),%r16b,%r8b
> +	adc    (%r15,%rax,1),%r16w,%r8w
> +	adcl   $0x11,(%r19,%rax,4),%r20d
> +	adcx   %r15d,%r8d,%r18d
> +	adcx   (%r15,%r31,1),%r8
> +	adcx   (%r15,%r31,1),%r8d,%r18d
> +	add    $0x1234,%ax,%r30w
> +	add    $0x12344433,%r15,%r16
> +	add    $0x34,%r13b,%r17b
> +	add    $0xfffffffff4332211,%rax,%r8
> +	add    %r31,%r8,%r16
> +	add    %r31,(%r8),%r16
> +	add    %r31,(%r8,%r16,8),%r16
> +	add    %r31b,%r8b,%r16b
> +	add    %r31d,%r8d,%r16d
> +	add    %r31w,%r8w,%r16w
> +	add    (%r31),%r8,%r16
> +	add    0x9090(%r31,%r16,1),%r8,%r16
> +	addb    %r31b,%r8b,%r16b
> +	addl    %r31d,%r8d,%r16d
> +	addl   $0x11,(%r19,%rax,4),%r20d
> +	addq    %r31,%r8,%r16
> +	addq   $0x12344433,(%r15,%rcx,4),%r16
> +	addw    %r31w,%r8w,%r16w
> +	adox   %r15d,%r8d,%r18d

Nit: Inconsistent blank padding.

> +	{load}  add    %r31,%r8,%r16
> +	{store} add    %r31,%r8,%r16
> +	adox   (%r15,%r31,1),%r8
> +	adox   (%r15,%r31,1),%r8d,%r18d
> +	and    $0x1234,%ax,%r30w
> +	and    %r15b,%r17b,%r18b
> +	and    %r15d,(%r8),%r18d
> +	and    (%r15,%rax,1),%r16b,%r8b
> +	and    (%r15,%rax,1),%r16w,%r8w
> +	andl   $0x11,(%r19,%rax,4),%r20d
> +	cmova  0x90909090(%eax),%edx,%r8d
> +	cmovae 0x90909090(%eax),%edx,%r8d
> +	cmovb  0x90909090(%eax),%edx,%r8d
> +	cmovbe 0x90909090(%eax),%edx,%r8d
> +	cmove  0x90909090(%eax),%edx,%r8d
> +	cmovg  0x90909090(%eax),%edx,%r8d
> +	cmovge 0x90909090(%eax),%edx,%r8d
> +	cmovl  0x90909090(%eax),%edx,%r8d
> +	cmovle 0x90909090(%eax),%edx,%r8d
> +	cmovne 0x90909090(%eax),%edx,%r8d
> +	cmovno 0x90909090(%eax),%edx,%r8d
> +	cmovnp 0x90909090(%eax),%edx,%r8d
> +	cmovns 0x90909090(%eax),%edx,%r8d
> +	cmovo  0x90909090(%eax),%edx,%r8d
> +	cmovp  0x90909090(%eax),%edx,%r8d
> +	cmovs  0x90909090(%eax),%edx,%r8d
> +	dec    %rax,%r17
> +	decb   (%r31,%r12,1),%r8b
> +	imul   0x909(%rax,%r31,8),%rdx,%r25
> +	imul   0x90909(%eax),%edx,%r8d
> +	inc    %r31,%r16
> +	inc    %r31,%r8
> +	inc    %rax,%rbx
> +	neg    %rax,%r17
> +	negb   (%r31,%r12,1),%r8b
> +	not    %rax,%r17
> +	notb   (%r31,%r12,1),%r8b
> +	or     $0x1234,%ax,%r30w
> +	or     %r15b,%r17b,%r18b
> +	or     %r15d,(%r8),%r18d
> +	or     (%r15,%rax,1),%r16b,%r8b
> +	or     (%r15,%rax,1),%r16w,%r8w
> +	orl    $0x11,(%r19,%rax,4),%r20d
> +	rcl    $0x2,%r12b,%r31b
> +	rcl    %cl,%r16b,%r8b
> +	rclb   $0x1, (%rax),%r31b
> +	rcll   $0x2,(%rax),%r31d
> +	rclw   $0x1, (%rax),%r31w

Nit: Would be nice if there consistently were or were not blanks after
the commas.

> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -139,9 +139,13 @@
>  #define Vsz256 Vsz=VSZ256
>  #define Vsz512 Vsz=VSZ512
>  
> +#define DstVVVV VexVVVV=VexVVVV_DST
> +
>  // The EVEX purpose of StaticRounding appears only together with SAE. Re-use
>  // the bit to mark commutative VEX encodings where swapping the source
>  // operands may allow to switch from 3-byte to 2-byte VEX encoding.
> +// And re-use the bit to mark some NDD insns that swapping the source operands
> +// may allow to switch from EVEX encoding to REX2 encoding.
>  #define C StaticRounding
>  
>  #define FP 387|287|8087
> @@ -288,26 +292,40 @@ std, 0xfd, 0, NoSuf, {}
>  sti, 0xfb, 0, NoSuf, {}
>  
>  // Arithmetic.
> +add, 0x0, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }

There is _still_ Byte|Word|Dword|Qword in here (and below), when I think I
pointed out more than once before that in new templates such redundancy
wants omitting.

Since this isn't the first instance of earlier review comments not taken
care of, may I please ask that you make reasonably sure that new versions
aren't sent out like this?

>  add, 0x0, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +add, 0x83/0, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  add, 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  add, 0x4, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +add, 0x80/0, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
>  add, 0x80/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
>  inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
> +inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, {Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
>  inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
> +sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Optimize|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }

Here and elsewhere, what's Optimize for? It not being there on other templates,
it can't be for the EVEX->REX2 optimization? If there are further optimization
plans, that's (again) something to mention in the description. Yet better would
be if such attributes were added only when respective optimizations are actually
introduced. Unlike e.g. NF, which would mean another bulk update if not added
right away, new optimizations typically affect only a few templates at a time.

>  sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +sub, 0x83/5, APX_F, Modrm|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  sub, 0x83/5, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  sub, 0x2c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +sub, 0x80/5, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  sub, 0x80/5, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }

There are still only 3 new templates here (and also above for add, plus for
other similar insns), when ...

>  dec, 0x48, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
> +dec, 0xfe/1, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  dec, 0xfe/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
> +sbb, 0x18, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  sbb, 0x18, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x18, APX_F, D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x83/3, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  sbb, 0x83/3, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x83/3, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  sbb, 0x1c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +sbb, 0x80/3, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  sbb, 0x80/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x80/3, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }

... there are 6 new templates here. This is again an aspect I had pointed
out before. You cannot defer the addition of the other 3 until the NF patch,
as you want to make sure that with just this patch in place both

    {evex} sbb %eax, %eax

and

    {evex} sub %eax, %eax

actually assemble, and to EVEX encodings. I can't see how that would work
in the latter case without those further templates.

The alternative is to also defer adding the 2-operand SBB templates (and
any others you add here which don't use DstVVVV).

>  cmp, 0x38, 0, D|W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  cmp, 0x83/7, 0, Modrm|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> @@ -318,31 +336,50 @@ test, 0x84, 0, D|W|C|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64, R
>  test, 0xa8, 0, W|No_sSuf|Optimize, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
>  test, 0xf6/0, 0, W|Modrm|No_sSuf|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
> +and, 0x20, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  and, 0x20, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +and, 0x83/4, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  and, 0x83/4, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock|Optimize, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  and, 0x24, 0, W|No_sSuf|Optimize, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +and, 0x80/4, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  and, 0x80/4, 0, W|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
> +or, 0x8, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  or, 0x8, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +or, 0x83/1, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  or, 0x83/1, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  or, 0xc, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +or, 0x80/1, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  or, 0x80/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
> +xor, 0x30, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  xor, 0x30, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +xor, 0x83/6, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  xor, 0x83/6, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  xor, 0x34, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +xor, 0x80/6, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  xor, 0x80/6, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
>  // clr with 1 operand is really xor with 2 operands.
>  clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, { Reg8|Reg16|Reg32|Reg64 }

Btw., for consistency this may also want accompanying with an EVEX counterpart.

> +adc, 0x10, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  adc, 0x10, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x10, APX_F, D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x83/2, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  adc, 0x83/2, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x83/2, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>  adc, 0x14, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> +adc, 0x80/2, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  adc, 0x80/2, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x80/2, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
> +neg, 0xf6/3, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  neg, 0xf6/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +
> +not, 0xf6/2, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  not, 0xf6/2, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +not, 0xf6/2, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  
>  aaa, 0x37, No64, NoSuf, {}
>  aas, 0x3f, No64, NoSuf, {}
> @@ -375,6 +412,7 @@ cqto, 0x99, x64, Size64|NoSuf, {}
>  // These multiplies can only be selected with single operand forms.
>  mul, 0xf6/4, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  imul, 0xf6/5, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +imul, 0xaf, APX_F, C|Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg16|Reg32|Reg64|Unspecified|Word|Dword|Qword|BaseIndex, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64 }

Missing NF?

>  imul, 0xfaf, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|Word|Dword|Qword|BaseIndex, Reg16|Reg32|Reg64 }
>  imul, 0x6b, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  imul, 0x69, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
> @@ -389,52 +427,98 @@ div, 0xf6/6, 0, W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|
>  idiv, 0xf6/7, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  idiv, 0xf6/7, 0, W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Acc|Byte|Word|Dword|Qword }
>  
> +rol, 0xd0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  rol, 0xd0/0, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +rol, 0xc0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm8S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  rol, 0xc0/0, i186, W|Modrm|No_sSuf, { Imm8|Imm8S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +rol, 0xd2/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  rol, 0xd2/0, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +rol, 0xd0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }

Didn't we agree to avoid adding this (and its sibling) template, for the omitted
shift count being ambiguous? Consider

    rol %cl, %al

Is this a rotate by %cl, or a 1-bit NDD rotate?

Jan
  
Jan Beulich Dec. 8, 2023, 2:27 p.m. UTC | #2
On 24.11.2023 08:02, Cui, Lili wrote:
> --- a/opcodes/i386-dis-evex-reg.h
> +++ b/opcodes/i386-dis-evex-reg.h
> @@ -56,3 +56,58 @@
>      { "blsmskS",	{ VexGdq, Edq }, 0 },
>      { "blsiS",	{ VexGdq, Edq }, 0 },
>    },
> +  /* REG_EVEX_MAP4_80 */
> +  {
> +    { "addA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "orA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "adcA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "sbbA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "andA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "subA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "xorA",	{ VexGb, Eb, Ib }, NO_PREFIX },

Don't these need to use PREFIX_NP_OR_DATA? The doc clearly says
".IGNORED" there. (Applies to other byte ops as well then, of course.)

Jan
  
Cui, Lili Dec. 11, 2023, 1:36 p.m. UTC | #3
> On 24.11.2023 08:02, Cui, Lili wrote:
> > @@ -8870,25 +8890,33 @@ build_modrm_byte (void)
> >  				     || i.vec_encoding == vex_encoding_evex));
> >      }
> >
> > -  for (v = source + 1; v < dest; ++v)
> > -    if (v != reg_slot)
> > -      break;
> > -  if (v >= dest)
> > -    v = ~0;
> > -  if (i.tm.extension_opcode != None)
> > +  if (i.tm.opcode_modifier.vexvvvv == VexVVVV_DST)
> >      {
> > -      if (dest != source)
> > -	v = dest;
> > -      dest = ~0;
> > +      v = dest;
> > +      dest-- ;
> 
> Nit: Stray blank.
> 

Done.

> >      }
> > -  gas_assert (source < dest);
> 
> Starting from this line, do you really need to move that into the "else"
> branch? It looks to me as if it could stay here. (Maybe I'm wrong with the
> assertion itself, but ...
> 
> > -  if (i.tm.opcode_modifier.operandconstraint == SWAP_SOURCES
> > -      && source != op)
> 
> ... this entire if() pretty surely can stay as is, as there are no templates with
> both DstVVVV and SwapSources afaict. (Thing is - as before - that it isn't easy
> to see that what is happening here is really just re-indentation. Iirc in an
> earlier version there actually were hidden changes.) If you want this moved as
> an optimization, please do so in a separate patch.
> 

Moved "i.tm.extension_opcode != None" and SWAP_SOURCES.

  if (i.tm.opcode_modifier.vexvvvv == VexVVVV_DST)
    {
      v = dest;
      dest-- ;
    }
  else
    {
      for (v = source + 1; v < dest; ++v)
        if (v != reg_slot)
          break;
      if (v >= dest)
        v = ~0;
    }
  if (i.tm.extension_opcode != None)
    {
      if (dest != source)
        v = dest;
      dest = ~0;
    }
  gas_assert (source < dest);
  if (i.tm.opcode_modifier.operandconstraint == SWAP_SOURCES
      && source != op)
    {
      unsigned int tmp = source;

      source = v;
      v = tmp;
    }

> > --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
> > +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
> > @@ -27,4 +27,6 @@ Disassembly of section .text:
> >  [ 	]*[a-f0-9]+:[ 	]+c8 ff ff ff[ 	]+enter  \$0xffff,\$0xff
> >  [ 	]*[a-f0-9]+:[ 	]+67 62 f2 7c 18 f5[ 	]+addr32 \(bad\)
> >  [ 	]*[a-f0-9]+:[ 	]+0b ff[ 	]+or     %edi,%edi
> > +[ 	]*[a-f0-9]+:[ 	]+62 f4 fc 08 ff[ 	]+\(bad\)
> > +[ 	]*[a-f0-9]+:[ 	]+d8[ 	]+.byte 0xd8
> >  #pass
> > --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
> > +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
> > @@ -26,3 +26,5 @@ _start:
> >  	#EVEX from VEX bzhi %ebx,%eax,%ecx EVEX.P[20](EVEX.b) == 1 (illegal value).
> >  	.insn EVEX.L0.NP.0f38.W0 0xf5, %eax ,(%ebx){1to8}, %ecx
> >  	.byte 0xff
> > +	#{evex} inc %rax %rbx EVEX.vvvv' != 1111 && EVEX.ND = 0.
> > +	.insn EVEX.L0.NP.M4.W1 0xff, %rax, %rbx
> 
> I don't think this does what you want. In the .d file the 4 bits are all set. I think
> you mean something like
> 
> 	.insn EVEX.L0.NP.M4.W1 0xff/0, %rcx, %rbx
> 
> (i.e. ModR/M.reg specified as opcode extension _and_ the first operand not
> the accumulator). The reason disassembly fails for what you've used looks to
> be ModR/M.reg == 0b011 (resulting from the use of %rbx).
> 

Changed it to use SIB, so there is no need to add 0xff.

.insn EVEX.L0.NP.M4.W1 0xff/0, (%rax,%rcx), %rbx

0000000000000000 <_start>:
   0:   62 f4 e4                (bad)
   3:   08 ff                      or     %bh,%bh
   5:   04 08                   add    $0x8,%al

> (Also, nit: What's EVEX.vvvv' ? I.e. what's the ' there about?)
> 

Oh, it should be EVEX.vvvv.

> > --- /dev/null
> > +++ b/gas/testsuite/gas/i386/x86-64-apx-ndd.s
> > @@ -0,0 +1,155 @@
> > +# Check 64bit APX NDD instructions with evex prefix encoding
> > +
> > +	.allow_index_reg
> > +	.text
> > +_start:
> > +	adc    $0x1234,%ax,%r30w
> > +	adc    %r15b,%r17b,%r18b
> > +	adc    %r15d,(%r8),%r18d
> > +	adc    (%r15,%rax,1),%r16b,%r8b
> > +	adc    (%r15,%rax,1),%r16w,%r8w
> > +	adcl   $0x11,(%r19,%rax,4),%r20d
> > +	adcx   %r15d,%r8d,%r18d
> > +	adcx   (%r15,%r31,1),%r8
> > +	adcx   (%r15,%r31,1),%r8d,%r18d
> > +	add    $0x1234,%ax,%r30w
> > +	add    $0x12344433,%r15,%r16
> > +	add    $0x34,%r13b,%r17b
> > +	add    $0xfffffffff4332211,%rax,%r8
> > +	add    %r31,%r8,%r16
> > +	add    %r31,(%r8),%r16
> > +	add    %r31,(%r8,%r16,8),%r16
> > +	add    %r31b,%r8b,%r16b
> > +	add    %r31d,%r8d,%r16d
> > +	add    %r31w,%r8w,%r16w
> > +	add    (%r31),%r8,%r16
> > +	add    0x9090(%r31,%r16,1),%r8,%r16
> > +	addb    %r31b,%r8b,%r16b
> > +	addl    %r31d,%r8d,%r16d
> > +	addl   $0x11,(%r19,%rax,4),%r20d
> > +	addq    %r31,%r8,%r16
> > +	addq   $0x12344433,(%r15,%rcx,4),%r16
> > +	addw    %r31w,%r8w,%r16w
> > +	adox   %r15d,%r8d,%r18d
> 
> Nit: Inconsistent blank padding.
> 

Done.

> > +	{load}  add    %r31,%r8,%r16
> > +	{store} add    %r31,%r8,%r16
> > +	adox   (%r15,%r31,1),%r8
> > +	adox   (%r15,%r31,1),%r8d,%r18d
> > +	and    $0x1234,%ax,%r30w
> > +	and    %r15b,%r17b,%r18b
> > +	and    %r15d,(%r8),%r18d
> > +	and    (%r15,%rax,1),%r16b,%r8b
> > +	and    (%r15,%rax,1),%r16w,%r8w
> > +	andl   $0x11,(%r19,%rax,4),%r20d
> > +	cmova  0x90909090(%eax),%edx,%r8d
> > +	cmovae 0x90909090(%eax),%edx,%r8d
> > +	cmovb  0x90909090(%eax),%edx,%r8d
> > +	cmovbe 0x90909090(%eax),%edx,%r8d
> > +	cmove  0x90909090(%eax),%edx,%r8d
> > +	cmovg  0x90909090(%eax),%edx,%r8d
> > +	cmovge 0x90909090(%eax),%edx,%r8d
> > +	cmovl  0x90909090(%eax),%edx,%r8d
> > +	cmovle 0x90909090(%eax),%edx,%r8d
> > +	cmovne 0x90909090(%eax),%edx,%r8d
> > +	cmovno 0x90909090(%eax),%edx,%r8d
> > +	cmovnp 0x90909090(%eax),%edx,%r8d
> > +	cmovns 0x90909090(%eax),%edx,%r8d
> > +	cmovo  0x90909090(%eax),%edx,%r8d
> > +	cmovp  0x90909090(%eax),%edx,%r8d
> > +	cmovs  0x90909090(%eax),%edx,%r8d
> > +	dec    %rax,%r17
> > +	decb   (%r31,%r12,1),%r8b
> > +	imul   0x909(%rax,%r31,8),%rdx,%r25
> > +	imul   0x90909(%eax),%edx,%r8d
> > +	inc    %r31,%r16
> > +	inc    %r31,%r8
> > +	inc    %rax,%rbx
> > +	neg    %rax,%r17
> > +	negb   (%r31,%r12,1),%r8b
> > +	not    %rax,%r17
> > +	notb   (%r31,%r12,1),%r8b
> > +	or     $0x1234,%ax,%r30w
> > +	or     %r15b,%r17b,%r18b
> > +	or     %r15d,(%r8),%r18d
> > +	or     (%r15,%rax,1),%r16b,%r8b
> > +	or     (%r15,%rax,1),%r16w,%r8w
> > +	orl    $0x11,(%r19,%rax,4),%r20d
> > +	rcl    $0x2,%r12b,%r31b
> > +	rcl    %cl,%r16b,%r8b
> > +	rclb   $0x1, (%rax),%r31b
> > +	rcll   $0x2,(%rax),%r31d
> > +	rclw   $0x1, (%rax),%r31w
> 
> Nit: Would be nice if there consistently were or were not blanks after the
> commas.
> 

Done.

> > --- a/opcodes/i386-opc.tbl
> > +++ b/opcodes/i386-opc.tbl
> > @@ -139,9 +139,13 @@
> >  #define Vsz256 Vsz=VSZ256
> >  #define Vsz512 Vsz=VSZ512
> >
> > +#define DstVVVV VexVVVV=VexVVVV_DST
> > +
> >  // The EVEX purpose of StaticRounding appears only together with SAE. Re-use
> >  // the bit to mark commutative VEX encodings where swapping the source
> >  // operands may allow to switch from 3-byte to 2-byte VEX encoding.
> > +// And re-use the bit to mark some NDD insns that swapping the source operands
> > +// may allow to switch from EVEX encoding to REX2 encoding.
> >  #define C StaticRounding
> >
> >  #define FP 387|287|8087
> > @@ -288,26 +292,40 @@ std, 0xfd, 0, NoSuf, {}
> >  sti, 0xfb, 0, NoSuf, {}
> >
> >  // Arithmetic.
> > +add, 0x0, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
> 
> There is _still_ Byte|Word|Dword|Qword in here (and below), when I think I
> pointed out more than once before that in new templates such redundancy
> wants omitting.
> 
> Since this isn't the first instance of earlier review comments not taken care of,
> may I please ask that you make reasonably sure that new versions aren't sent
> out like this?
> 

This part could indeed be omitted, but I really don't remember you mentioning it on the APX patches. There are still a lot of redundant Byte|Word|Dword|Qword in the opcode table; APX just added some flags on top of the old ones. Do you mind if I create a patch first to remove the redundant parts from master?

> >  add, 0x0, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
> > Reg8|Reg16|Reg32|Reg64,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +add, 0x83/0, APX_F,
> >
> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|
> NF, {
> > +Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> > +Reg16|Reg32|Reg64 }
> >  add, 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> > Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  add,
> 0x4,
> > 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> Acc|Byte|Word|Dword|Qword }
> > +add, 0x80/0, APX_F,
> > +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> > +Imm8|Imm16|Imm32|Imm32S,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64}
> >  add, 0x80/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> > Imm8|Imm16|Imm32|Imm32S,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> >
> >  inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
> > +inc, 0xfe/0, APX_F,
> > +W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF,
> >
> +{Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64}
> >  inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> >
> > +sub, 0x28, APX_F,
> >
> +D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Opti
> mize|
> > +NF, { Reg8|Reg16|Reg32|Reg64,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64, }
> 
> Here and elsewhere, what's Optimize for? It not being there on other
> templates, it can't be for the EVEX->REX2 optimization? If there are further
> optimization plans, that's (again) something to mention in the description. Yet
> better would be if such attributes were added only when respective
> optimizations are actually introduced. Unlike e.g. NF, which would mean
> another bulk update if not added right away, new optimizations typically affect
> only a few templates at a time.
> 

Optimize is not new.

sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Optimize|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }
sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }

> >  sub, 0x28, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> > Reg8|Reg16|Reg32|Reg64,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +sub, 0x83/5, APX_F,
> > +Modrm|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S,
> > +Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> > +Reg16|Reg32|Reg64 }
> >  sub, 0x83/5, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> > Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  sub,
> 0x2c,
> > 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> Acc|Byte|Word|Dword|Qword }
> > +sub, 0x80/5, APX_F,
> > +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> > +Imm8|Imm16|Imm32|Imm32S,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64 }
> >  sub, 0x80/5, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> > Imm8|Imm16|Imm32|Imm32S,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> 
> There are still only 3 new templates here (and also above for add, plus for
> other similar insns), when ...
> 
> >  dec, 0x48, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
> > +dec, 0xfe/1, APX_F,
> > +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64 }
> >  dec, 0xfe/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> >
> > +sbb, 0x18, APX_F,
> > +D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4, {
> > +Reg8|Reg16|Reg32|Reg64,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64 }
> >  sbb, 0x18, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
> > Reg8|Reg16|Reg32|Reg64,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +sbb, 0x18, APX_F,
> > +D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, {
> > +Reg8|Reg16|Reg32|Reg64,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x }
> > +sbb, 0x83/3, APX_F,
> >
> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4,
> {
> > +Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> > +Reg16|Reg32|Reg64 }
> >  sbb, 0x83/3, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> > Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> > +sbb, 0x83/3, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf,
> { Imm8S,
> > +Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> >  sbb, 0x1c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> > Acc|Byte|Word|Dword|Qword }
> > +sbb, 0x80/3, APX_F,
> > +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, {
> > +Imm8|Imm16|Imm32|Imm32S,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64 }
> >  sbb, 0x80/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> > Imm8|Imm16|Imm32|Imm32S,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +sbb, 0x80/3, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, {
> > +Imm8|Imm16|Imm32|Imm32S,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x }
> 
> ... there are 6 new templates here. This is again an aspect I had pointed out
> before. You cannot defer the addition of the other 3 until the NF patch, as you
> want to make sure that with just this patch in place something both
> 
>     {evex} sbb %eax, %eax
> 
> and
> 
>     {evex} sub %eax, %eax
> 
> actually assemble, and to EVEX encodings. I can't see how that would work in
> the latter case without those further templates.
> 
> The alternative is to also defer adding the 2-operand SBB templates (and any
> others you add here which don't use DstVVVV).
> 

I'm having a headache with this. Some instructions like sbb don't support NF; originally they were in the 4/9 patch, but their disassembler support is in the NDD patch, and you agreed to put them in the NDD patch. Now I really don't know where to move them. Moving encoding, decoding, and especially test cases for instructions between patches is cumbersome and I really don't think it makes much sense.

> >  xor, 0x30, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> > Reg8|Reg16|Reg32|Reg64,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +xor, 0x83/6, APX_F,
> >
> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|
> NF, {
> > +Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> > +Reg16|Reg32|Reg64 }
> >  xor, 0x83/6, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> > Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  xor,
> 0x34,
> > 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> Acc|Byte|Word|Dword|Qword }
> > +xor, 0x80/6, APX_F,
> > +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> > +Imm8|Imm16|Imm32|Imm32S,
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64 }
> >  xor, 0x80/6, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> > Imm8|Imm16|Imm32|Imm32S,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> >
> >  // clr with 1 operand is really xor with 2 operands.
> >  clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, {
> > Reg8|Reg16|Reg32|Reg64 }
> 
> Btw., for consistency this may also want accompanying with an EVEX
> counterpart.
> 

Do you mean to add an entry like this? It should belong to the previous patch.

// clr with 1 operand is really xor with 2 operands.
clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, { Reg8|Reg16|Reg32|Reg64 }
clr, 0x30, APX_F, W|Modrm|No_sSuf|RegKludge|EVex128|EVexMap4|Optimize, { Reg8|Reg16|Reg32|Reg64 }

> >  mul, 0xf6/4, 0, W|Modrm|No_sSuf, {
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > imul, 0xf6/5, 0, W|Modrm|No_sSuf, {
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +imul, 0xaf, APX_F,
> >
> +C|Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap
> 4, {
> > +Reg16|Reg32|Reg64|Unspecified|Word|Dword|Qword|BaseIndex,
> > +Reg16|Reg32|Reg64, Reg16|Reg32|Reg64 }
> 
> Missing NF?
> 

Oh, when I rebased the NF patch, I found this missing and fixed it.

> >  rol, 0xd2/0, 0, W|Modrm|No_sSuf, { ShiftCount,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> > +rol, 0xd0/0, APX_F,
> > +W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, {
> >
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> x,
> > +Reg8|Reg16|Reg32|Reg64 }
> 
> Didn't we agree to avoid adding this (and its sibling) template, for the omitted
> shift count being ambiguous? Consider
> 
>     rol %cl, %al
> 
> Is this a rotate by %cl, or a 1-bit NDD rotate?
> 

These entries should be deleted.

rol, 0xd0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }

Thanks,
Lili.
  
Jan Beulich Dec. 11, 2023, 4:50 p.m. UTC | #4
On 11.12.2023 14:36, Cui, Lili wrote:
>> On 24.11.2023 08:02, Cui, Lili wrote:
>>> --- a/opcodes/i386-opc.tbl
>>> +++ b/opcodes/i386-opc.tbl
>>> @@ -139,9 +139,13 @@
>>>  #define Vsz256 Vsz=VSZ256
>>>  #define Vsz512 Vsz=VSZ512
>>>
>>> +#define DstVVVV VexVVVV=VexVVVV_DST
>>> +
>>>  // The EVEX purpose of StaticRounding appears only together with SAE.
>>> Re-use  // the bit to mark commutative VEX encodings where swapping
>>> the source  // operands may allow to switch from 3-byte to 2-byte VEX
>> encoding.
>>> +// And re-use the bit to mark some NDD insns that swapping the source
>>> +operands // may allow to switch from EVEX encoding to REX2 encoding.
>>>  #define C StaticRounding
>>>
>>>  #define FP 387|287|8087
>>> @@ -288,26 +292,40 @@ std, 0xfd, 0, NoSuf, {}  sti, 0xfb, 0, NoSuf, {}
>>>
>>>  // Arithmetic.
>>> +add, 0x0, APX_F,
>>>
>> +D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|N
>> F, {
>>> +Reg8|Reg16|Reg32|Reg64,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64 }
>>
>> There is _still_ Byte|Word|Dword|Qword in here (and below), when I think I
>> pointed out more than once before that in new templates such redundancy
>> wants omitting.
>>
>> Since this isn't the first instance of earlier review comments not taken care of,
>> may I please ask that you make reasonably sure that new versions aren't sent
>> out like this?
>>
> 
> This part could indeed be omitted, but I really don't remember you mentioning it on the APX patches.

Already in e.g.
https://sourceware.org/pipermail/binutils/2023-November/130422.html
I pointed out that such earlier comments in e.g.
https://sourceware.org/pipermail/binutils/2023-September/129590.html
were not addressed.

> There are still a lot of redundant Byte|Word|Dword|Qword in the opcode table, APX just added some flags on top of the old ones. Do you mind if I create a patch first to remove the redundant parts of master?

I don't mind you cleaning up first. It's just that normally I wouldn't do
so in a separate patch (one of the reasons being that such non-functional
changes get in the way of using "git blame" or alike when trying to find
the most recent real change to a line), unless it was only a handful of
instances left. Instead I typically do such tidying as lines are touched
anyway. Thing here simply is that new templates shouldn't have such
anomalies anymore.

>>>  add, 0x0, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
>>> Reg8|Reg16|Reg32|Reg64,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>> +add, 0x83/0, APX_F,
>>>
>> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|
>> NF, {
>>> +Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
>>> +Reg16|Reg32|Reg64 }
>>>  add, 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
>>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  add,
>> 0x4,
>>> 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
>> Acc|Byte|Word|Dword|Qword }
>>> +add, 0x80/0, APX_F,
>>> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
>>> +Imm8|Imm16|Imm32|Imm32S,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64}
>>>  add, 0x80/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
>>> Imm8|Imm16|Imm32|Imm32S,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>>
>>>  inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
>>> +inc, 0xfe/0, APX_F,
>>> +W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF,
>>>
>> +{Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64}
>>>  inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>>
>>> +sub, 0x28, APX_F,
>>>
>> +D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Opti
>> mize|
>>> +NF, { Reg8|Reg16|Reg32|Reg64,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64, }
>>
>> Here and elsewhere, what's Optimize for? It not being there on other
>> templates, it can't be for the EVEX->REX2 optimization? If there are further
>> optimization plans, that's (again) something to mention in the description. Yet
>> better would be if such attributes were added only when respective
>> optimizations are actually introduced. Unlike e.g. NF, which would mean
>> another bulk update if not added right away, new optimizations typically affect
>> only a few templates at a time.
>>
> 
> Optimize is not new.
> 
> sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Optimize|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }
> sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }

Optimize is legitimately there for the legacy template. If the new template
also wants it, there needs to be some reason. Otherwise it is part of the
transformation to APX/EVEX to drop it.

>>>  sub, 0x28, 0,
>>> D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
>>> Reg8|Reg16|Reg32|Reg64,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>> +sub, 0x83/5, APX_F,
>>> +Modrm|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S,
>>> +Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
>>> +Reg16|Reg32|Reg64 }
>>>  sub, 0x83/5, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
>>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  sub,
>> 0x2c,
>>> 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
>> Acc|Byte|Word|Dword|Qword }
>>> +sub, 0x80/5, APX_F,
>>> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
>>> +Imm8|Imm16|Imm32|Imm32S,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64 }
>>>  sub, 0x80/5, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
>>> Imm8|Imm16|Imm32|Imm32S,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>
>> There are still only 3 new templates here (and also above for add, plus for
>> other similar insns), when ...
>>
>>>  dec, 0x48, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
>>> +dec, 0xfe/1, APX_F,
>>> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64 }
>>>  dec, 0xfe/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>>
>>> +sbb, 0x18, APX_F,
>>> +D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4, {
>>> +Reg8|Reg16|Reg32|Reg64,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64 }
>>>  sbb, 0x18, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
>>> Reg8|Reg16|Reg32|Reg64,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>> +sbb, 0x18, APX_F,
>>> +D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, {
>>> +Reg8|Reg16|Reg32|Reg64,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x }
>>> +sbb, 0x83/3, APX_F,
>>>
>> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4,
>> {
>>> +Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
>>> +Reg16|Reg32|Reg64 }
>>>  sbb, 0x83/3, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
>>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>>> +sbb, 0x83/3, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf,
>> { Imm8S,
>>> +Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
>>>  sbb, 0x1c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
>>> Acc|Byte|Word|Dword|Qword }
>>> +sbb, 0x80/3, APX_F,
>>> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, {
>>> +Imm8|Imm16|Imm32|Imm32S,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64 }
>>>  sbb, 0x80/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
>>> Imm8|Imm16|Imm32|Imm32S,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>> +sbb, 0x80/3, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, {
>>> +Imm8|Imm16|Imm32|Imm32S,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x }
>>
>> ... there are 6 new templates here. This is again an aspect I had pointed out
>> before. You cannot defer the addition of the other 3 until the NF patch, as you
>> want to make sure that with just this patch in place something both
>>
>>     {evex} sbb %eax, %eax
>>
>> and
>>
>>     {evex} sub %eax, %eax
>>
>> actually assemble, and to EVEX encodings. I can't see how that would work in
>> the latter case without those further templates.
>>
>> The alternative is to also defer adding the 2-operand SBB templates (and any
>> others you add here which don't use DstVVVV).
>>
> 
> I'm having a headache with this, some instructions like sbb don't support NF, originally they were in the 4/9 patch, but their disassemblers are in the NDD patch, and you agreed to put them in the NDD patch.

Right, yet still the overall result wants to be consistent. Hence why I'm
not demanding that you move these templates yet later (which is one
option). Instead I've indicated that moving the others ahead would also
be okay.

Like with any series, you want it to be in a shape where it can be committed
piecemeal. Which is even more important with a release around the corner.
If we end up with just partial APX support in 2.42, that partial support
should be in a shape that's predictable to users.

> Now I really don't know where to move. Moving encoding, decoding, and especially test cases for instructions between patches is cumbersome and I really don't think it makes much sense.

I can see your point, and I'm sorry for the hassle. Part of the problem of
the moving being troublesome is (imo) that many of the patches simply were
(are) doing too many things at a time anyway.

>>>  xor, 0x30, 0,
>>> D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
>>> Reg8|Reg16|Reg32|Reg64,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>> +xor, 0x83/6, APX_F,
>>>
>> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|
>> NF, {
>>> +Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
>>> +Reg16|Reg32|Reg64 }
>>>  xor, 0x83/6, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
>>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  xor,
>> 0x34,
>>> 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
>> Acc|Byte|Word|Dword|Qword }
>>> +xor, 0x80/6, APX_F,
>>> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
>>> +Imm8|Imm16|Imm32|Imm32S,
>>>
>> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
>> x,
>>> +Reg8|Reg16|Reg32|Reg64 }
>>>  xor, 0x80/6, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
>>> Imm8|Imm16|Imm32|Imm32S,
>>>
>> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
>> }
>>>
>>>  // clr with 1 operand is really xor with 2 operands.
>>>  clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, {
>>> Reg8|Reg16|Reg32|Reg64 }
>>
>> Btw., for consistency this may also want accompanying with an EVEX
>> counterpart.
>>
> 
> Do you mean to add an entry like this? It should belong to the previous patch.
> 
> // clr with 1 operand is really xor with 2 operands.
> clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, { Reg8|Reg16|Reg32|Reg64 }
> clr, 0x30, APX_F, W|Modrm|No_sSuf|RegKludge|EVex128|EVexMap4|Optimize, { Reg8|Reg16|Reg32|Reg64 }

Yes, something like this. And possibly indeed not the patch here; the
template simply happened to be in context. Where exactly it wants to
go depends - see above - on where other similar templates are
introduced. Note however that the corresponding XOR templates are
introduced here, just above and still in context.

Jan
  
Cui, Lili Dec. 12, 2023, 5:53 a.m. UTC | #5
> On 24.11.2023 08:02, Cui, Lili wrote:
> > --- a/opcodes/i386-dis-evex-reg.h
> > +++ b/opcodes/i386-dis-evex-reg.h
> > @@ -56,3 +56,58 @@
> >      { "blsmskS",	{ VexGdq, Edq }, 0 },
> >      { "blsiS",	{ VexGdq, Edq }, 0 },
> >    },
> > +  /* REG_EVEX_MAP4_80 */
> > +  {
> > +    { "addA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "orA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "adcA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "sbbA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "andA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "subA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "xorA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> 
> Don't these need to use PREFIX_NP_OR_DATA? The doc clearly says
> ".IGNORED" there. (Applies to other byte ops as well then, of course.)
> 

I'm confused here: "IGNORED" means the W bit in the EVEX payload is ignored. Why is 0x66 allowed?

Thanks,
Lili.
  
Jan Beulich Dec. 12, 2023, 8:28 a.m. UTC | #6
On 12.12.2023 06:53, Cui, Lili wrote:
>> On 24.11.2023 08:02, Cui, Lili wrote:
>>> --- a/opcodes/i386-dis-evex-reg.h
>>> +++ b/opcodes/i386-dis-evex-reg.h
>>> @@ -56,3 +56,58 @@
>>>      { "blsmskS",	{ VexGdq, Edq }, 0 },
>>>      { "blsiS",	{ VexGdq, Edq }, 0 },
>>>    },
>>> +  /* REG_EVEX_MAP4_80 */
>>> +  {
>>> +    { "addA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>> +    { "orA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>> +    { "adcA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>> +    { "sbbA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>> +    { "andA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>> +    { "subA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>> +    { "xorA",	{ VexGb, Eb, Ib }, NO_PREFIX },
>>
>> Don't these need to use PREFIX_NP_OR_DATA? The doc clearly says
>> ".IGNORED" there. (Applies to other byte ops as well then, of course.)
>>
> 
> I'm confused here, "IGNORED" means the W bit in the EVEX payload is ignored. Why is 0x66 allowed?

Hmm, looks like I have been confused. Earlier communication had led me to
the impression that pp == 0b01 would be ignored here. In fact I had my
own disassembler library the other way originally, and I then changed it
to this model. Looks like I need to change it back. And I'm sorry for
causing confusion here.

Jan
  
Cui, Lili Dec. 13, 2023, 10:42 a.m. UTC | #7
> >>> +add, 0x0, APX_F,
> +D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|N
> >> F, {
> >>> +Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64 }
> >>
> >> There is _still_ Byte|Word|Dword|Qword in here (and below), when I
> >> think I pointed out more than once before that in new templates such
> >> redundancy wants omitting.
> >>
> >> Since this isn't the first instance of earlier review comments not
> >> taken care of, may I please ask that you make reasonably sure that
> >> new versions aren't sent out like this?
> >>
> >
> > This part could indeed be omitted, but I really don't remember you
> mentioning it on the APX patches.
> 
> Already in e.g.
> https://sourceware.org/pipermail/binutils/2023-November/130422.html
> I pointed out that such earlier comments in e.g.
> https://sourceware.org/pipermail/binutils/2023-September/129590.html
> were not addressed.
> 

Sorry, movbe was indeed caused by the reg I added; I didn't notice that the legacy templates have this issue as well. When you said I had something that needed to change, I didn't realize it was here at all.

> > There are still a lot of redundant Byte|Word|Dword|Qword in the opcode
> table, APX just added some flags on top of the old ones. Do you mind if I
> create a patch first to remove the redundant parts of master?
> 
> I don't mind you cleaning up first. It's just that normally I wouldn't do so in a
> separate patch (one of the reasons being that such non-functional changes get
> in the way of using "git blame" or alike when trying to find the most recent
> real change to a line), unless it was only a handful of instances left. Instead I
> typically do such tidying as lines are touched anyway. Thing here simply is that
> new templates shouldn't have such anomalies anymore.
> 

I still want to change them. It's easy to be misled.

> +D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Opti
> >> mize|
> >>> +NF, { Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64, }
> >>
> >> Here and elsewhere, what's Optimize for? It not being there on other
> >> templates, it can't be for the EVEX->REX2 optimization? If there are
> >> further optimization plans, that's (again) something to mention in
> >> the description. Yet better would be if such attributes were added
> >> only when respective optimizations are actually introduced. Unlike
> >> e.g. NF, which would mean another bulk update if not added right
> >> away, new optimizations typically affect only a few templates at a time.
> >>
> >
> > Optimize is not new.
> >
> > sub, 0x28, APX_F,
> >
> D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Opti
> mize|N
> > F, { Reg8|Reg16|Reg32|Reg64,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64, } sub, 0x28, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> > Reg8|Reg16|Reg32|Reg64,
> >
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> }
> 
> Optimize is legitimately there for the legacy template. If the new template also
> wants it, there needs to be some reason. Otherwise it is part of the
> transformation to APX/EVEX to drop it.
> 

Dropped Optimize, thanks.

> >>>  sub, 0x28, 0,
> >>> D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> >>> Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>> +sub, 0x83/5, APX_F,
> >>> +Modrm|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S,
> >>> +Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> >>> +Reg16|Reg32|Reg64 }
> >>>  sub, 0x83/5, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> >>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  sub,
> >> 0x2c,
> >>> 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> >> Acc|Byte|Word|Dword|Qword }
> >>> +sub, 0x80/5, APX_F,
> >>>
> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> >>> +Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64 }
> >>>  sub, 0x80/5, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> >>> Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>
> >> There are still only 3 new templates here (and also above for add,
> >> plus for other similar insns), when ...
> >>
> >>>  dec, 0x48, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
> >>> +dec, 0xfe/1, APX_F,
> >>>
> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64 }
> >>>  dec, 0xfe/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>>
> >>> +sbb, 0x18, APX_F,
> >>> +D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4,
> {
> >>> +Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64 }
> >>>  sbb, 0x18, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
> >>> Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>> +sbb, 0x18, APX_F,
> >>> +D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, {
> >>> +Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x }
> >>> +sbb, 0x83/3, APX_F,
> >>>
> >>
> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4,
> >> {
> >>> +Imm8S,
> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> >>> +Reg16|Reg32|Reg64 }
> >>>  sbb, 0x83/3, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> >>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> >>> +sbb, 0x83/3, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf,
> >> { Imm8S,
> >>> +Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> >>>  sbb, 0x1c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> >>> Acc|Byte|Word|Dword|Qword }
> >>> +sbb, 0x80/3, APX_F,
> >>> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, {
> >>> +Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64 }
> >>>  sbb, 0x80/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> >>> Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>> +sbb, 0x80/3, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, {
> >>> +Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x }
> >>
> >> ... there are 6 new templates here. This is again an aspect I had
> >> pointed out before. You cannot defer the addition of the other 3
> >> until the NF patch, as you want to make sure that with just this
> >> patch in place something both
> >>
> >>     {evex} sbb %eax, %eax
> >>
> >> and
> >>
> >>     {evex} sub %eax, %eax
> >>
> >> actually assemble, and to EVEX encodings. I can't see how that would
> >> work in the latter case without those further templates.
> >>
> >> The alternative is to also defer adding the 2-operand SBB templates
> >> (and any others you add here which don't use DstVVVV).
> >>
> >
> > I'm having a headache with this, some instructions like sbb don't support NF,
> originally they were in the 4/9 patch, but their disassemblers are in the NDD
> patch, and you agreed to put them in the NDD patch.
> 
> Right, yet still the overall result wants to be consistent. Hence why I'm not
> demanding that you move these templates yet later (which is one option).
> Instead I've indicated that moving the others ahead would also be okay.
> 
 
I'd like to move them into the NF patch, which only requires moving the templates. The second method, however, is more cumbersome and requires moving the encoder, decoder, and test cases to the EVEX EGPR patch.

> Like with any series, you want it to be in a shape where it can be committed
> piecemeal. Which is even more important with a release around the corner.
> If we end up with just partial APX support in 2.42, that partial support should
> be in a shape that's predictable to users.
> 
> > Now I really don't know where to move. Moving encoding, decoding, and
> especially test cases for instructions between patches is cumbersome and I
> really don't think it makes much sense.
> 
> I can see your point, and I'm sorry for the hassle. Part of the problem of the
> moving being troublesome is (imo) that many of the patches simply were
> (are) doing too many things at a time anyway.
> 
> >>>  xor, 0x30, 0,
> >>> D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> >>> Reg8|Reg16|Reg32|Reg64,
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>> +xor, 0x83/6, APX_F,
> >>>
> >>
> +Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|
> >> NF, {
> >>> +Imm8S,
> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex,
> >>> +Reg16|Reg32|Reg64 }
> >>>  xor, 0x83/6, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> >>> Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }  xor,
> >> 0x34,
> >>> 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> >> Acc|Byte|Word|Dword|Qword }
> >>> +xor, 0x80/6, APX_F,
> >>>
> +W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, {
> >>> +Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> +Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseInde
> >> x,
> >>> +Reg8|Reg16|Reg32|Reg64 }
> >>>  xor, 0x80/6, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> >>> Imm8|Imm16|Imm32|Imm32S,
> >>>
> >>
> Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex
> >> }
> >>>
> >>>  // clr with 1 operand is really xor with 2 operands.
> >>>  clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, {
> >>> Reg8|Reg16|Reg32|Reg64 }
> >>
> >> Btw., for consistency this may also want accompanying with an EVEX
> >> counterpart.
> >>
> >
> > Do you mean to add an entry like this? It should belong to the previous
> patch.
> >
> > // clr with 1 operand is really xor with 2 operands.
> > clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, {
> > Reg8|Reg16|Reg32|Reg64 } clr, 0x30, APX_F,
> > W|Modrm|No_sSuf|RegKludge|EVex128|EVexMap4|Optimize, {
> > Reg8|Reg16|Reg32|Reg64 }
> 
> Yes, something like this. And possibly indeed not the patch here; the template
> simply happened to be in context. Where exactly it wants to go depends - see
> above - on where other similar templates are introduced. Note however that
> the corresponding XOR templates are introduced here, just above and still in
> context.
> 

For clr's evex format template, I think it should be in the NF patch, since xor's evex format template is also in that patch.

Thanks,
Lili.
  

Patch

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index ba8001fe1c8..1efda914150 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -2242,8 +2242,10 @@  operand_size_match (const insn_template *t)
       unsigned int given = i.operands - j - 1;
 
       /* For FMA4 and XOP insns VEX.W controls just the first two
-	 register operands.  */
-      if (is_cpu (t, CpuFMA4) || is_cpu (t, CpuXOP))
+	 register operands.  APX_F insns likewise swap just the two source
+	 operands, with the 3rd one being the destination.  */
+      if (is_cpu (t, CpuFMA4) || is_cpu (t, CpuXOP)
+	  || is_cpu (t, CpuAPX_F))
 	given = j < 2 ? 1 - j : j;
 
       if (t->operand_types[j].bitfield.class == Reg
@@ -4180,6 +4182,11 @@  build_apx_evex_prefix (void)
   if (i.vex.register_specifier
       && i.vex.register_specifier->reg_flags & RegRex2)
     i.vex.bytes[3] &= ~0x08;
+
+  /* Encode the NDD bit of the instruction promoted from the legacy
+     space.  */
+  if (i.vex.register_specifier && i.tm.opcode_space == SPACE_EVEXMAP4)
+    i.vex.bytes[3] |= 0x10;
 }
 
 static void
@@ -7404,18 +7411,22 @@  match_template (char mnem_suffix)
 	     - the store form is requested, and the template is a load form,
 	     - the non-default (swapped) form is requested.  */
 	  overlap1 = operand_type_and (operand_types[0], operand_types[1]);
+
+	  j = i.operands - 1 - (t->opcode_space == SPACE_EVEXMAP4
+				&& t->opcode_modifier.vexvvvv);
+
 	  if (t->opcode_modifier.d && i.reg_operands == i.operands
 	      && !operand_type_all_zero (&overlap1))
 	    switch (i.dir_encoding)
 	      {
 	      case dir_encoding_load:
-		if (operand_type_check (operand_types[i.operands - 1], anymem)
+		if (operand_type_check (operand_types[j], anymem)
 		    || t->opcode_modifier.regmem)
 		  goto check_reverse;
 		break;
 
 	      case dir_encoding_store:
-		if (!operand_type_check (operand_types[i.operands - 1], anymem)
+		if (!operand_type_check (operand_types[j], anymem)
 		    && !t->opcode_modifier.regmem)
 		  goto check_reverse;
 		break;
@@ -7426,6 +7437,7 @@  match_template (char mnem_suffix)
 	      case dir_encoding_default:
 		break;
 	      }
+
 	  /* If we want store form, we skip the current load.  */
 	  if ((i.dir_encoding == dir_encoding_store
 	       || i.dir_encoding == dir_encoding_swap)
@@ -7455,11 +7467,13 @@  match_template (char mnem_suffix)
 		continue;
 	      /* Try reversing direction of operands.  */
 	      j = is_cpu (t, CpuFMA4)
-		  || is_cpu (t, CpuXOP) ? 1 : i.operands - 1;
+		  || is_cpu (t, CpuXOP)
+		  || is_cpu (t, CpuAPX_F) ? 1 : i.operands - 1;
 	      overlap0 = operand_type_and (i.types[0], operand_types[j]);
 	      overlap1 = operand_type_and (i.types[j], operand_types[0]);
 	      overlap2 = operand_type_and (i.types[1], operand_types[1]);
-	      gas_assert (t->operands != 3 || !check_register);
+	      gas_assert (t->operands != 3 || !check_register
+			  || is_cpu (t, CpuAPX_F));
 	      if (!operand_type_match (overlap0, i.types[0])
 		  || !operand_type_match (overlap1, i.types[j])
 		  || (t->operands == 3
@@ -7494,6 +7508,11 @@  match_template (char mnem_suffix)
 		  found_reverse_match = Opcode_VexW;
 		  goto check_operands_345;
 		}
+	      else if (is_cpu (t, CpuAPX_F) && i.operands == 3)
+		{
+		  found_reverse_match = Opcode_D;
+		  goto check_operands_345;
+		}
 	      else if (t->opcode_space != SPACE_BASE
 		       && (t->opcode_space != SPACE_0F
 			   /* MOV to/from CR/DR/TR, as an exception, follow
@@ -7667,6 +7686,9 @@  match_template (char mnem_suffix)
 
       i.tm.base_opcode ^= found_reverse_match;
 
+      if (i.tm.opcode_space == SPACE_EVEXMAP4)
+	goto swap_first_2;
+
       /* Certain SIMD insns have their load forms specified in the opcode
 	 table, and hence we need to _set_ RegMem instead of clearing it.
 	 We need to avoid setting the bit though on insns like KMOVW.  */
@@ -7686,6 +7708,7 @@  match_template (char mnem_suffix)
 	 flipping VEX.W.  */
       i.tm.opcode_modifier.vexw ^= VEXW0 ^ VEXW1;
 
+    swap_first_2:
       j = i.tm.operand_types[0].bitfield.imm8;
       i.tm.operand_types[j] = operand_types[j + 1];
       i.tm.operand_types[j + 1] = operand_types[j];
@@ -8511,12 +8534,9 @@  process_operands (void)
      unnecessary segment overrides.  */
   const reg_entry *default_seg = NULL;
 
-  /* We only need to check those implicit registers for instructions
-     with 3 operands or less.  */
-  if (i.operands <= 3)
-    for (unsigned int j = 0; j < i.operands; j++)
-      if (i.types[j].bitfield.instance != InstanceNone)
-	i.reg_operands--;
+  for (unsigned int j = 0; j < i.operands; j++)
+    if (i.types[j].bitfield.instance != InstanceNone)
+      i.reg_operands--;
 
   if (i.tm.opcode_modifier.sse2avx)
     {
@@ -8870,25 +8890,33 @@  build_modrm_byte (void)
 				     || i.vec_encoding == vex_encoding_evex));
     }
 
-  for (v = source + 1; v < dest; ++v)
-    if (v != reg_slot)
-      break;
-  if (v >= dest)
-    v = ~0;
-  if (i.tm.extension_opcode != None)
+  if (i.tm.opcode_modifier.vexvvvv == VexVVVV_DST)
     {
-      if (dest != source)
-	v = dest;
-      dest = ~0;
+      v = dest;
+      dest-- ;
     }
-  gas_assert (source < dest);
-  if (i.tm.opcode_modifier.operandconstraint == SWAP_SOURCES
-      && source != op)
+  else
     {
-      unsigned int tmp = source;
+      for (v = source + 1; v < dest; ++v)
+	if (v != reg_slot)
+	  break;
+      if (v >= dest)
+	v = ~0;
+      if (i.tm.extension_opcode != None)
+	{
+	  if (dest != source)
+	    v = dest;
+	  dest = ~0;
+	}
+      gas_assert (source < dest);
+      if (i.tm.opcode_modifier.operandconstraint == SWAP_SOURCES
+	  && source != op)
+	{
+	  unsigned int tmp = source;
 
-      source = v;
-      v = tmp;
+	  source = v;
+	  v = tmp;
+	}
     }
 
   if (v < MAX_OPERANDS)
diff --git a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
index 07760240793..2ae0f1a358f 100644
--- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.d
@@ -27,4 +27,6 @@  Disassembly of section .text:
 [ 	]*[a-f0-9]+:[ 	]+c8 ff ff ff[ 	]+enter  \$0xffff,\$0xff
 [ 	]*[a-f0-9]+:[ 	]+67 62 f2 7c 18 f5[ 	]+addr32 \(bad\)
 [ 	]*[a-f0-9]+:[ 	]+0b ff[ 	]+or     %edi,%edi
+[ 	]*[a-f0-9]+:[ 	]+62 f4 fc 08 ff[ 	]+\(bad\)
+[ 	]*[a-f0-9]+:[ 	]+d8[ 	]+.byte 0xd8
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
index bfec0652d13..c4646dcadb4 100644
--- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
+++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-bad.s
@@ -26,3 +26,5 @@  _start:
 	#EVEX from VEX bzhi %ebx,%eax,%ecx EVEX.P[20](EVEX.b) == 1 (illegal value).
 	.insn EVEX.L0.NP.0f38.W0 0xf5, %eax ,(%ebx){1to8}, %ecx
 	.byte 0xff
+	#{evex} inc %rax, %rbx EVEX.vvvv != 1111 && EVEX.ND = 0.
+	.insn EVEX.L0.NP.M4.W1 0xff, %rax, %rbx
diff --git a/gas/testsuite/gas/i386/x86-64-apx-ndd.d b/gas/testsuite/gas/i386/x86-64-apx-ndd.d
new file mode 100644
index 00000000000..73410606ce3
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd.d
@@ -0,0 +1,160 @@ 
+#as:
+#objdump: -dw
+#name: x86-64 APX NDD instructions with evex prefix encoding
+#source: x86-64-apx-ndd.s
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 d0 34 12 	adc    \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 7c 6c 10 10 f9    	adc    %r15b,%r17b,%r18b
+\s*[a-f0-9]+:\s*62 54 6c 10 11 38    	adc    %r15d,\(%r8\),%r18d
+\s*[a-f0-9]+:\s*62 c4 3c 18 12 04 07 	adc    \(%r15,%rax,1\),%r16b,%r8b
+\s*[a-f0-9]+:\s*62 c4 3d 18 13 04 07 	adc    \(%r15,%rax,1\),%r16w,%r8w
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 14 83 11 	adcl   \$0x11,\(%r19,%rax,4\),%r20d
+\s*[a-f0-9]+:\s*62 54 6d 10 66 c7    	adcx   %r15d,%r8d,%r18d
+\s*[a-f0-9]+:\s*62 14 f9 08 66 04 3f 	adcx   \(%r15,%r31,1\),%r8
+\s*[a-f0-9]+:\s*62 14 69 10 66 04 3f 	adcx   \(%r15,%r31,1\),%r8d,%r18d
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 c0 34 12 	add    \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 d4 fc 10 81 c7 33 44 34 12 	add    \$0x12344433,%r15,%r16
+\s*[a-f0-9]+:\s*62 d4 74 10 80 c5 34 	add    \$0x34,%r13b,%r17b
+\s*[a-f0-9]+:\s*62 f4 bc 18 81 c0 11 22 33 f4 	add    \$0xfffffffff4332211,%rax,%r8
+\s*[a-f0-9]+:\s*62 44 fc 10 01 f8    	add    %r31,%r8,%r16
+\s*[a-f0-9]+:\s*62 44 fc 10 01 38    	add    %r31,\(%r8\),%r16
+\s*[a-f0-9]+:\s*62 44 f8 10 01 3c c0 	add    %r31,\(%r8,%r16,8\),%r16
+\s*[a-f0-9]+:\s*62 44 7c 10 00 f8    	add    %r31b,%r8b,%r16b
+\s*[a-f0-9]+:\s*62 44 7c 10 01 f8    	add    %r31d,%r8d,%r16d
+\s*[a-f0-9]+:\s*62 44 7d 10 01 f8    	add    %r31w,%r8w,%r16w
+\s*[a-f0-9]+:\s*62 5c fc 10 03 07    	add    \(%r31\),%r8,%r16
+\s*[a-f0-9]+:\s*62 5c f8 10 03 84 07 90 90 00 00 	add    0x9090\(%r31,%r16,1\),%r8,%r16
+\s*[a-f0-9]+:\s*62 44 7c 10 00 f8    	add    %r31b,%r8b,%r16b
+\s*[a-f0-9]+:\s*62 44 7c 10 01 f8    	add    %r31d,%r8d,%r16d
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 04 83 11 	addl   \$0x11,\(%r19,%rax,4\),%r20d
+\s*[a-f0-9]+:\s*62 44 fc 10 01 f8    	add    %r31,%r8,%r16
+\s*[a-f0-9]+:\s*62 d4 fc 10 81 04 8f 33 44 34 12 	addq   \$0x12344433,\(%r15,%rcx,4\),%r16
+\s*[a-f0-9]+:\s*62 44 7d 10 01 f8    	add    %r31w,%r8w,%r16w
+\s*[a-f0-9]+:\s*62 54 6e 10 66 c7    	adox   %r15d,%r8d,%r18d
+\s*[a-f0-9]+:\s*62 5c fc 10 03 c7    	add    %r31,%r8,%r16
+\s*[a-f0-9]+:\s*62 44 fc 10 01 f8    	add    %r31,%r8,%r16
+\s*[a-f0-9]+:\s*62 14 fa 08 66 04 3f 	adox   \(%r15,%r31,1\),%r8
+\s*[a-f0-9]+:\s*62 14 6a 10 66 04 3f 	adox   \(%r15,%r31,1\),%r8d,%r18d
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 e0 34 12 	and    \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 7c 6c 10 20 f9    	and    %r15b,%r17b,%r18b
+\s*[a-f0-9]+:\s*62 54 6c 10 21 38    	and    %r15d,\(%r8\),%r18d
+\s*[a-f0-9]+:\s*62 c4 3c 18 22 04 07 	and    \(%r15,%rax,1\),%r16b,%r8b
+\s*[a-f0-9]+:\s*62 c4 3d 18 23 04 07 	and    \(%r15,%rax,1\),%r16w,%r8w
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 24 83 11 	andl   \$0x11,\(%r19,%rax,4\),%r20d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 47 90 90 90 90 90 	cmova  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 43 90 90 90 90 90 	cmovae -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 42 90 90 90 90 90 	cmovb  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 46 90 90 90 90 90 	cmovbe -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 44 90 90 90 90 90 	cmove  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 4f 90 90 90 90 90 	cmovg  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 4d 90 90 90 90 90 	cmovge -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 4c 90 90 90 90 90 	cmovl  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 4e 90 90 90 90 90 	cmovle -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 45 90 90 90 90 90 	cmovne -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 41 90 90 90 90 90 	cmovno -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 4b 90 90 90 90 90 	cmovnp -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 49 90 90 90 90 90 	cmovns -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 40 90 90 90 90 90 	cmovo  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 4a 90 90 90 90 90 	cmovp  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 48 90 90 90 90 90 	cmovs  -0x6f6f6f70\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*62 f4 f4 10 ff c8    	dec    %rax,%r17
+\s*[a-f0-9]+:\s*62 9c 3c 18 fe 0c 27 	decb   \(%r31,%r12,1\),%r8b
+\s*[a-f0-9]+:\s*62 b4 b0 10 af 94 f8 09 09 00 00 	imul   0x909\(%rax,%r31,8\),%rdx,%r25
+\s*[a-f0-9]+:\s*67 62 f4 3c 18 af 90 09 09 09 00 	imul   0x90909\(%eax\),%edx,%r8d
+\s*[a-f0-9]+:\s*62 dc fc 10 ff c7    	inc    %r31,%r16
+\s*[a-f0-9]+:\s*62 dc bc 18 ff c7    	inc    %r31,%r8
+\s*[a-f0-9]+:\s*62 f4 e4 18 ff c0    	inc    %rax,%rbx
+\s*[a-f0-9]+:\s*62 f4 f4 10 f7 d8    	neg    %rax,%r17
+\s*[a-f0-9]+:\s*62 9c 3c 18 f6 1c 27 	negb   \(%r31,%r12,1\),%r8b
+\s*[a-f0-9]+:\s*62 f4 f4 10 f7 d0    	not    %rax,%r17
+\s*[a-f0-9]+:\s*62 9c 3c 18 f6 14 27 	notb   \(%r31,%r12,1\),%r8b
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 c8 34 12 	or     \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 7c 6c 10 08 f9    	or     %r15b,%r17b,%r18b
+\s*[a-f0-9]+:\s*62 54 6c 10 09 38    	or     %r15d,\(%r8\),%r18d
+\s*[a-f0-9]+:\s*62 c4 3c 18 0a 04 07 	or     \(%r15,%rax,1\),%r16b,%r8b
+\s*[a-f0-9]+:\s*62 c4 3d 18 0b 04 07 	or     \(%r15,%rax,1\),%r16w,%r8w
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 0c 83 11 	orl    \$0x11,\(%r19,%rax,4\),%r20d
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 d4 02 	rcl    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 d0    	rcl    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 10    	rclb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 10 02 	rcll   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 10    	rclw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 14 83 	rclw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 dc 02 	rcr    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 d8    	rcr    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 18    	rcrb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 18 02 	rcrl   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 18    	rcrw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 1c 83 	rcrw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 c4 02 	rol    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 c0    	rol    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 00    	rolb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 00 02 	roll   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 00    	rolw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 04 83 	rolw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 cc 02 	ror    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 c8    	ror    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 08    	rorb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 08 02 	rorl   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 08    	rorw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 0c 83 	rorw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 fc 02 	sar    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 f8    	sar    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 38    	sarb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 38 02 	sarl   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 38    	sarw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 3c 83 	sarw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 d8 34 12 	sbb    \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 7c 6c 10 18 f9    	sbb    %r15b,%r17b,%r18b
+\s*[a-f0-9]+:\s*62 54 6c 10 19 38    	sbb    %r15d,\(%r8\),%r18d
+\s*[a-f0-9]+:\s*62 c4 3c 18 1a 04 07 	sbb    \(%r15,%rax,1\),%r16b,%r8b
+\s*[a-f0-9]+:\s*62 c4 3d 18 1b 04 07 	sbb    \(%r15,%rax,1\),%r16w,%r8w
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 1c 83 11 	sbbl   \$0x11,\(%r19,%rax,4\),%r20d
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 e4 02 	shl    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 e4 02 	shl    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 e0    	shl    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 e0    	shl    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 20    	shlb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 20    	shlb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 74 84 10 24 20 01 	shld   \$0x1,%r12,\(%rax\),%r31
+\s*[a-f0-9]+:\s*62 74 04 10 24 38 02 	shld   \$0x2,%r15d,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 54 05 10 24 c4 02 	shld   \$0x2,%r8w,%r12w,%r31w
+\s*[a-f0-9]+:\s*62 7c bc 18 a5 e0    	shld   %cl,%r12,%r16,%r8
+\s*[a-f0-9]+:\s*62 7c 05 10 a5 2c 83 	shld   %cl,%r13w,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 74 05 10 a5 08    	shld   %cl,%r9w,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 20 02 	shll   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 20 02 	shll   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 20    	shlw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 20    	shlw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 24 83 	shlw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 24 83 	shlw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 d4 04 10 c0 ec 02 	shr    \$0x2,%r12b,%r31b
+\s*[a-f0-9]+:\s*62 fc 3c 18 d2 e8    	shr    %cl,%r16b,%r8b
+\s*[a-f0-9]+:\s*62 f4 04 10 d0 28    	shrb   \$1,\(%rax\),%r31b
+\s*[a-f0-9]+:\s*62 74 84 10 2c 20 01 	shrd   \$0x1,%r12,\(%rax\),%r31
+\s*[a-f0-9]+:\s*62 74 04 10 2c 38 02 	shrd   \$0x2,%r15d,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 54 05 10 2c c4 02 	shrd   \$0x2,%r8w,%r12w,%r31w
+\s*[a-f0-9]+:\s*62 7c bc 18 ad e0    	shrd   %cl,%r12,%r16,%r8
+\s*[a-f0-9]+:\s*62 7c 05 10 ad 2c 83 	shrd   %cl,%r13w,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 74 05 10 ad 08    	shrd   %cl,%r9w,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 f4 04 10 c1 28 02 	shrl   \$0x2,\(%rax\),%r31d
+\s*[a-f0-9]+:\s*62 f4 05 10 d1 28    	shrw   \$1,\(%rax\),%r31w
+\s*[a-f0-9]+:\s*62 fc 05 10 d3 2c 83 	shrw   %cl,\(%r19,%rax,4\),%r31w
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 e8 34 12 	sub    \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 7c 6c 10 28 f9    	sub    %r15b,%r17b,%r18b
+\s*[a-f0-9]+:\s*62 54 6c 10 29 38    	sub    %r15d,\(%r8\),%r18d
+\s*[a-f0-9]+:\s*62 c4 3c 18 2a 04 07 	sub    \(%r15,%rax,1\),%r16b,%r8b
+\s*[a-f0-9]+:\s*62 c4 3d 18 2b 04 07 	sub    \(%r15,%rax,1\),%r16w,%r8w
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 2c 83 11 	subl   \$0x11,\(%r19,%rax,4\),%r20d
+\s*[a-f0-9]+:\s*62 f4 0d 10 81 f0 34 12 	xor    \$0x1234,%ax,%r30w
+\s*[a-f0-9]+:\s*62 7c 6c 10 30 f9    	xor    %r15b,%r17b,%r18b
+\s*[a-f0-9]+:\s*62 54 6c 10 31 38    	xor    %r15d,\(%r8\),%r18d
+\s*[a-f0-9]+:\s*62 c4 3c 18 32 04 07 	xor    \(%r15,%rax,1\),%r16b,%r8b
+\s*[a-f0-9]+:\s*62 c4 3d 18 33 04 07 	xor    \(%r15,%rax,1\),%r16w,%r8w
+\s*[a-f0-9]+:\s*62 fc 5c 10 83 34 83 11 	xorl   \$0x11,\(%r19,%rax,4\),%r20d
diff --git a/gas/testsuite/gas/i386/x86-64-apx-ndd.s b/gas/testsuite/gas/i386/x86-64-apx-ndd.s
new file mode 100644
index 00000000000..c6edaace312
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd.s
@@ -0,0 +1,155 @@ 
+# Check 64bit APX NDD instructions with evex prefix encoding
+
+	.allow_index_reg
+	.text
+_start:
+	adc    $0x1234,%ax,%r30w
+	adc    %r15b,%r17b,%r18b
+	adc    %r15d,(%r8),%r18d
+	adc    (%r15,%rax,1),%r16b,%r8b
+	adc    (%r15,%rax,1),%r16w,%r8w
+	adcl   $0x11,(%r19,%rax,4),%r20d
+	adcx   %r15d,%r8d,%r18d
+	adcx   (%r15,%r31,1),%r8
+	adcx   (%r15,%r31,1),%r8d,%r18d
+	add    $0x1234,%ax,%r30w
+	add    $0x12344433,%r15,%r16
+	add    $0x34,%r13b,%r17b
+	add    $0xfffffffff4332211,%rax,%r8
+	add    %r31,%r8,%r16
+	add    %r31,(%r8),%r16
+	add    %r31,(%r8,%r16,8),%r16
+	add    %r31b,%r8b,%r16b
+	add    %r31d,%r8d,%r16d
+	add    %r31w,%r8w,%r16w
+	add    (%r31),%r8,%r16
+	add    0x9090(%r31,%r16,1),%r8,%r16
+	addb    %r31b,%r8b,%r16b
+	addl    %r31d,%r8d,%r16d
+	addl   $0x11,(%r19,%rax,4),%r20d
+	addq    %r31,%r8,%r16
+	addq   $0x12344433,(%r15,%rcx,4),%r16
+	addw    %r31w,%r8w,%r16w
+	adox   %r15d,%r8d,%r18d
+	{load}  add    %r31,%r8,%r16
+	{store} add    %r31,%r8,%r16
+	adox   (%r15,%r31,1),%r8
+	adox   (%r15,%r31,1),%r8d,%r18d
+	and    $0x1234,%ax,%r30w
+	and    %r15b,%r17b,%r18b
+	and    %r15d,(%r8),%r18d
+	and    (%r15,%rax,1),%r16b,%r8b
+	and    (%r15,%rax,1),%r16w,%r8w
+	andl   $0x11,(%r19,%rax,4),%r20d
+	cmova  0x90909090(%eax),%edx,%r8d
+	cmovae 0x90909090(%eax),%edx,%r8d
+	cmovb  0x90909090(%eax),%edx,%r8d
+	cmovbe 0x90909090(%eax),%edx,%r8d
+	cmove  0x90909090(%eax),%edx,%r8d
+	cmovg  0x90909090(%eax),%edx,%r8d
+	cmovge 0x90909090(%eax),%edx,%r8d
+	cmovl  0x90909090(%eax),%edx,%r8d
+	cmovle 0x90909090(%eax),%edx,%r8d
+	cmovne 0x90909090(%eax),%edx,%r8d
+	cmovno 0x90909090(%eax),%edx,%r8d
+	cmovnp 0x90909090(%eax),%edx,%r8d
+	cmovns 0x90909090(%eax),%edx,%r8d
+	cmovo  0x90909090(%eax),%edx,%r8d
+	cmovp  0x90909090(%eax),%edx,%r8d
+	cmovs  0x90909090(%eax),%edx,%r8d
+	dec    %rax,%r17
+	decb   (%r31,%r12,1),%r8b
+	imul   0x909(%rax,%r31,8),%rdx,%r25
+	imul   0x90909(%eax),%edx,%r8d
+	inc    %r31,%r16
+	inc    %r31,%r8
+	inc    %rax,%rbx
+	neg    %rax,%r17
+	negb   (%r31,%r12,1),%r8b
+	not    %rax,%r17
+	notb   (%r31,%r12,1),%r8b
+	or     $0x1234,%ax,%r30w
+	or     %r15b,%r17b,%r18b
+	or     %r15d,(%r8),%r18d
+	or     (%r15,%rax,1),%r16b,%r8b
+	or     (%r15,%rax,1),%r16w,%r8w
+	orl    $0x11,(%r19,%rax,4),%r20d
+	rcl    $0x2,%r12b,%r31b
+	rcl    %cl,%r16b,%r8b
+	rclb   $0x1, (%rax),%r31b
+	rcll   $0x2,(%rax),%r31d
+	rclw   $0x1, (%rax),%r31w
+	rclw   %cl,(%r19,%rax,4),%r31w
+	rcr    $0x2,%r12b,%r31b
+	rcr    %cl,%r16b,%r8b
+	rcrb   (%rax),%r31b
+	rcrl   $0x2,(%rax),%r31d
+	rcrw   $0x1, (%rax),%r31w
+	rcrw   %cl,(%r19,%rax,4),%r31w
+	rol    $0x2,%r12b,%r31b
+	rol    %cl,%r16b,%r8b
+	rolb   $0x1, (%rax),%r31b
+	roll   $0x2,(%rax),%r31d
+	rolw   $0x1, (%rax),%r31w
+	rolw   %cl,(%r19,%rax,4),%r31w
+	ror    $0x2,%r12b,%r31b
+	ror    %cl,%r16b,%r8b
+	rorb   $0x1, (%rax),%r31b
+	rorl   $0x2,(%rax),%r31d
+	rorw   $0x1, (%rax),%r31w
+	rorw   %cl,(%r19,%rax,4),%r31w
+	sar    $0x2,%r12b,%r31b
+	sar    %cl,%r16b,%r8b
+	sarb   $0x1, (%rax),%r31b
+	sarl   $0x2,(%rax),%r31d
+	sarw   $0x1, (%rax),%r31w
+	sarw   %cl,(%r19,%rax,4),%r31w
+	sbb    $0x1234,%ax,%r30w
+	sbb    %r15b,%r17b,%r18b
+	sbb    %r15d,(%r8),%r18d
+	sbb    (%r15,%rax,1),%r16b,%r8b
+	sbb    (%r15,%rax,1),%r16w,%r8w
+	sbbl   $0x11,(%r19,%rax,4),%r20d
+	shl    $0x2,%r12b,%r31b
+	shl    $0x2,%r12b,%r31b
+	shl    %cl,%r16b,%r8b
+	shl    %cl,%r16b,%r8b
+	shlb   $0x1, (%rax),%r31b
+	shlb   $0x1, (%rax),%r31b
+	shld   $0x1,%r12,(%rax),%r31
+	shld   $0x2,%r15d,(%rax),%r31d
+	shld   $0x2,%r8w,%r12w,%r31w
+	shld   %cl,%r12,%r16,%r8
+	shld   %cl,%r13w,(%r19,%rax,4),%r31w
+	shld   %cl,%r9w,(%rax),%r31w
+	shll   $0x2,(%rax),%r31d
+	shll   $0x2,(%rax),%r31d
+	shlw   $0x1, (%rax),%r31w
+	shlw   $0x1, (%rax),%r31w
+	shlw   %cl,(%r19,%rax,4),%r31w
+	shlw   %cl,(%r19,%rax,4),%r31w
+	shr    $0x2,%r12b,%r31b
+	shr    %cl,%r16b,%r8b
+	shrb   $0x1, (%rax),%r31b
+	shrd   $0x1,%r12,(%rax),%r31
+	shrd   $0x2,%r15d,(%rax),%r31d
+	shrd   $0x2,%r8w,%r12w,%r31w
+	shrd   %cl,%r12,%r16,%r8
+	shrd   %cl,%r13w,(%r19,%rax,4),%r31w
+	shrd   %cl,%r9w,(%rax),%r31w
+	shrl   $0x2,(%rax),%r31d
+	shrw   $0x1, (%rax),%r31w
+	shrw   %cl,(%r19,%rax,4),%r31w
+	sub    $0x1234,%ax,%r30w
+	sub    %r15b,%r17b,%r18b
+	sub    %r15d,(%r8),%r18d
+	sub    (%r15,%rax,1),%r16b,%r8b
+	sub    (%r15,%rax,1),%r16w,%r8w
+	subl   $0x11,(%r19,%rax,4),%r20d
+	xor    $0x1234,%ax,%r30w
+	xor    %r15b,%r17b,%r18b
+	xor    %r15d,(%r8),%r18d
+	xor    (%r15,%rax,1),%r16b,%r8b
+	xor    (%r15,%rax,1),%r16w,%r8w
+	xorl   $0x11,(%r19,%rax,4),%r20d
+
diff --git a/gas/testsuite/gas/i386/x86-64-pseudos.d b/gas/testsuite/gas/i386/x86-64-pseudos.d
index 708c22b5899..1d399ffa949 100644
--- a/gas/testsuite/gas/i386/x86-64-pseudos.d
+++ b/gas/testsuite/gas/i386/x86-64-pseudos.d
@@ -137,6 +137,48 @@  Disassembly of section .text:
  +[a-f0-9]+:	33 07                	xor    \(%rdi\),%eax
  +[a-f0-9]+:	31 07                	xor    %eax,\(%rdi\)
  +[a-f0-9]+:	33 07                	xor    \(%rdi\),%eax
+ +[a-f0-9]+:	62 44 fc 10 01 38    	add    %r31,\(%r8\),%r16
+ +[a-f0-9]+:	62 44 fc 10 03 38    	add    \(%r8\),%r31,%r16
+ +[a-f0-9]+:	62 44 fc 10 01 38    	add    %r31,\(%r8\),%r16
+ +[a-f0-9]+:	62 44 fc 10 03 38    	add    \(%r8\),%r31,%r16
+ +[a-f0-9]+:	62 54 6c 10 29 38    	sub    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 2b 38    	sub    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 29 38    	sub    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 2b 38    	sub    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 19 38    	sbb    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 1b 38    	sbb    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 19 38    	sbb    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 1b 38    	sbb    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 21 38    	and    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 23 38    	and    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 21 38    	and    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 23 38    	and    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 09 38    	or     %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 0b 38    	or     \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 09 38    	or     %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 0b 38    	or     \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 31 38    	xor    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 33 38    	xor    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 31 38    	xor    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 33 38    	xor    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 11 38    	adc    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 13 38    	adc    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 54 6c 10 11 38    	adc    %r15d,\(%r8\),%r18d
+ +[a-f0-9]+:	62 54 6c 10 13 38    	adc    \(%r8\),%r15d,%r18d
+ +[a-f0-9]+:	62 44 fc 10 01 f8    	add    %r31,%r8,%r16
+ +[a-f0-9]+:	62 5c fc 10 03 c7    	add    %r31,%r8,%r16
+ +[a-f0-9]+:	62 7c 6c 10 28 f9    	sub    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 c4 6c 10 2a cf    	sub    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 7c 6c 10 18 f9    	sbb    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 c4 6c 10 1a cf    	sbb    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 7c 6c 10 20 f9    	and    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 c4 6c 10 22 cf    	and    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 7c 6c 10 08 f9    	or     %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 c4 6c 10 0a cf    	or     %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 7c 6c 10 30 f9    	xor    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 c4 6c 10 32 cf    	xor    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 7c 6c 10 10 f9    	adc    %r15b,%r17b,%r18b
+ +[a-f0-9]+:	62 c4 6c 10 12 cf    	adc    %r15b,%r17b,%r18b
  +[a-f0-9]+:	b0 12                	mov    \$0x12,%al
  +[a-f0-9]+:	b8 45 03 00 00       	mov    \$0x345,%eax
  +[a-f0-9]+:	b0 12                	mov    \$0x12,%al
diff --git a/gas/testsuite/gas/i386/x86-64-pseudos.s b/gas/testsuite/gas/i386/x86-64-pseudos.s
index 29a0c3368fc..e5b3a0d625d 100644
--- a/gas/testsuite/gas/i386/x86-64-pseudos.s
+++ b/gas/testsuite/gas/i386/x86-64-pseudos.s
@@ -134,6 +134,49 @@  _start:
 	{load} xor (%rdi), %eax
 	{store} xor %eax, (%rdi)
 	{store} xor (%rdi), %eax
+	{load}  add    %r31,(%r8),%r16
+	{load}	add    (%r8),%r31,%r16
+	{store} add    %r31,(%r8),%r16
+	{store}	add    (%r8),%r31,%r16
+	{load} 	sub    %r15d,(%r8),%r18d
+	{load}	sub    (%r8),%r15d,%r18d
+	{store} sub    %r15d,(%r8),%r18d
+	{store} sub    (%r8),%r15d,%r18d
+	{load} 	sbb    %r15d,(%r8),%r18d
+	{load}	sbb    (%r8),%r15d,%r18d
+	{store} sbb    %r15d,(%r8),%r18d
+	{store} sbb    (%r8),%r15d,%r18d
+	{load} 	and    %r15d,(%r8),%r18d
+	{load}	and    (%r8),%r15d,%r18d
+	{store} and    %r15d,(%r8),%r18d
+	{store} and    (%r8),%r15d,%r18d
+	{load} 	or     %r15d,(%r8),%r18d
+	{load}	or     (%r8),%r15d,%r18d
+	{store} or     %r15d,(%r8),%r18d
+	{store} or     (%r8),%r15d,%r18d
+	{load} 	xor    %r15d,(%r8),%r18d
+	{load}	xor    (%r8),%r15d,%r18d
+	{store} xor    %r15d,(%r8),%r18d
+	{store} xor    (%r8),%r15d,%r18d
+	{load} 	adc    %r15d,(%r8),%r18d
+	{load}	adc    (%r8),%r15d,%r18d
+	{store} adc    %r15d,(%r8),%r18d
+	{store} adc    (%r8),%r15d,%r18d
+
+	{store} add    %r31,%r8,%r16
+	{load}  add    %r31,%r8,%r16
+	{store} sub    %r15b,%r17b,%r18b
+	{load}	sub    %r15b,%r17b,%r18b
+	{store}	sbb    %r15b,%r17b,%r18b
+	{load}	sbb    %r15b,%r17b,%r18b
+	{store}	and    %r15b,%r17b,%r18b
+	{load}	and    %r15b,%r17b,%r18b
+	{store}	or     %r15b,%r17b,%r18b
+	{load}	or     %r15b,%r17b,%r18b
+	{store}	xor    %r15b,%r17b,%r18b
+	{load}	xor    %r15b,%r17b,%r18b
+	{store}	adc    %r15b,%r17b,%r18b
+	{load}	adc    %r15b,%r17b,%r18b
 
 	.irp m, mov, adc, add, and, cmp, or, sbb, sub, test, xor
 	\m	$0x12, %al
diff --git a/gas/testsuite/gas/i386/x86-64.exp b/gas/testsuite/gas/i386/x86-64.exp
index f6b6bb2f426..c28e4e7e333 100644
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -370,6 +370,7 @@  run_dump_test "x86-64-apx-rex2"
 run_dump_test "x86-64-apx-evex-promoted"
 run_dump_test "x86-64-apx-evex-promoted-intel"
 run_dump_test "x86-64-apx-evex-egpr"
+run_dump_test "x86-64-apx-ndd"
 run_dump_test "x86-64-avx512f-rcigrz-intel"
 run_dump_test "x86-64-avx512f-rcigrz"
 run_dump_test "x86-64-clwb"
diff --git a/opcodes/i386-dis-evex-reg.h b/opcodes/i386-dis-evex-reg.h
index 8374f0ea93a..b7f87c2fa39 100644
--- a/opcodes/i386-dis-evex-reg.h
+++ b/opcodes/i386-dis-evex-reg.h
@@ -56,3 +56,58 @@ 
     { "blsmskS",	{ VexGdq, Edq }, 0 },
     { "blsiS",	{ VexGdq, Edq }, 0 },
   },
+  /* REG_EVEX_MAP4_80 */
+  {
+    { "addA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "orA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "adcA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "sbbA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "andA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "subA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "xorA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+  },
+  /* REG_EVEX_MAP4_81 */
+  {
+    { "addQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+    { "orQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+    { "adcQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+    { "sbbQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+    { "andQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+    { "subQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+    { "xorQ",	{ VexGv, Ev, Iv }, PREFIX_NP_OR_DATA },
+  },
+  /* REG_EVEX_MAP4_83 */
+  {
+    { "addQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+    { "orQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+    { "adcQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+    { "sbbQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+    { "andQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+    { "subQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+    { "xorQ",	{ VexGv, Ev, sIb }, PREFIX_NP_OR_DATA },
+  },
+  /* REG_EVEX_MAP4_F6 */
+  {
+    { Bad_Opcode },
+    { Bad_Opcode },
+    { "notA",	{ VexGb, Eb }, NO_PREFIX },
+    { "negA",	{ VexGb, Eb }, NO_PREFIX },
+  },
+  /* REG_EVEX_MAP4_F7 */
+  {
+    { Bad_Opcode },
+    { Bad_Opcode },
+    { "notQ",	{ VexGv, Ev }, PREFIX_NP_OR_DATA },
+    { "negQ",	{ VexGv, Ev }, PREFIX_NP_OR_DATA },
+  },
+  /* REG_EVEX_MAP4_FE */
+  {
+    { "incA",	{ VexGb, Eb }, NO_PREFIX },
+    { "decA",	{ VexGb, Eb }, NO_PREFIX },
+  },
+  /* REG_EVEX_MAP4_FF */
+  {
+    { "incQ",	{ VexGv, Ev }, PREFIX_NP_OR_DATA },
+    { "decQ",	{ VexGv, Ev }, PREFIX_NP_OR_DATA },
+  },
+
diff --git a/opcodes/i386-dis-evex.h b/opcodes/i386-dis-evex.h
index ea0a4c0b2a5..a6e1eb3250f 100644
--- a/opcodes/i386-dis-evex.h
+++ b/opcodes/i386-dis-evex.h
@@ -875,64 +875,64 @@  static const struct dis386 evex_table[][256] = {
   /* EVEX_MAP4_ */
   {
     /* 00 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "addB",             { VexGb, Eb, Gb }, NO_PREFIX },
+    { "addS",             { VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "addB",             { VexGb, Gb, EbS }, NO_PREFIX },
+    { "addS",             { VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     /* 08 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "orB",		{ VexGb, Eb, Gb }, NO_PREFIX },
+    { "orS",		{ VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "orB",		{ VexGb, Gb, EbS }, NO_PREFIX },
+    { "orS",		{ VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     /* 10 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "adcB",		{ VexGb, Eb, Gb }, NO_PREFIX },
+    { "adcS",		{ VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "adcB",		{ VexGb, Gb, EbS }, NO_PREFIX },
+    { "adcS",		{ VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     /* 18 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "sbbB",		{ VexGb, Eb, Gb }, NO_PREFIX },
+    { "sbbS",		{ VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "sbbB",		{ VexGb, Gb, EbS }, NO_PREFIX },
+    { "sbbS",		{ VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     /* 20 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "andB",		{ VexGb, Eb, Gb }, NO_PREFIX },
+    { "andS",		{ VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "andB",		{ VexGb, Gb, EbS }, NO_PREFIX },
+    { "andS",		{ VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
+    { "shldS",		{ VexGv, Ev, Gv, Ib }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     /* 28 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "subB",		{ VexGb, Eb, Gb }, NO_PREFIX },
+    { "subS",		{ VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "subB",		{ VexGb, Gb, EbS }, NO_PREFIX },
+    { "subS",		{ VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
+    { "shrdS",		{ VexGv, Ev, Gv, Ib }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
     /* 30 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "xorB",		{ VexGb, Eb, Gb }, NO_PREFIX },
+    { "xorS",		{ VexGv, Ev, Gv }, PREFIX_NP_OR_DATA },
+    { "xorB",		{ VexGb, Gb, EbS }, NO_PREFIX },
+    { "xorS",		{ VexGv, Gv, EvS }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
@@ -947,23 +947,23 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     /* 40 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "%CFcmovoS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovnoS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovbS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovaeS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmoveS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovneS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovbeS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovaS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
     /* 48 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "%CFcmovsS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovnsS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovpS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovnpS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovlS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovgeS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovleS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
+    { "%CFcmovgS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
     /* 50 */
     { Bad_Opcode },
     { Bad_Opcode },
@@ -1019,10 +1019,10 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     /* 80 */
+    { REG_TABLE (REG_EVEX_MAP4_80) },
+    { REG_TABLE (REG_EVEX_MAP4_81) },
     { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { REG_TABLE (REG_EVEX_MAP4_83) },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
@@ -1060,7 +1060,7 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
-    { Bad_Opcode },
+    { "shldS",	{ VexGv, Ev, Gv, CL }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
     { Bad_Opcode },
     /* A8 */
@@ -1069,9 +1069,9 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
+    { "shrdS",	{ VexGv, Ev, Gv, CL }, PREFIX_NP_OR_DATA },
     { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { "imulS",	{ VexGv, Gv, Ev }, PREFIX_NP_OR_DATA },
     /* B0 */
     { Bad_Opcode },
     { Bad_Opcode },
@@ -1091,8 +1091,8 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     /* C0 */
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { REG_TABLE (REG_C0) },
+    { REG_TABLE (REG_C1) },
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
@@ -1109,10 +1109,10 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     /* D0 */
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { REG_TABLE (REG_D0) },
+    { REG_TABLE (REG_D1) },
+    { REG_TABLE (REG_D2) },
+    { REG_TABLE (REG_D3) },
     { "sha1rnds4",	{ XM, EXxmm, Ib }, NO_PREFIX },
     { Bad_Opcode },
     { Bad_Opcode },
@@ -1151,8 +1151,8 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { Bad_Opcode },
     { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { REG_TABLE (REG_EVEX_MAP4_F6) },
+    { REG_TABLE (REG_EVEX_MAP4_F7) },
     /* F8 */
     { PREFIX_TABLE (PREFIX_EVEX_MAP4_F8) },
     { "movdiri",	{ Mdq, Gdq }, NO_PREFIX },
@@ -1160,8 +1160,8 @@  static const struct dis386 evex_table[][256] = {
     { Bad_Opcode },
     { PREFIX_TABLE (PREFIX_0F38FC) },
     { Bad_Opcode },
-    { Bad_Opcode },
-    { Bad_Opcode },
+    { REG_TABLE (REG_EVEX_MAP4_FE) },
+    { REG_TABLE (REG_EVEX_MAP4_FF) },
   },
   /* EVEX_MAP5_ */
   {
diff --git a/opcodes/i386-dis.c b/opcodes/i386-dis.c
index b81e75aa786..50b2734108b 100644
--- a/opcodes/i386-dis.c
+++ b/opcodes/i386-dis.c
@@ -579,6 +579,8 @@  fetch_error (const instr_info *ins)
 #define VexGatherD { OP_VEX, vex_vsib_d_w_dq_mode }
 #define VexGatherQ { OP_VEX, vex_vsib_q_w_dq_mode }
 #define VexGdq { OP_VEX, dq_mode }
+#define VexGb { OP_VEX, b_mode }
+#define VexGv { OP_VEX, v_mode }
 #define VexTmm { OP_VEX, tmm_mode }
 #define XMVexI4 { OP_REG_VexI4, x_mode }
 #define XMVexScalarI4 { OP_REG_VexI4, scalar_mode }
@@ -894,6 +896,13 @@  enum
   REG_EVEX_0F38C6_L_2,
   REG_EVEX_0F38C7_L_2,
   REG_EVEX_0F38F3_L_0_P_0,
+  REG_EVEX_MAP4_80,
+  REG_EVEX_MAP4_81,
+  REG_EVEX_MAP4_83,
+  REG_EVEX_MAP4_F6,
+  REG_EVEX_MAP4_F7,
+  REG_EVEX_MAP4_FE,
+  REG_EVEX_MAP4_FF,
 };
 
 enum
@@ -2605,25 +2614,25 @@  static const struct dis386 reg_table[][8] = {
   },
   /* REG_C0 */
   {
-    { "rolA",	{ Eb, Ib }, 0 },
-    { "rorA",	{ Eb, Ib }, 0 },
-    { "rclA",	{ Eb, Ib }, 0 },
-    { "rcrA",	{ Eb, Ib }, 0 },
-    { "shlA",	{ Eb, Ib }, 0 },
-    { "shrA",	{ Eb, Ib }, 0 },
-    { "shlA",	{ Eb, Ib }, 0 },
-    { "sarA",	{ Eb, Ib }, 0 },
+    { "rolA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "rorA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "rclA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "rcrA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "shlA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "shrA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "shlA",	{ VexGb, Eb, Ib }, NO_PREFIX },
+    { "sarA",	{ VexGb, Eb, Ib }, NO_PREFIX },
   },
   /* REG_C1 */
   {
-    { "rolQ",	{ Ev, Ib }, 0 },
-    { "rorQ",	{ Ev, Ib }, 0 },
-    { "rclQ",	{ Ev, Ib }, 0 },
-    { "rcrQ",	{ Ev, Ib }, 0 },
-    { "shlQ",	{ Ev, Ib }, 0 },
-    { "shrQ",	{ Ev, Ib }, 0 },
-    { "shlQ",	{ Ev, Ib }, 0 },
-    { "sarQ",	{ Ev, Ib }, 0 },
+    { "rolQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "rorQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "rclQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "rcrQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "shlQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "shrQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "shlQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
+    { "sarQ",	{ VexGv, Ev, Ib }, PREFIX_NP_OR_DATA },
   },
   /* REG_C6 */
   {
@@ -2649,47 +2658,47 @@  static const struct dis386 reg_table[][8] = {
   },
   /* REG_D0 */
   {
-    { "rolA",	{ Eb, I1 }, 0 },
-    { "rorA",	{ Eb, I1 }, 0 },
-    { "rclA",	{ Eb, I1 }, 0 },
-    { "rcrA",	{ Eb, I1 }, 0 },
-    { "shlA",	{ Eb, I1 }, 0 },
-    { "shrA",	{ Eb, I1 }, 0 },
-    { "shlA",	{ Eb, I1 }, 0 },
-    { "sarA",	{ Eb, I1 }, 0 },
+    { "rolA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "rorA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "rclA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "rcrA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "shlA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "shrA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "shlA",	{ VexGb, Eb, I1 }, NO_PREFIX },
+    { "sarA",	{ VexGb, Eb, I1 }, NO_PREFIX },
   },
   /* REG_D1 */
   {
-    { "rolQ",	{ Ev, I1 }, 0 },
-    { "rorQ",	{ Ev, I1 }, 0 },
-    { "rclQ",	{ Ev, I1 }, 0 },
-    { "rcrQ",	{ Ev, I1 }, 0 },
-    { "shlQ",	{ Ev, I1 }, 0 },
-    { "shrQ",	{ Ev, I1 }, 0 },
-    { "shlQ",	{ Ev, I1 }, 0 },
-    { "sarQ",	{ Ev, I1 }, 0 },
+    { "rolQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "rorQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "rclQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "rcrQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "shlQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "shrQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "shlQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
+    { "sarQ",	{ VexGv, Ev, I1 }, PREFIX_NP_OR_DATA },
   },
   /* REG_D2 */
   {
-    { "rolA",	{ Eb, CL }, 0 },
-    { "rorA",	{ Eb, CL }, 0 },
-    { "rclA",	{ Eb, CL }, 0 },
-    { "rcrA",	{ Eb, CL }, 0 },
-    { "shlA",	{ Eb, CL }, 0 },
-    { "shrA",	{ Eb, CL }, 0 },
-    { "shlA",	{ Eb, CL }, 0 },
-    { "sarA",	{ Eb, CL }, 0 },
+    { "rolA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "rorA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "rclA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "rcrA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "shlA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "shrA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "shlA",	{ VexGb, Eb, CL }, NO_PREFIX },
+    { "sarA",	{ VexGb, Eb, CL }, NO_PREFIX },
   },
   /* REG_D3 */
   {
-    { "rolQ",	{ Ev, CL }, 0 },
-    { "rorQ",	{ Ev, CL }, 0 },
-    { "rclQ",	{ Ev, CL }, 0 },
-    { "rcrQ",	{ Ev, CL }, 0 },
-    { "shlQ",	{ Ev, CL }, 0 },
-    { "shrQ",	{ Ev, CL }, 0 },
-    { "shlQ",	{ Ev, CL }, 0 },
-    { "sarQ",	{ Ev, CL }, 0 },
+    { "rolQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "rorQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "rclQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "rcrQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "shlQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "shrQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "shlQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
+    { "sarQ",	{ VexGv, Ev, CL }, PREFIX_NP_OR_DATA },
   },
   /* REG_F6 */
   {
@@ -3639,8 +3648,8 @@  static const struct dis386 prefix_table[][4] = {
   /* PREFIX_0F38F6 */
   {
     { "wrssK",	{ M, Gdq }, 0 },
-    { "adoxS",	{ Gdq, Edq}, 0 },
-    { "adcxS",	{ Gdq, Edq}, 0 },
+    { "adoxS",	{ VexGdq, Gdq, Edq}, 0 },
+    { "adcxS",	{ VexGdq, Gdq, Edq}, 0 },
     { Bad_Opcode },
   },
 
@@ -9114,6 +9123,12 @@  get_valid_dis386 (const struct dis386 *dp, instr_info *ins)
 	  ins->rex2 &= ~REX_R;
 	}
 
+      /* For EVEX-promoted legacy instructions with EVEX.ND clear, all
+	 bits of EVEX.vvvv and EVEX.V' must be 1.  */
+      if (ins->evex_type == evex_from_legacy && !ins->vex.b
+	  && (ins->vex.register_specifier || !ins->vex.v))
+	return &bad_opcode;
+
       ins->need_vex = 4;
 
       /* EVEX from legacy instructions require that EVEX.z, EVEX.L’L and the
@@ -9131,8 +9146,10 @@  get_valid_dis386 (const struct dis386 *dp, instr_info *ins)
       if (!fetch_modrm (ins))
 	return &err_opcode;
 
-      /* Set vector length.  */
-      if (ins->modrm.mod == 3 && ins->vex.b)
+      /* Set vector length.  EVEX-promoted instructions have evex.ll == 0b00,
+	 the same encoding as vex.length == 128, so they can share the
+	 vex.length handling in OP_VEX.  */
+      if (ins->modrm.mod == 3 && ins->vex.b && ins->evex_type != evex_from_legacy)
 	ins->vex.length = 512;
       else
 	{
@@ -9598,8 +9615,8 @@  print_insn (bfd_vma pc, disassemble_info *info, int intel_syntax)
 	    }
 
 	  /* Check whether rounding control was enabled for an insn not
-	     supporting it.  */
-	  if (ins.modrm.mod == 3 && ins.vex.b
+	     supporting it, when evex.b is not treated as evex.nd.  */
+	  if (ins.modrm.mod == 3 && ins.vex.b && ins.evex_type == evex_default
 	      && !(ins.evex_used & EVEX_b_used))
 	    {
 	      for (i = 0; i < MAX_OPERANDS; ++i)
@@ -10487,16 +10504,23 @@  putop (instr_info *ins, const char *in_template, int sizeflag)
 	  ins->used_prefixes |= (ins->prefixes & PREFIX_ADDR);
 	  break;
 	case 'F':
-	  if (ins->intel_syntax)
-	    break;
-	  if ((ins->prefixes & PREFIX_ADDR) || (sizeflag & SUFFIX_ALWAYS))
+	  if (l == 0)
 	    {
-	      if (sizeflag & AFLAG)
-		*ins->obufp++ = ins->address_mode == mode_64bit ? 'q' : 'l';
-	      else
-		*ins->obufp++ = ins->address_mode == mode_64bit ? 'l' : 'w';
-	      ins->used_prefixes |= (ins->prefixes & PREFIX_ADDR);
+	      if (ins->intel_syntax)
+		break;
+	      if ((ins->prefixes & PREFIX_ADDR) || (sizeflag & SUFFIX_ALWAYS))
+		{
+		  if (sizeflag & AFLAG)
+		    *ins->obufp++ = ins->address_mode == mode_64bit ? 'q' : 'l';
+		  else
+		    *ins->obufp++ = ins->address_mode == mode_64bit ? 'l' : 'w';
+		  ins->used_prefixes |= (ins->prefixes & PREFIX_ADDR);
+		}
 	    }
+	  else if (l == 1 && last[0] == 'C')
+	    break;
+	  else
+	    abort ();
 	  break;
 	case 'G':
 	  if (ins->intel_syntax || (ins->obufp[-1] != 's'
@@ -11060,7 +11084,8 @@  print_displacement (instr_info *ins, bfd_signed_vma val)
 static void
 intel_operand_size (instr_info *ins, int bytemode, int sizeflag)
 {
-  if (ins->vex.b)
+  /* Check if there is a broadcast, when evex.b is not treated as evex.nd.  */
+  if (ins->vex.b && ins->evex_type == evex_default)
     {
       if (!ins->vex.no_broadcast)
 	switch (bytemode)
@@ -11558,6 +11583,7 @@  OP_E_memory (instr_info *ins, int bytemode, int sizeflag)
 
   add += (ins->rex2 & REX_B) ? 16 : 0;
 
+  /* Handles EVEX other than APX EVEX-promoted instructions.  */
   if (ins->vex.evex && ins->evex_type == evex_default)
     {
 
@@ -11994,7 +12020,7 @@  OP_E_memory (instr_info *ins, int bytemode, int sizeflag)
 	  print_operand_value (ins, disp & 0xffff, dis_style_text);
 	}
     }
-  if (ins->vex.b)
+  if (ins->vex.b && ins->evex_type == evex_default)
     {
       ins->evex_used |= EVEX_b_used;
 
@@ -13362,6 +13388,14 @@  OP_VEX (instr_info *ins, int bytemode, int sizeflag ATTRIBUTE_UNUSED)
   if (!ins->need_vex)
     return true;
 
+  /* Here vex.b is treated as "EVEX.ND".  */
+  if (ins->evex_type == evex_from_legacy)
+    {
+      ins->evex_used |= EVEX_b_used;
+      if (!ins->vex.b)
+	return true;
+    }
+
   reg = ins->vex.register_specifier;
   ins->vex.register_specifier = 0;
   if (ins->address_mode != mode_64bit)
@@ -13453,12 +13487,19 @@  OP_VEX (instr_info *ins, int bytemode, int sizeflag ATTRIBUTE_UNUSED)
 	  names = att_names_xmm;
 	  ins->evex_used |= EVEX_len_used;
 	  break;
+	case v_mode:
 	case dq_mode:
 	  if (ins->rex & REX_W)
 	    names = att_names64;
+	  else if (bytemode == v_mode
+		   && !(sizeflag & DFLAG))
+	    names = att_names16;
 	  else
 	    names = att_names32;
 	  break;
+	case b_mode:
+	  names = att_names8rex;
+	  break;
 	case mask_bd_mode:
 	case mask_mode:
 	  if (reg > 0x7)
diff --git a/opcodes/i386-opc.h b/opcodes/i386-opc.h
index 88717fd7575..256f5a3865e 100644
--- a/opcodes/i386-opc.h
+++ b/opcodes/i386-opc.h
@@ -638,8 +638,10 @@  enum
   Vex,
   /* How to encode VEX.vvvv:
      0: VEX.vvvv must be 1111b.
-     1: VEX.vvvv encodes one of the register operands.
+     1: VEX.vvvv encodes one of the src register operands.
+     2: VEX.vvvv encodes the dest register operand.
    */
+#define VexVVVV_DST   2
   VexVVVV,
   /* How the VEX.W bit is used:
      0: Set by the REX.W bit.
@@ -786,7 +788,7 @@  typedef struct i386_opcode_modifier
   unsigned int immext:1;
   unsigned int norex64:1;
   unsigned int vex:2;
-  unsigned int vexvvvv:1;
+  unsigned int vexvvvv:2;
   unsigned int vexw:2;
   unsigned int opcodeprefix:2;
   unsigned int sib:3;
diff --git a/opcodes/i386-opc.tbl b/opcodes/i386-opc.tbl
index b27131ef185..5aa00cb93ef 100644
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -139,9 +139,13 @@ 
 #define Vsz256 Vsz=VSZ256
 #define Vsz512 Vsz=VSZ512
 
+#define DstVVVV VexVVVV=VexVVVV_DST
+
 // The EVEX purpose of StaticRounding appears only together with SAE. Re-use
 // the bit to mark commutative VEX encodings where swapping the source
 // operands may allow to switch from 3-byte to 2-byte VEX encoding.
+// Also re-use the bit to mark NDD insns where swapping the source operands
+// may allow switching from EVEX encoding to REX2 encoding.
 #define C StaticRounding
 
 #define FP 387|287|8087
@@ -288,26 +292,40 @@  std, 0xfd, 0, NoSuf, {}
 sti, 0xfb, 0, NoSuf, {}
 
 // Arithmetic.
+add, 0x0, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 add, 0x0, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+add, 0x83/0, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 add, 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 add, 0x4, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+add, 0x80/0, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
 add, 0x80/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
 inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
+inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, {Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
 inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|Optimize|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sub, 0x83/5, APX_F, Modrm|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 sub, 0x83/5, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 sub, 0x2c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+sub, 0x80/5, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sub, 0x80/5, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
 dec, 0x48, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
+dec, 0xfe/1, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 dec, 0xfe/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+sbb, 0x18, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sbb, 0x18, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sbb, 0x18, APX_F, D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sbb, 0x83/3, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 sbb, 0x83/3, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
+sbb, 0x83/3, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 sbb, 0x1c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+sbb, 0x80/3, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sbb, 0x80/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sbb, 0x80/3, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
 cmp, 0x38, 0, D|W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 cmp, 0x83/7, 0, Modrm|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
@@ -318,31 +336,50 @@  test, 0x84, 0, D|W|C|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64, R
 test, 0xa8, 0, W|No_sSuf|Optimize, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
 test, 0xf6/0, 0, W|Modrm|No_sSuf|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+and, 0x20, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 and, 0x20, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+and, 0x83/4, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 and, 0x83/4, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock|Optimize, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 and, 0x24, 0, W|No_sSuf|Optimize, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+and, 0x80/4, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 and, 0x80/4, 0, W|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+or, 0x8, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 or, 0x8, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+or, 0x83/1, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 or, 0x83/1, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 or, 0xc, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+or, 0x80/1, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 or, 0x80/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+xor, 0x30, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 xor, 0x30, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+xor, 0x83/6, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 xor, 0x83/6, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 xor, 0x34, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+xor, 0x80/6, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 xor, 0x80/6, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
 // clr with 1 operand is really xor with 2 operands.
 clr, 0x30, 0, W|Modrm|No_sSuf|RegKludge|Optimize, { Reg8|Reg16|Reg32|Reg64 }
 
+adc, 0x10, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 adc, 0x10, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+adc, 0x10, APX_F, D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+adc, 0x83/2, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 adc, 0x83/2, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
+adc, 0x83/2, APX_F, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 adc, 0x14, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
+adc, 0x80/2, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 adc, 0x80/2, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+adc, 0x80/2, APX_F, W|Modrm|EVex128|EVexMap4|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+neg, 0xf6/3, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 neg, 0xf6/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+
+not, 0xf6/2, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 not, 0xf6/2, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+not, 0xf6/2, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
 aaa, 0x37, No64, NoSuf, {}
 aas, 0x3f, No64, NoSuf, {}
@@ -375,6 +412,7 @@  cqto, 0x99, x64, Size64|NoSuf, {}
 // These multiplies can only be selected with single operand forms.
 mul, 0xf6/4, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 imul, 0xf6/5, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+imul, 0xaf, APX_F, C|Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg16|Reg32|Reg64|Unspecified|Word|Dword|Qword|BaseIndex, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64 }
 imul, 0xfaf, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|Word|Dword|Qword|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x6b, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x69, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
@@ -389,52 +427,98 @@  div, 0xf6/6, 0, W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|
 idiv, 0xf6/7, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 idiv, 0xf6/7, 0, W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Acc|Byte|Word|Dword|Qword }
 
+rol, 0xd0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rol, 0xd0/0, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rol, 0xc0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm8S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rol, 0xc0/0, i186, W|Modrm|No_sSuf, { Imm8|Imm8S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rol, 0xd2/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rol, 0xd2/0, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rol, 0xd0/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rol, 0xd0/0, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+ror, 0xd0/1, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 ror, 0xd0/1, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+ror, 0xc0/1, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8|Imm8S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 ror, 0xc0/1, i186, W|Modrm|No_sSuf, { Imm8|Imm8S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+ror, 0xd2/1, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 ror, 0xd2/1, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+ror, 0xd0/1, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 ror, 0xd0/1, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+rcl, 0xd0/2, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcl, 0xd0/2, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xd0/2, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xc0/2, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcl, 0xc0/2, i186, W|Modrm|No_sSuf, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xc0/2, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xd2/2, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcl, 0xd2/2, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xd2/2, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xd0/2, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcl, 0xd0/2, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcl, 0xd0/2, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+rcr, 0xd0/3, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcr, 0xd0/3, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xd0/3, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xc0/3, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcr, 0xc0/3, i186, W|Modrm|No_sSuf, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xc0/3, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xd2/3, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcr, 0xd2/3, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xd2/3, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xd0/3, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 rcr, 0xd0/3, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+rcr, 0xd0/3, APX_F, W|Modrm|No_sSuf|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+sal, 0xd0/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sal, 0xd0/4, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sal, 0xc0/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sal, 0xc0/4, i186, W|Modrm|No_sSuf, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sal, 0xd2/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sal, 0xd2/4, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sal, 0xd0/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sal, 0xd0/4, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+shl, 0xd0/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shl, 0xd0/4, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+shl, 0xc0/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shl, 0xc0/4, i186, W|Modrm|No_sSuf, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+shl, 0xd2/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shl, 0xd2/4, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+shl, 0xd0/4, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shl, 0xd0/4, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+shr, 0xd0/5, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shr, 0xd0/5, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+shr, 0xc0/5, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shr, 0xc0/5, i186, W|Modrm|No_sSuf, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+shr, 0xd2/5, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shr, 0xd2/5, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+shr, 0xd0/5, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 shr, 0xd0/5, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+sar, 0xd0/7, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sar, 0xd0/7, 0, W|Modrm|No_sSuf, { Imm1, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sar, 0xc0/7, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sar, 0xc0/7, i186, W|Modrm|No_sSuf, { Imm8, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sar, 0xd2/7, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sar, 0xd2/7, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
+sar, 0xd0/7, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVex128|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 sar, 0xd0/7, 0, W|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 
+shld, 0x24, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 shld, 0xfa4, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
+shld, 0xa5, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 shld, 0xfa5, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { ShiftCount, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
+shld, 0xa5, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 shld, 0xfa5, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 
+shrd, 0x2c, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Imm8, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 shrd, 0xfac, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
+shrd, 0xad, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { ShiftCount, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 shrd, 0xfad, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { ShiftCount, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
+shrd, 0xad, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4|NF, { Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 shrd, 0xfad, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
 
 // Control transfer instructions.
@@ -940,6 +1024,7 @@  ud2b, 0xfb9, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|U
 // 3rd official undefined instr (older CPUs don't take a ModR/M byte)
 ud0, 0xfff, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 
+cmov<cc>, 0x4<cc:opc>, CMOV&APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64 }
 cmov<cc>, 0xf4<cc:opc>, CMOV, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 
 fcmovb, 0xda/0, i687, Modrm|NoSuf, { FloatReg, FloatAcc }
@@ -2027,8 +2112,12 @@  xcryptofb, 0xf30fa7e8, PadLock, NoSuf|RepPrefixOk, {}
 xstore, 0xfa7c0, PadLock, NoSuf|RepPrefixOk, {}
 
 // Multy-precision Add Carry, rdseed instructions.
+adcx, 0x6666, ADX&APX_F, C|Modrm|CheckOperandSize|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg32|Reg64|Unspecified|BaseIndex, Reg32|Reg64, Reg32|Reg64 }
 adcx, 0x660f38f6, ADX, Modrm|CheckOperandSize|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Reg32|Reg64|Unspecified|BaseIndex, Reg32|Reg64 }
+adcx, 0x6666, ADX&APX_F, Modrm|CheckOperandSize|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|EVex128|EVexMap4, { Reg32|Reg64|Unspecified|BaseIndex, Reg32|Reg64 }
+adox, 0xf366, ADX&APX_F, C|Modrm|CheckOperandSize|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|DstVVVV|EVex128|EVexMap4, { Reg32|Reg64|Unspecified|BaseIndex, Reg32|Reg64, Reg32|Reg64 }
 adox, 0xf30f38f6, ADX, Modrm|CheckOperandSize|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Reg32|Reg64|Unspecified|BaseIndex, Reg32|Reg64 }
+adox, 0xf366, ADX&APX_F, Modrm|CheckOperandSize|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|EVex128|EVexMap4, { Reg32|Reg64|Unspecified|BaseIndex, Reg32|Reg64 }
 rdseed, 0xfc7/7, RdSeed, Modrm|NoSuf, { Reg16|Reg32|Reg64 }
 
 // SMAP instructions.