x86: also prefer VEX encoding over EVEX one for VCVTNEPS2BF16 when possible

Message ID 4d46ae40-6bcf-49ff-b921-39a50ec3e219@suse.com
State Unresolved
Headers
Series x86: also prefer VEX encoding over EVEX one for VCVTNEPS2BF16 when possible

Checks

Context Check Description
snail/binutils-gdb-check warning Git am fail log

Commit Message

Jan Beulich Nov. 20, 2023, 8:03 a.m. UTC
  Handle VCVTNEPS2BF16, which 58bceb182740 ("x86: prefer VEX encodings over EVEX ones
when possible") left out because it is slightly less straightforward.
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -7475,20 +7475,27 @@  match_template (char mnem_suffix)
 	}
 
       /* Check whether to use the shorter VEX encoding for certain insns where
-	 the EVEX enconding comes first in the table.  This requires the respective
-	 AVX-* feature to be explicitly enabled.  */
-      if (t == current_templates->start
+	 the EVEX encoding comes first in the table.  This requires the respective
+	 AVX-* feature to be explicitly enabled.
+
+	 Most of the respective insns have just a single EVEX and a single VEX
+	 template.  The one that's presently different is generated using the
+	 Vxy / Exy constructs: There are 3 suffix-less EVEX forms, the latter
+	 two of which may fall back to their two corresponding VEX forms.  */
+      j = t->mnem_off != MN_vcvtneps2bf16 ? 1 : 2;
+      if ((t == current_templates->start || j > 1)
 	  && t->opcode_modifier.disp8memshift
 	  && !t->opcode_modifier.vex
 	  && !need_evex_encoding ()
-	  && t + 1 < current_templates->end
-	  && t[1].opcode_modifier.vex)
+	  && t + j < current_templates->end
+	  && t[j].opcode_modifier.vex)
 	{
 	  i386_cpu_flags cpu;
 	  unsigned int memshift = i.memshift;
 
 	  i.memshift = 0;
-	  cpu = cpu_flags_and (cpu_flags_from_attr (t[1].cpu), cpu_arch_isa_flags);
+	  cpu = cpu_flags_and (cpu_flags_from_attr (t[j].cpu),
+			       cpu_arch_isa_flags);
 	  if (!cpu_flags_all_zero (&cpu)
 	      && (!i.types[0].bitfield.disp8
 		  || !operand_type_check (i.types[0], disp)
@@ -7496,6 +7503,7 @@  match_template (char mnem_suffix)
 		  || fits_in_disp8 (i.op[0].disps->X_add_number)))
 	    {
 	      specific_error = progress (internal_error);
+	      t += j - 1;
 	      continue;
 	    }
 	  i.memshift = memshift;
--- a/gas/testsuite/gas/i386/avx-vex.l
+++ b/gas/testsuite/gas/i386/avx-vex.l
@@ -27,7 +27,8 @@ 
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F2F538 	>  vpmadd52luq \(%eax\)\{1to4\},%ymm1,%ymm2
 [ 	]*[0-9]+[ 	]+B410
 [ 	]*[0-9]+[ 	]+> *
-[ 	]*[0-9]+[ 	]+>.*
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16 %ymm0,%xmm1
+[ 	]*[0-9]+[ 	]+72C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16y %ymm0,%xmm1
 [ 	]*[0-9]+[ 	]+72C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16y 0x20\(%eax\),%xmm1
@@ -36,6 +37,12 @@ 
 [ 	]*[0-9]+[ 	]+724808
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E38 	>  vcvtneps2bf16y \(%eax\)\{1to8\},%xmm1
 [ 	]*[0-9]+[ 	]+7208
+[ 	]*[0-9]+[ 	]+> *
+[ 	]*[0-9]+[ 	]+>  \.intel_syntax noprefix
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E08 	>  vcvtneps2bf16 xmm0,xmmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+7201
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16 xmm0,ymmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+7201
 #...
 [ 	]*[0-9]+[ 	]+>  \.arch \.noavx512vl
 [ 	]*[0-9]+[ 	]+> *
@@ -56,7 +63,8 @@ 
 [ 	]*[0-9]+[ 	]+00
 [ 	]*[0-9]+[ 	]+>  vpmadd52luq \(%eax\)\{1to4\},%ymm1,%ymm2
 [ 	]*[0-9]+[ 	]+> *
-[ 	]*[0-9]+[ 	]+>.*
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16 %ymm0,%xmm1
+[ 	]*[0-9]+[ 	]+C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16y %ymm0,%xmm1
 [ 	]*[0-9]+[ 	]+C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16y 0x20\(%eax\),%xmm1
@@ -65,6 +73,12 @@ 
 [ 	]*[0-9]+[ 	]+88000100 *
 [ 	]*[0-9]+[ 	]+00
 [ 	]*[0-9]+[ 	]+>  vcvtneps2bf16y \(%eax\)\{1to8\},%xmm1
+[ 	]*[0-9]+[ 	]+> *
+[ 	]*[0-9]+[ 	]+>  \.intel_syntax noprefix
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27A72 	>  vcvtneps2bf16 xmm0,xmmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+01
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16 xmm0,ymmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+01
 #...
 [ 	]*[0-9]+[ 	]+>  \.arch \.noavx512f
 [ 	]*[0-9]+[ 	]+> *
@@ -85,7 +99,8 @@ 
 [ 	]*[0-9]+[ 	]+00
 [ 	]*[0-9]+[ 	]+>  vpmadd52luq \(%eax\)\{1to4\},%ymm1,%ymm2
 [ 	]*[0-9]+[ 	]+> *
-[ 	]*[0-9]+[ 	]+>.*
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16 %ymm0,%xmm1
+[ 	]*[0-9]+[ 	]+C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16y %ymm0,%xmm1
 [ 	]*[0-9]+[ 	]+C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16y 0x20\(%eax\),%xmm1
@@ -113,7 +128,8 @@ 
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F2F538 	>  vpmadd52luq \(%eax\)\{1to4\},%ymm1,%ymm2
 [ 	]*[0-9]+[ 	]+B410
 [ 	]*[0-9]+[ 	]+> *
-[ 	]*[0-9]+[ 	]+>.*
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16 %ymm0,%xmm1
+[ 	]*[0-9]+[ 	]+72C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16y %ymm0,%xmm1
 [ 	]*[0-9]+[ 	]+72C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16y 0x20\(%eax\),%xmm1
@@ -122,6 +138,12 @@ 
 [ 	]*[0-9]+[ 	]+724808
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E38 	>  vcvtneps2bf16y \(%eax\)\{1to8\},%xmm1
 [ 	]*[0-9]+[ 	]+7208
+[ 	]*[0-9]+[ 	]+> *
+[ 	]*[0-9]+[ 	]+>  \.intel_syntax noprefix
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E08 	>  vcvtneps2bf16 xmm0,xmmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+7201
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16 xmm0,ymmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+7201
 #...
 [ 	]*[0-9]+[ 	]+>  \.arch \.avx_ifma
 [ 	]*[0-9]+[ 	]+> *
@@ -141,7 +163,8 @@ 
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F2F538 	>  vpmadd52luq \(%eax\)\{1to4\},%ymm1,%ymm2
 [ 	]*[0-9]+[ 	]+B410
 [ 	]*[0-9]+[ 	]+> *
-[ 	]*[0-9]+[ 	]+>.*
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16 %ymm0,%xmm1
+[ 	]*[0-9]+[ 	]+72C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16y %ymm0,%xmm1
 [ 	]*[0-9]+[ 	]+72C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16y 0x20\(%eax\),%xmm1
@@ -150,6 +173,12 @@ 
 [ 	]*[0-9]+[ 	]+724808
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E38 	>  vcvtneps2bf16y \(%eax\)\{1to8\},%xmm1
 [ 	]*[0-9]+[ 	]+7208
+[ 	]*[0-9]+[ 	]+> *
+[ 	]*[0-9]+[ 	]+>  \.intel_syntax noprefix
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E08 	>  vcvtneps2bf16 xmm0,xmmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+7201
+[ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E28 	>  vcvtneps2bf16 xmm0,ymmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+7201
 #...
 [ 	]*[0-9]+[ 	]+>  \.arch \.avx_ne_convert
 [ 	]*[0-9]+[ 	]+> *
@@ -169,7 +198,8 @@ 
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F2F538 	>  vpmadd52luq \(%eax\)\{1to4\},%ymm1,%ymm2
 [ 	]*[0-9]+[ 	]+B410
 [ 	]*[0-9]+[ 	]+> *
-[ 	]*[0-9]+[ 	]+>.*
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16 %ymm0,%xmm1
+[ 	]*[0-9]+[ 	]+C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16y %ymm0,%xmm1
 [ 	]*[0-9]+[ 	]+C8
 [ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16y 0x20\(%eax\),%xmm1
@@ -178,4 +208,10 @@ 
 [ 	]*[0-9]+[ 	]+724808
 [ 	]*[0-9]+[ 	]+\?\?\?\? 62F27E38 	>  vcvtneps2bf16y \(%eax\)\{1to8\},%xmm1
 [ 	]*[0-9]+[ 	]+7208
+[ 	]*[0-9]+[ 	]+> *
+[ 	]*[0-9]+[ 	]+>  \.intel_syntax noprefix
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27A72 	>  vcvtneps2bf16 xmm0,xmmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+01
+[ 	]*[0-9]+[ 	]+\?\?\?\? C4E27E72 	>  vcvtneps2bf16 xmm0,ymmword ptr \[ecx\]
+[ 	]*[0-9]+[ 	]+01
 #pass
--- a/gas/testsuite/gas/i386/avx-vex.s
+++ b/gas/testsuite/gas/i386/avx-vex.s
@@ -14,10 +14,15 @@ 
 	vpmadd52luq	0x100(%eax), %ymm1, %ymm2
 	vpmadd52luq	(%eax){1to4}, %ymm1, %ymm2
 
-#	vcvtneps2bf16	%ymm0, %xmm1
+	vcvtneps2bf16	%ymm0, %xmm1
 	vcvtneps2bf16y	%ymm0, %xmm1
 	vcvtneps2bf16y	0x20(%eax), %xmm1
 	vcvtneps2bf16y	0x100(%eax), %xmm1
 	vcvtneps2bf16y	(%eax){1to8}, %xmm1
 
+	.intel_syntax noprefix
+	vcvtneps2bf16	xmm0, xmmword ptr [ecx]
+	vcvtneps2bf16	xmm0, ymmword ptr [ecx]
+	.att_syntax prefix
+
 	.endr
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1481,6 +1481,9 @@  gf2p8mulb<gfni>, 0x660f38cf, <gfni:cpu>G
     true_us:1f:C>
 
 // <Vxy> is used for VEX instructions with x/y suffixes.
+// NOTE: The order of the "unnamed" ($-prefixed) entries here needs to remain
+//       in sync with <Exy>, for match_template()'s EVEX-to-VEX lowering to
+//       continue to work.
 <Vxy:vex:syntax:src, +
     $i:Vex:IntelSyntax:RegXMM|RegYMM|Unspecified|BaseIndex, +
     $a:Vex:ATTSyntax:RegXMM|RegYMM, +
@@ -2097,6 +2100,9 @@  vpclmulhqhqdq, 0x6644/0x11, VPCLMULQDQ&(
 // AVX512F instructions.
 
 // <Exy> is used for EVEX instructions with x/y suffixes.
+// NOTE: The order of the "unnamed" ($-prefixed) entries here needs to remain
+//       in sync with <Vxy>, for match_template()'s EVEX-to-VEX lowering to
+//       continue to work.
 <Exy:vl:attr:sr:sae:src:dst, +
     $z::EVex512|Disp8MemShift=6:StaticRounding|SAE:SAE:RegZMM|Unspecified|BaseIndex:RegYMM, +
     $i:AVX512VL:Disp8ShiftVL|IntelSyntax:::RegXMM|RegYMM|Unspecified|BaseIndex:RegXMM, +