[v6,2/7] x86: re-work insn/suffix recognition

Message ID 07ef67fd-752c-ad1f-b8cb-4eaec1f420fc@suse.com
State Accepted
Headers
Series x86: suffix handling changes |

Checks

Context Check Description
snail/binutils-gdb-check success Github commit url

Commit Message

Jan Beulich Nov. 4, 2022, 10:51 a.m. UTC
  Having templates with a suffix explicitly present has always been
quirky. Introduce a 2nd matching pass in case the 1st one couldn't find
a suitable template _and_ didn't itself already need to trim off a
suffix to find a match at all. This requires error reporting adjustments
(albeit luckily fewer than I was afraid might be necessary), as errors
previously reported during matching now need deferring until after the
2nd pass (because, obviously, we must not emit any error if the 2nd pass
succeeds). While also related to PR gas/29524, it was requested that
move-with-sign-extend be left as broken as it always was.

PR gas/29525
Note that with the dropped CMPSD and MOVSD Intel Syntax string insn
templates taking operands, mixed IsString/non-IsString template groups
(with memory operands) cannot occur anymore. With that
maybe_adjust_templates() becomes unnecessary (and is hence being
removed).

PR gas/29526
Note further that while the additions to the intel16 testcase aren't
really proper Intel syntax, we've been permitting all of those except
for the MOVD variant. The test therefore is to avoid re-introducing such
an inconsistency.
---
To limit code churn I'm using "goto" for the retry loop, but I'd be
happy to make this a proper loop either right here or in a follow-on
change doing just the necessary re-indentation.

The "too many memory references" errors which are being deleted weren't
fully consistent anyway - even the majority of IsString insns accepts
only a single memory operand. If we wanted to retain that, it would need
re-introducing in md_assemble(), latching the error into i.error just
like match_template() does.

As an unrelated observation: Why is "MOVQ $imm64, %reg64" being
optimized but "MOVABS $imm64, %reg64" is not?
---
v6: Re-base over dropping of Pass2 attribute.
v5: Split off move-with-sign-extend changes.
v4: Retain support for operand-less MOVSD and CMPSD.
v3: Limit xstrdup() to just the templates where a 2nd pass actually
    makes sense (new Pass2 attribute).
  

Comments

H.J. Lu Nov. 4, 2022, 11:54 p.m. UTC | #1
On Fri, Nov 4, 2022 at 3:51 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> Having templates with a suffix explicitly present has always been
> quirky. Introduce a 2nd matching pass in case the 1st one couldn't find
> a suitable template _and_ didn't itself already need to trim off a
> suffix to find a match at all. This requires error reporting adjustments
> (albeit luckily fewer than I was afraid might be necessary), as errors
> previously reported during matching now need deferring until after the
> 2nd pass (because, obviously, we must not emit any error if the 2nd pass
> succeeds). While also related to PR gas/29524, it was requested that
> move-with-sign-extend be left as broken as it always was.
>
> PR gas/29525
> Note that with the dropped CMPSD and MOVSD Intel Syntax string insn
> templates taking operands, mixed IsString/non-IsString template groups
> (with memory operands) cannot occur anymore. With that
> maybe_adjust_templates() becomes unnecessary (and is hence being
> removed).
>
> PR gas/29526
> Note further that while the additions to the intel16 testcase aren't
> really proper Intel syntax, we've been permitting all of those except
> for the MOVD variant. The test therefore is to avoid re-introducing such
> an inconsistency.
> ---
> To limit code churn I'm using "goto" for the retry loop, but I'd be
> happy to make this a proper loop either right here or in a follow-on
> change doing just the necessary re-indentation.
>
> The "too many memory references" errors which are being deleted weren't
> fully consistent anyway - even the majority of IsString insns accepts
> only a single memory operand. If we wanted to retain that, it would need
> re-introducing in md_assemble(), latching the error into i.error just
> like match_template() does.
>
> As an unrelated observation: Why is "MOVQ $imm64, %reg64" being
> optimized but "MOVABS $imm64, %reg64" is not?
> ---
> v6: Re-base over dropping of Pass2 attribute.
> v5: Split off move-with-sign-extend changes.
> v4: Retain support for operand-less MOVSD and CMPSD.
> v3: Limit xstrdup() to just the templates where a 2nd pass actually
>     makes sense (new Pass2 attribute).

I don't think we should add a second pass.

> --- a/gas/config/tc-i386.c
> +++ b/gas/config/tc-i386.c
> @@ -297,9 +297,6 @@ struct _i386_insn
>         explicit segment overrides are given.  */
>      const reg_entry *seg[2];
>
> -    /* Copied first memory operand string, for re-checking.  */
> -    char *memop1_string;
> -
>      /* PREFIX holds all the given prefix opcodes (usually null).
>         PREFIXES is the number of prefix opcodes.  */
>      unsigned int prefixes;
> @@ -4293,7 +4290,20 @@ optimize_encoding (void)
>            movq $imm31, %r64   -> movl $imm31, %r32
>            movq $imm32, %r64   -> movl $imm32, %r32
>          */
> -      i.tm.opcode_modifier.norex64 = 1;
> +      i.tm.opcode_modifier.size = SIZE32;
> +      if (i.imm_operands)
> +       {
> +         i.types[0].bitfield.imm32 = 1;
> +         i.types[0].bitfield.imm32s = 0;
> +         i.types[0].bitfield.imm64 = 0;
> +       }
> +      else
> +       {
> +         i.types[0].bitfield.dword = 1;
> +         i.types[0].bitfield.qword = 0;
> +       }
> +      i.types[1].bitfield.dword = 1;
> +      i.types[1].bitfield.qword = 0;
>        if (i.tm.base_opcode == 0xb8 || (i.tm.base_opcode | 1) == 0xc7)
>         {
>           /* Handle
> @@ -4303,11 +4313,6 @@ optimize_encoding (void)
>           i.tm.operand_types[0].bitfield.imm32 = 1;
>           i.tm.operand_types[0].bitfield.imm32s = 0;
>           i.tm.operand_types[0].bitfield.imm64 = 0;
> -         i.types[0].bitfield.imm32 = 1;
> -         i.types[0].bitfield.imm32s = 0;
> -         i.types[0].bitfield.imm64 = 0;
> -         i.types[1].bitfield.dword = 1;
> -         i.types[1].bitfield.qword = 0;
>           if ((i.tm.base_opcode | 1) == 0xc7)
>             {
>               /* Handle
> @@ -4830,6 +4835,18 @@ insert_lfence_before (void)
>      }
>  }
>
> +/* Helper for md_assemble() to decide whether to prepare for a possible 2nd
> +   parsing pass. Instead of introducing a rarely used new insn attribute this
> +   utilizes a common pattern between affected templates. It is deemed
> +   acceptable that this will lead to unnecessary pass 2 preparations in a
> +   limited set of cases.  */
> +static INLINE bool may_need_pass2 (const insn_template *t)
> +{
> +  return t->opcode_modifier.sse2avx
> +        /* Note that all SSE2AVX templates have at least one operand.  */
> +        && t->operand_types[t->operands - 1].bitfield.class == RegSIMD;
> +}
> +
>  /* This is the guts of the machine-dependent assembler.  LINE points to a
>     machine dependent instruction.  This function is supposed to emit
>     the frags/bytes it assembles to.  */
> @@ -4838,11 +4855,14 @@ void
>  md_assemble (char *line)
>  {
>    unsigned int j;
> -  char mnemonic[MAX_MNEM_SIZE], mnem_suffix;
> -  const char *end;
> +  char mnemonic[MAX_MNEM_SIZE], mnem_suffix, *copy = NULL;
> +  const char *end, *pass1_mnem = NULL;
> +  enum i386_error pass1_err = 0;
>    const insn_template *t;
>
>    /* Initialize globals.  */
> +  current_templates = NULL;
> + retry:
>    memset (&i, '\0', sizeof (i));
>    i.rounding.type = rc_none;
>    for (j = 0; j < MAX_OPERANDS; j++)
> @@ -4857,16 +4877,26 @@ md_assemble (char *line)
>
>    end = parse_insn (line, mnemonic);
>    if (end == NULL)
> -    return;
> +    {
> +      if (pass1_mnem != NULL)
> +       goto match_error;
> +      return;
> +    }
> +  if (may_need_pass2 (current_templates->start))
> +    {
> +      /* Make a copy of the full line in case we need to retry.  */
> +      copy = xstrdup (line);
> +    }
>    line += end - line;
>    mnem_suffix = i.suffix;
>
>    line = parse_operands (line, mnemonic);
>    this_operand = -1;
> -  xfree (i.memop1_string);
> -  i.memop1_string = NULL;
>    if (line == NULL)
> -    return;
> +    {
> +      free (copy);
> +      return;
> +    }
>
>    /* Now we've parsed the mnemonic into a set of templates, and have the
>       operands at hand.  */
> @@ -4942,7 +4972,97 @@ md_assemble (char *line)
>       with the template operand types.  */
>
>    if (!(t = match_template (mnem_suffix)))
> -    return;
> +    {
> +      const char *err_msg;
> +
> +      if (copy && !mnem_suffix)
> +       {
> +         line = copy;
> +         copy = NULL;
> +         pass1_err = i.error;
> +         pass1_mnem = current_templates->start->name;
> +         goto retry;
> +       }
> +      free (copy);
> +  match_error:
> +      switch (pass1_mnem ? pass1_err : i.error)
> +       {
> +       default:
> +         abort ();
> +       case operand_size_mismatch:
> +         err_msg = _("operand size mismatch");
> +         break;
> +       case operand_type_mismatch:
> +         err_msg = _("operand type mismatch");
> +         break;
> +       case register_type_mismatch:
> +         err_msg = _("register type mismatch");
> +         break;
> +       case number_of_operands_mismatch:
> +         err_msg = _("number of operands mismatch");
> +         break;
> +       case invalid_instruction_suffix:
> +         err_msg = _("invalid instruction suffix");
> +         break;
> +       case bad_imm4:
> +         err_msg = _("constant doesn't fit in 4 bits");
> +         break;
> +       case unsupported_with_intel_mnemonic:
> +         err_msg = _("unsupported with Intel mnemonic");
> +         break;
> +       case unsupported_syntax:
> +         err_msg = _("unsupported syntax");
> +         break;
> +       case unsupported:
> +         as_bad (_("unsupported instruction `%s'"),
> +                 pass1_mnem ? pass1_mnem : current_templates->start->name);
> +         return;
> +       case invalid_sib_address:
> +         err_msg = _("invalid SIB address");
> +         break;
> +       case invalid_vsib_address:
> +         err_msg = _("invalid VSIB address");
> +         break;
> +       case invalid_vector_register_set:
> +         err_msg = _("mask, index, and destination registers must be distinct");
> +         break;
> +       case invalid_tmm_register_set:
> +         err_msg = _("all tmm registers must be distinct");
> +         break;
> +       case invalid_dest_and_src_register_set:
> +         err_msg = _("destination and source registers must be distinct");
> +         break;
> +       case unsupported_vector_index_register:
> +         err_msg = _("unsupported vector index register");
> +         break;
> +       case unsupported_broadcast:
> +         err_msg = _("unsupported broadcast");
> +         break;
> +       case broadcast_needed:
> +         err_msg = _("broadcast is needed for operand of such type");
> +         break;
> +       case unsupported_masking:
> +         err_msg = _("unsupported masking");
> +         break;
> +       case mask_not_on_destination:
> +         err_msg = _("mask not on destination operand");
> +         break;
> +       case no_default_mask:
> +         err_msg = _("default mask isn't allowed");
> +         break;
> +       case unsupported_rc_sae:
> +         err_msg = _("unsupported static rounding/sae");
> +         break;
> +       case invalid_register_operand:
> +         err_msg = _("invalid register operand");
> +         break;
> +       }
> +      as_bad (_("%s for `%s'"), err_msg,
> +             pass1_mnem ? pass1_mnem : current_templates->start->name);
> +      return;
> +    }
> +
> +  free (copy);
>
>    if (sse_check != check_none
>        /* The opcode space check isn't strictly needed; it's there only to
> @@ -5248,6 +5368,7 @@ parse_insn (const char *line, char *mnem
>  {
>    const char *l = line, *token_start = l;
>    char *mnem_p;
> +  bool pass1 = !current_templates;
>    int supported;
>    const insn_template *t;
>    char *dot_p = NULL;
> @@ -5417,8 +5538,10 @@ parse_insn (const char *line, char *mnem
>        current_templates = (const templates *) str_hash_find (op_hash, mnemonic);
>      }
>
> -  if (!current_templates)
> +  if (!current_templates || !pass1)
>      {
> +      current_templates = NULL;
> +
>      check_suffix:
>        if (mnem_p > mnemonic)
>         {
> @@ -5460,13 +5583,39 @@ parse_insn (const char *line, char *mnem
>                   current_templates
>                     = (const templates *) str_hash_find (op_hash, mnemonic);
>                 }
> +             /* For compatibility reasons accept MOVSD and CMPSD without
> +                operands even in AT&T mode.  */
> +             else if (*l == END_OF_INSN
> +                      || (is_space_char (*l) && l[1] == END_OF_INSN))
> +               {
> +                 mnem_p[-1] = '\0';
> +                 current_templates
> +                   = (const templates *) str_hash_find (op_hash, mnemonic);
> +                 if (current_templates != NULL
> +                     /* MOVS or CMPS */
> +                     && (current_templates->start->base_opcode | 2) == 0xa6
> +                     && current_templates->start->opcode_modifier.opcodespace
> +                        == SPACE_BASE
> +                     && mnem_p[-2] == 's')
> +                   {
> +                     as_warn (_("found `%sd'; assuming `%sl' was meant"),
> +                              mnemonic, mnemonic);
> +                     i.suffix = LONG_MNEM_SUFFIX;
> +                   }
> +                 else
> +                   {
> +                     current_templates = NULL;
> +                     mnem_p[-1] = 'd';
> +                   }
> +               }
>               break;
>             }
>         }
>
>        if (!current_templates)
>         {
> -         as_bad (_("no such instruction: `%s'"), token_start);
> +         if (pass1)
> +           as_bad (_("no such instruction: `%s'"), token_start);
>           return NULL;
>         }
>      }
> @@ -6870,81 +7019,7 @@ match_template (char mnem_suffix)
>    if (t == current_templates->end)
>      {
>        /* We found no match.  */
> -      const char *err_msg;
> -      switch (specific_error)
> -       {
> -       default:
> -         abort ();
> -       case operand_size_mismatch:
> -         err_msg = _("operand size mismatch");
> -         break;
> -       case operand_type_mismatch:
> -         err_msg = _("operand type mismatch");
> -         break;
> -       case register_type_mismatch:
> -         err_msg = _("register type mismatch");
> -         break;
> -       case number_of_operands_mismatch:
> -         err_msg = _("number of operands mismatch");
> -         break;
> -       case invalid_instruction_suffix:
> -         err_msg = _("invalid instruction suffix");
> -         break;
> -       case bad_imm4:
> -         err_msg = _("constant doesn't fit in 4 bits");
> -         break;
> -       case unsupported_with_intel_mnemonic:
> -         err_msg = _("unsupported with Intel mnemonic");
> -         break;
> -       case unsupported_syntax:
> -         err_msg = _("unsupported syntax");
> -         break;
> -       case unsupported:
> -         as_bad (_("unsupported instruction `%s'"),
> -                 current_templates->start->name);
> -         return NULL;
> -       case invalid_sib_address:
> -         err_msg = _("invalid SIB address");
> -         break;
> -       case invalid_vsib_address:
> -         err_msg = _("invalid VSIB address");
> -         break;
> -       case invalid_vector_register_set:
> -         err_msg = _("mask, index, and destination registers must be distinct");
> -         break;
> -       case invalid_tmm_register_set:
> -         err_msg = _("all tmm registers must be distinct");
> -         break;
> -       case invalid_dest_and_src_register_set:
> -         err_msg = _("destination and source registers must be distinct");
> -         break;
> -       case unsupported_vector_index_register:
> -         err_msg = _("unsupported vector index register");
> -         break;
> -       case unsupported_broadcast:
> -         err_msg = _("unsupported broadcast");
> -         break;
> -       case broadcast_needed:
> -         err_msg = _("broadcast is needed for operand of such type");
> -         break;
> -       case unsupported_masking:
> -         err_msg = _("unsupported masking");
> -         break;
> -       case mask_not_on_destination:
> -         err_msg = _("mask not on destination operand");
> -         break;
> -       case no_default_mask:
> -         err_msg = _("default mask isn't allowed");
> -         break;
> -       case unsupported_rc_sae:
> -         err_msg = _("unsupported static rounding/sae");
> -         break;
> -       case invalid_register_operand:
> -         err_msg = _("invalid register operand");
> -         break;
> -       }
> -      as_bad (_("%s for `%s'"), err_msg,
> -             current_templates->start->name);
> +      i.error = specific_error;
>        return NULL;
>      }
>
> @@ -11347,49 +11422,6 @@ RC_SAE_immediate (const char *imm_start)
>    return 1;
>  }
>
> -/* Only string instructions can have a second memory operand, so
> -   reduce current_templates to just those if it contains any.  */
> -static int
> -maybe_adjust_templates (void)
> -{
> -  const insn_template *t;
> -
> -  gas_assert (i.mem_operands == 1);
> -
> -  for (t = current_templates->start; t < current_templates->end; ++t)
> -    if (t->opcode_modifier.isstring)
> -      break;
> -
> -  if (t < current_templates->end)
> -    {
> -      static templates aux_templates;
> -      bool recheck;
> -
> -      aux_templates.start = t;
> -      for (; t < current_templates->end; ++t)
> -       if (!t->opcode_modifier.isstring)
> -         break;
> -      aux_templates.end = t;
> -
> -      /* Determine whether to re-check the first memory operand.  */
> -      recheck = (aux_templates.start != current_templates->start
> -                || t != current_templates->end);
> -
> -      current_templates = &aux_templates;
> -
> -      if (recheck)
> -       {
> -         i.mem_operands = 0;
> -         if (i.memop1_string != NULL
> -             && i386_index_check (i.memop1_string) == 0)
> -           return 0;
> -         i.mem_operands = 1;
> -       }
> -    }
> -
> -  return 1;
> -}
> -
>  static INLINE bool starts_memory_operand (char c)
>  {
>    return ISDIGIT (c)
> @@ -11540,17 +11572,6 @@ i386_att_operand (char *operand_string)
>        char *displacement_string_end;
>
>      do_memory_reference:
> -      if (i.mem_operands == 1 && !maybe_adjust_templates ())
> -       return 0;
> -      if ((i.mem_operands == 1
> -          && !current_templates->start->opcode_modifier.isstring)
> -         || i.mem_operands == 2)
> -       {
> -         as_bad (_("too many memory references for `%s'"),
> -                 current_templates->start->name);
> -         return 0;
> -       }
> -
>        /* Check for base index form.  We detect the base index form by
>          looking for an ')' at the end of the operand, searching
>          for the '(' matching it, and finding a REGISTER_PREFIX or ','
> @@ -11757,8 +11778,6 @@ i386_att_operand (char *operand_string)
>        if (i386_index_check (operand_string) == 0)
>         return 0;
>        i.flags[this_operand] |= Operand_Mem;
> -      if (i.mem_operands == 0)
> -       i.memop1_string = xstrdup (operand_string);
>        i.mem_operands++;
>      }
>    else
> --- a/gas/config/tc-i386-intel.c
> +++ b/gas/config/tc-i386-intel.c
> @@ -993,10 +993,7 @@ i386_intel_operand (char *operand_string
>            || intel_state.is_mem)
>      {
>        /* Memory operand.  */
> -      if (i.mem_operands == 1 && !maybe_adjust_templates ())
> -       return 0;
> -      if ((int) i.mem_operands
> -         >= 2 - !current_templates->start->opcode_modifier.isstring)
> +      if (i.mem_operands)
>         {
>           /* Handle
>
> @@ -1041,10 +1038,6 @@ i386_intel_operand (char *operand_string
>                     }
>                 }
>             }
> -
> -         as_bad (_("too many memory references for `%s'"),
> -                 current_templates->start->name);
> -         return 0;
>         }
>
>        /* Swap base and index in 16-bit memory operands like
> @@ -1158,8 +1151,6 @@ i386_intel_operand (char *operand_string
>         return 0;
>
>        i.flags[this_operand] |= Operand_Mem;
> -      if (i.mem_operands == 0)
> -       i.memop1_string = xstrdup (operand_string);
>        ++i.mem_operands;
>      }
>    else
> --- a/gas/testsuite/gas/i386/code16.d
> +++ b/gas/testsuite/gas/i386/code16.d
> @@ -1,5 +1,6 @@
>  #objdump: -drw -mi8086
>  #name: i386 with .code16
> +#warning_output: code16.e
>
>  .*: +file format .*
>
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/code16.e
> @@ -0,0 +1,3 @@
> +.*: Assembler messages:
> +.*:3: Warning: .* `movsd'.*`movsl'.*
> +.*:4: Warning: .* `cmpsd'.*`cmpsl'.*
> --- a/gas/testsuite/gas/i386/code16.s
> +++ b/gas/testsuite/gas/i386/code16.s
> @@ -2,8 +2,8 @@
>         .code16
>         rep; movsd
>         rep; cmpsd
> -       rep movsd %ds:(%si),%es:(%di)
> -       rep cmpsd %es:(%di),%ds:(%si)
> +       rep movsl %ds:(%si),%es:(%di)
> +       rep cmpsl %es:(%di),%ds:(%si)
>
>         mov     %cr2, %ecx
>         mov     %ecx, %cr2
> --- a/gas/testsuite/gas/i386/intel16.d
> +++ b/gas/testsuite/gas/i386/intel16.d
> @@ -20,4 +20,12 @@ Disassembly of section .text:
>    2c:  8d 02 [         ]*lea    \(%bp,%si\),%ax
>    2e:  8d 01 [         ]*lea    \(%bx,%di\),%ax
>    30:  8d 03 [         ]*lea    \(%bp,%di\),%ax
> -       ...
> +[      ]*[0-9a-f]+:    67 f7 13[       ]+notw[         ]+\(%ebx\)
> +[      ]*[0-9a-f]+:    66 f7 17[       ]+notl[         ]+\(%bx\)
> +[      ]*[0-9a-f]+:    67 0f 1f 03[    ]+nopw[         ]+\(%ebx\)
> +[      ]*[0-9a-f]+:    66 0f 1f 07[    ]+nopl[         ]+\(%bx\)
> +[      ]*[0-9a-f]+:    67 83 03 05[    ]+addw[         ]+\$0x5,\(%ebx\)
> +[      ]*[0-9a-f]+:    66 83 07 05[    ]+addl[         ]+\$0x5,\(%bx\)
> +[      ]*[0-9a-f]+:    67 c7 03 05 00[         ]+movw[         ]+\$0x5,\(%ebx\)
> +[      ]*[0-9a-f]+:    66 c7 07 05 00 00 00[   ]+movl[         ]+\$0x5,\(%bx\)
> +#pass
> --- a/gas/testsuite/gas/i386/intel16.s
> +++ b/gas/testsuite/gas/i386/intel16.s
> @@ -18,4 +18,14 @@
>   lea   ax, [di][bx]
>   lea   ax, [di][bp]
>
> - .p2align 4,0
> + notw  [ebx]
> + notd  [bx]
> +
> + nopw  [ebx]
> + nopd  [bx]
> +
> + addw  [ebx], 5
> + addd  [bx], 5
> +
> + movw  [ebx], 5
> + movd  [bx], 5
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -135,34 +135,27 @@
>  mov, 0xa0, None, CpuNo64, D|W|No_sSuf|No_qSuf|No_ldSuf, { Disp16|Disp32|Unspecified|Byte|Word|Dword, Acc|Byte|Word|Dword }
>  mov, 0xa0, None, Cpu64, D|W|No_sSuf|No_ldSuf, { Disp64|Unspecified|Byte|Word|Dword|Qword, Acc|Byte|Word|Dword|Qword }
>  movabs, 0xa0, None, Cpu64, D|W|No_sSuf|No_ldSuf, { Disp64|Unspecified|Byte|Word|Dword|Qword, Acc|Byte|Word|Dword|Qword }
> -movq, 0xa1, None, Cpu64, D|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Disp64|Unspecified|Qword, Acc|Qword }
>  mov, 0x88, None, 0, D|W|CheckRegSize|Modrm|No_sSuf|No_ldSuf|HLEPrefixRelease, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> -movq, 0x89, None, Cpu64, D|Modrm|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|HLEPrefixRelease, { Reg64, Reg64|Unspecified|Qword|BaseIndex }
>  // In the 64bit mode the short form mov immediate is redefined to have
>  // 64bit value.
>  mov, 0xb0, None, 0, W|No_sSuf|No_qSuf|No_ldSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32 }
>  mov, 0xc6, 0, 0, W|Modrm|No_sSuf|No_ldSuf|HLEPrefixRelease|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> -movq, 0xc7, 0, Cpu64, Modrm|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|HLEPrefixRelease|Optimize, { Imm32S, Reg64|Qword|Unspecified|BaseIndex }
>  mov, 0xb8, None, Cpu64, No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf|Optimize, { Imm64, Reg64 }
>  movabs, 0xb8, None, Cpu64, No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf, { Imm64, Reg64 }
> -movq, 0xb8, None, Cpu64, Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { Imm64, Reg64 }
>  // The segment register moves accept WordReg so that a segment register
>  // can be copied to a 32 bit register, and vice versa, without using a
>  // size prefix.  When moving to a 32 bit register, the upper 16 bits
>  // are set to an implementation defined value (on the Pentium Pro, the
>  // implementation defined value is zero).
> -mov, 0x8c, None, 0, RegMem|No_bSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { SReg, Reg16|Reg32|Reg64 }
> +mov, 0x8c, None, 0, RegMem|No_bSuf|No_sSuf|No_ldSuf|NoRex64, { SReg, Reg16|Reg32|Reg64 }
>  mov, 0x8c, None, 0, D|Modrm|IgnoreSize|No_bSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { SReg, Word|Unspecified|BaseIndex }
> -movq, 0x8c, None, Cpu64, D|RegMem|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { SReg, Reg64 }
> -mov, 0x8e, None, 0, Modrm|IgnoreSize|No_bSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { Reg16|Reg32|Reg64, SReg }
> +mov, 0x8e, None, 0, Modrm|IgnoreSize|No_bSuf|No_sSuf|No_ldSuf|NoRex64, { Reg16|Reg32|Reg64, SReg }
>  // Move to/from control debug registers.  In the 16 or 32bit modes
>  // they are 32bit.  In the 64bit mode they are 64bit.
>  mov, 0xf20, None, Cpu386|CpuNo64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf|No_ldSuf, { Control, Reg32 }
>  mov, 0xf20, None, Cpu64, D|RegMem|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf|NoRex64, { Control, Reg64 }
> -movq, 0xf20, None, Cpu64, D|RegMem|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { Control, Reg64 }
>  mov, 0xf21, None, Cpu386|CpuNo64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf|No_ldSuf, { Debug, Reg32 }
>  mov, 0xf21, None, Cpu64, D|RegMem|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf|NoRex64, { Debug, Reg64 }
> -movq, 0xf21, None, Cpu64, D|RegMem|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { Debug, Reg64 }
>  mov, 0xf24, None, Cpu386|CpuNo64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf|No_ldSuf, { Test, Reg32 }
>
>  // Move after swapping the bytes
> @@ -492,9 +485,6 @@ set<cc>, 0xf9<cc:opc>, 0, Cpu386, Modrm|
>  // String manipulation.
>  cmps, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
>  cmps, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp0|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> -// Intel mode string compare.
> -cmpsd, 0xa7, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsString|RepPrefixOk, {}
> -cmpsd, 0xa7, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsStringEsOp0|RepPrefixOk, { Dword|Unspecified|BaseIndex, Dword|Unspecified|BaseIndex }
>  scmp, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
>  scmp, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp0|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  ins, 0x6c, None, Cpu186, W|No_sSuf|No_qSuf|No_ldSuf|IsString|RepPrefixOk, {}
> @@ -509,9 +499,6 @@ slod, 0xac, None, 0, W|No_sSuf|No_ldSuf|
>  slod, 0xac, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Acc|Byte|Word|Dword|Qword }
>  movs, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
>  movs, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp1|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> -// Intel mode string move.
> -movsd, 0xa5, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsString|RepPrefixOk, {}
> -movsd, 0xa5, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsStringEsOp1|RepPrefixOk, { Dword|Unspecified|BaseIndex, Dword|Unspecified|BaseIndex }
>  smov, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
>  smov, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp1|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>  scas, 0xae, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
> @@ -993,6 +980,7 @@ movd, 0x666e, None, CpuAVX, D|Modrm|Vex1
>  movd, 0x666e, None, CpuAVX|Cpu64, D|Modrm|Vex=1|Space0F|VexW1|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64|SSE2AVX, { Reg64|BaseIndex, RegXMM }
>  movd, 0x660f6e, None, CpuSSE2, D|Modrm|IgnoreSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Reg32|Unspecified|BaseIndex, RegXMM }
>  movd, 0x660f6e, None, CpuSSE2|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|BaseIndex, RegXMM }
> +// The MMX templates have to remain after at least the SSE2AVX ones.
>  movd, 0xf6e, None, CpuMMX, D|Modrm|IgnoreSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Reg32|Unspecified|BaseIndex, RegMMX }
>  movd, 0xf6e, None, CpuMMX|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|BaseIndex, RegMMX }
>  movq, 0xf37e, None, CpuAVX, Load|Modrm|Vex=1|Space0F|VexWIG|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|SSE2AVX, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
> @@ -1001,6 +989,7 @@ movq, 0x666e, None, CpuAVX|Cpu64, D|Modr
>  movq, 0xf30f7e, None, CpuSSE2, Load|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Unspecified|Qword|BaseIndex|RegXMM, RegXMM }
>  movq, 0x660fd6, None, CpuSSE2, Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM, Unspecified|Qword|BaseIndex|RegXMM }
>  movq, 0x660f6e, None, CpuSSE2|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|Unspecified|BaseIndex, RegXMM }
> +// The MMX templates have to remain after at least the SSE2AVX ones.
>  movq, 0xf6f, None, CpuMMX, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Unspecified|Qword|BaseIndex|RegMMX, RegMMX }
>  movq, 0xf6e, None, CpuMMX|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|Unspecified|BaseIndex, RegMMX }
>  packssdw<mmx>, 0x<mmx:pfx>0f6b, None, <mmx:cpu>, Modrm|<mmx:attr>|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
>
  
Jan Beulich Nov. 7, 2022, 10:23 a.m. UTC | #2
On 05.11.2022 00:54, H.J. Lu wrote:
> On Fri, Nov 4, 2022 at 3:51 AM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> Having templates with a suffix explicitly present has always been
>> quirky. Introduce a 2nd matching pass in case the 1st one couldn't find
>> a suitable template _and_ didn't itself already need to trim off a
>> suffix to find a match at all. This requires error reporting adjustments
>> (albeit luckily fewer than I was afraid might be necessary), as errors
>> previously reported during matching now need deferring until after the
>> 2nd pass (because, obviously, we must not emit any error if the 2nd pass
>> succeeds). While also related to PR gas/29524, it was requested that
>> move-with-sign-extend be left as broken as it always was.
>>
>> PR gas/29525
>> Note that with the dropped CMPSD and MOVSD Intel Syntax string insn
>> templates taking operands, mixed IsString/non-IsString template groups
>> (with memory operands) cannot occur anymore. With that
>> maybe_adjust_templates() becomes unnecessary (and is hence being
>> removed).
>>
>> PR gas/29526
>> Note further that while the additions to the intel16 testcase aren't
>> really proper Intel syntax, we've been permitting all of those except
>> for the MOVD variant. The test therefore is to avoid re-introducing such
>> an inconsistency.
>> ---
>> To limit code churn I'm using "goto" for the retry loop, but I'd be
>> happy to make this a proper loop either right here or in a follow-on
>> change doing just the necessary re-indentation.
>>
>> The "too many memory references" errors which are being deleted weren't
>> fully consistent anyway - even the majority of IsString insns accepts
>> only a single memory operand. If we wanted to retain that, it would need
>> re-introducing in md_assemble(), latching the error into i.error just
>> like match_template() does.
>>
>> As an unrelated observation: Why is "MOVQ $imm64, %reg64" being
>> optimized but "MOVABS $imm64, %reg64" is not?
>> ---
>> v6: Re-base over dropping of Pass2 attribute.
>> v5: Split off move-with-sign-extend changes.
>> v4: Retain support for operand-less MOVSD and CMPSD.
>> v3: Limit xstrdup() to just the templates where a 2nd pass actually
>>     makes sense (new Pass2 attribute).
> 
> I don't think we should add a second pass.

So you've asked me to re-work the series several times just to _now_
say "no" altogether? What's your alternative proposal to address the
various shortcomings that this series is addressing? (Yes, patches 4
and 5 can, with some effort, probably be re-based ahead, but those
are only minor improvements found while doing the main piece of work
here, and they aren't strictly related to the main goal of the series.)

Plus I now really feel urged to point out that you're blocking further
work I have pending, which I keep re-basing over all the adjustments I
was making to address your comments (plus of course the new ISA
extension patches which have gone in recently, all of which also
collide with work I'm doing). This re-basing is non-trivial and hence
is consuming a considerable amount of time as well.

Jan
  
H.J. Lu Nov. 8, 2022, 1:21 a.m. UTC | #3
On Mon, Nov 7, 2022 at 2:23 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 05.11.2022 00:54, H.J. Lu wrote:
> > On Fri, Nov 4, 2022 at 3:51 AM Jan Beulich <jbeulich@suse.com> wrote:
> >>
> >> Having templates with a suffix explicitly present has always been
> >> quirky. Introduce a 2nd matching pass in case the 1st one couldn't find
> >> a suitable template _and_ didn't itself already need to trim off a
> >> suffix to find a match at all. This requires error reporting adjustments
> >> (albeit luckily fewer than I was afraid might be necessary), as errors
> >> previously reported during matching now need deferring until after the
> >> 2nd pass (because, obviously, we must not emit any error if the 2nd pass
> >> succeeds). While also related to PR gas/29524, it was requested that
> >> move-with-sign-extend be left as broken as it always was.
> >>
> >> PR gas/29525
> >> Note that with the dropped CMPSD and MOVSD Intel Syntax string insn
> >> templates taking operands, mixed IsString/non-IsString template groups
> >> (with memory operands) cannot occur anymore. With that
> >> maybe_adjust_templates() becomes unnecessary (and is hence being
> >> removed).
> >>
> >> PR gas/29526
> >> Note further that while the additions to the intel16 testcase aren't
> >> really proper Intel syntax, we've been permitting all of those except
> >> for the MOVD variant. The test therefore is to avoid re-introducing such
> >> an inconsistency.
> >> ---
> >> To limit code churn I'm using "goto" for the retry loop, but I'd be
> >> happy to make this a proper loop either right here or in a follow-on
> >> change doing just the necessary re-indentation.
> >>
> >> The "too many memory references" errors which are being deleted weren't
> >> fully consistent anyway - even the majority of IsString insns accepts
> >> only a single memory operand. If we wanted to retain that, it would need
> >> re-introducing in md_assemble(), latching the error into i.error just
> >> like match_template() does.
> >>
> >> As an unrelated observation: Why is "MOVQ $imm64, %reg64" being
> >> optimized but "MOVABS $imm64, %reg64" is not?
> >> ---
> >> v6: Re-base over dropping of Pass2 attribute.
> >> v5: Split off move-with-sign-extend changes.
> >> v4: Retain support for operand-less MOVSD and CMPSD.
> >> v3: Limit xstrdup() to just the templates where a 2nd pass actually
> >>     makes sense (new Pass2 attribute).
> >
> > I don't think we should add a second pass.
>
> So you've asked me to re-work the series several times just to _now_
> say "no" altogether? What's your alternative proposal to address the
> various shortcomings that this series is addressing? (Yes, patches 4
> and 5 can, with some effort, probably be re-based ahead, but those
> are only minor improvements found while doing the main piece of work
> here, and they aren't strictly related to the main goal of the series.)

The more I looked, the more I didn't think we needed the second pass.
The issues you raised are minor issues.  We shouldn't add the second
pass because of them.

> Plus I now really feel urged to point out that you're blocking further
> work I have pending, which I keep re-basing over all the adjustments I
> was making to address your comments (plus of course the new ISA
> extension patches which have gone in recently, all of which also
> collide with work I'm doing). This re-basing is non-trivial and hence
> is consuming a considerable amount of time as well.
>
> Jan
  
Jan Beulich Nov. 8, 2022, 8:29 a.m. UTC | #4
On 08.11.2022 02:21, H.J. Lu wrote:
> On Mon, Nov 7, 2022 at 2:23 AM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 05.11.2022 00:54, H.J. Lu wrote:
>>> On Fri, Nov 4, 2022 at 3:51 AM Jan Beulich <jbeulich@suse.com> wrote:
>>>>
>>>> Having templates with a suffix explicitly present has always been
>>>> quirky. Introduce a 2nd matching pass in case the 1st one couldn't find
>>>> a suitable template _and_ didn't itself already need to trim off a
>>>> suffix to find a match at all. This requires error reporting adjustments
>>>> (albeit luckily fewer than I was afraid might be necessary), as errors
>>>> previously reported during matching now need deferring until after the
>>>> 2nd pass (because, obviously, we must not emit any error if the 2nd pass
>>>> succeeds). While also related to PR gas/29524, it was requested that
>>>> move-with-sign-extend be left as broken as it always was.
>>>>
>>>> PR gas/29525
>>>> Note that with the dropped CMPSD and MOVSD Intel Syntax string insn
>>>> templates taking operands, mixed IsString/non-IsString template groups
>>>> (with memory operands) cannot occur anymore. With that
>>>> maybe_adjust_templates() becomes unnecessary (and is hence being
>>>> removed).
>>>>
>>>> PR gas/29526
>>>> Note further that while the additions to the intel16 testcase aren't
>>>> really proper Intel syntax, we've been permitting all of those except
>>>> for the MOVD variant. The test therefore is to avoid re-introducing such
>>>> an inconsistency.
>>>> ---
>>>> To limit code churn I'm using "goto" for the retry loop, but I'd be
>>>> happy to make this a proper loop either right here or in a follow-on
>>>> change doing just the necessary re-indentation.
>>>>
>>>> The "too many memory references" errors which are being deleted weren't
>>>> fully consistent anyway - even the majority of IsString insns accepts
>>>> only a single memory operand. If we wanted to retain that, it would need
>>>> re-introducing in md_assemble(), latching the error into i.error just
>>>> like match_template() does.
>>>>
>>>> As an unrelated observation: Why is "MOVQ $imm64, %reg64" being
>>>> optimized but "MOVABS $imm64, %reg64" is not?
>>>> ---
>>>> v6: Re-base over dropping of Pass2 attribute.
>>>> v5: Split off move-with-sign-extend changes.
>>>> v4: Retain support for operand-less MOVSD and CMPSD.
>>>> v3: Limit xstrdup() to just the templates where a 2nd pass actually
>>>>     makes sense (new Pass2 attribute).
>>>
>>> I don't think we should add a second pass.
>>
>> So you've asked me to re-work the series several times just to _now_
>> say "no" altogether? What's your alternative proposal to address the
>> various shortcomings that this series is addressing? (Yes, patches 4
>> and 5 can, with some effort, probably be re-based ahead, but those
>> are only minor improvements found while doing the main piece of work
>> here, and they aren't strictly related to the main goal of the series.)
> 
> The more I looked, the more I didn't think we needed the second pass.
> The issues you raised are minor issues.  We shouldn't add the second
> pass because of them.

IOW your view is "let's keep the assembler broken, and let's keep
surprising assembly programmers". I'm afraid what is to be considered
"minor" is up for debate _and_ it is also up for debate whether leaving
"minor" issues unfixed is okay. All the more so given that I've already done
the work for the cases here. Please take another look.

Furthermore I don't think patch 3 (which is merely a first step to take)
can really fall in that "minor" category: The "inventing" of suffixes
in Intel Syntax mode has always been bogus. Yet this patch builds on top
of what patch 2 does; at least at the moment I can't see doing that same
work without first removing all the stray "movq" templates. (Their
presence also gets in the way of patch 4, but there this could be dealt
with afaict.) In this context please also remember that I'm the
maintainer for Intel Syntax code, so it is at least an edge case for
you to block me making improvements there.

Nick, may I ask for a more project-wide view on this perspective that
H.J. is taking? I certainly don't mean to (ab)use my global maintainer
role to override him, but I also don't really agree with my view being
put off. Yet if there was a more general (unwritten) policy like this,
then of course I'd need to accept things the way they are.

Jan

>> Plus I now really feel urged to point out that you're blocking further
>> work I have pending, which I keep re-basing over all the adjustments I
>> was making to address your comments (plus of course the new ISA
>> extension patches which have gone in recently, all of which also
>> collide with work I'm doing). This re-basing is non-trivial and hence
>> is consuming a considerable amount of time as well.
>>
>> Jan
> 
> 
>
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -297,9 +297,6 @@  struct _i386_insn
        explicit segment overrides are given.  */
     const reg_entry *seg[2];
 
-    /* Copied first memory operand string, for re-checking.  */
-    char *memop1_string;
-
     /* PREFIX holds all the given prefix opcodes (usually null).
        PREFIXES is the number of prefix opcodes.  */
     unsigned int prefixes;
@@ -4293,7 +4290,20 @@  optimize_encoding (void)
 	   movq $imm31, %r64   -> movl $imm31, %r32
 	   movq $imm32, %r64   -> movl $imm32, %r32
         */
-      i.tm.opcode_modifier.norex64 = 1;
+      i.tm.opcode_modifier.size = SIZE32;
+      if (i.imm_operands)
+	{
+	  i.types[0].bitfield.imm32 = 1;
+	  i.types[0].bitfield.imm32s = 0;
+	  i.types[0].bitfield.imm64 = 0;
+	}
+      else
+	{
+	  i.types[0].bitfield.dword = 1;
+	  i.types[0].bitfield.qword = 0;
+	}
+      i.types[1].bitfield.dword = 1;
+      i.types[1].bitfield.qword = 0;
       if (i.tm.base_opcode == 0xb8 || (i.tm.base_opcode | 1) == 0xc7)
 	{
 	  /* Handle
@@ -4303,11 +4313,6 @@  optimize_encoding (void)
 	  i.tm.operand_types[0].bitfield.imm32 = 1;
 	  i.tm.operand_types[0].bitfield.imm32s = 0;
 	  i.tm.operand_types[0].bitfield.imm64 = 0;
-	  i.types[0].bitfield.imm32 = 1;
-	  i.types[0].bitfield.imm32s = 0;
-	  i.types[0].bitfield.imm64 = 0;
-	  i.types[1].bitfield.dword = 1;
-	  i.types[1].bitfield.qword = 0;
 	  if ((i.tm.base_opcode | 1) == 0xc7)
 	    {
 	      /* Handle
@@ -4830,6 +4835,18 @@  insert_lfence_before (void)
     }
 }
 
+/* Helper for md_assemble() to decide whether to prepare for a possible 2nd
+   parsing pass. Instead of introducing a rarely used new insn attribute, this
+   utilizes a common pattern between affected templates. It is deemed
+   acceptable that this will lead to unnecessary pass 2 preparations in a
+   limited set of cases.  */
+static INLINE bool may_need_pass2 (const insn_template *t)
+{
+  return t->opcode_modifier.sse2avx
+	 /* Note that all SSE2AVX templates have at least one operand.  */
+	 && t->operand_types[t->operands - 1].bitfield.class == RegSIMD;
+}
+
 /* This is the guts of the machine-dependent assembler.  LINE points to a
    machine dependent instruction.  This function is supposed to emit
    the frags/bytes it assembles to.  */
@@ -4838,11 +4855,14 @@  void
 md_assemble (char *line)
 {
   unsigned int j;
-  char mnemonic[MAX_MNEM_SIZE], mnem_suffix;
-  const char *end;
+  char mnemonic[MAX_MNEM_SIZE], mnem_suffix, *copy = NULL;
+  const char *end, *pass1_mnem = NULL;
+  enum i386_error pass1_err = 0;
   const insn_template *t;
 
   /* Initialize globals.  */
+  current_templates = NULL;
+ retry:
   memset (&i, '\0', sizeof (i));
   i.rounding.type = rc_none;
   for (j = 0; j < MAX_OPERANDS; j++)
@@ -4857,16 +4877,26 @@  md_assemble (char *line)
 
   end = parse_insn (line, mnemonic);
   if (end == NULL)
-    return;
+    {
+      if (pass1_mnem != NULL)
+	goto match_error;
+      return;
+    }
+  if (may_need_pass2 (current_templates->start))
+    {
+      /* Make a copy of the full line in case we need to retry.  */
+      copy = xstrdup (line);
+    }
   line += end - line;
   mnem_suffix = i.suffix;
 
   line = parse_operands (line, mnemonic);
   this_operand = -1;
-  xfree (i.memop1_string);
-  i.memop1_string = NULL;
   if (line == NULL)
-    return;
+    {
+      free (copy);
+      return;
+    }
 
   /* Now we've parsed the mnemonic into a set of templates, and have the
      operands at hand.  */
@@ -4942,7 +4972,97 @@  md_assemble (char *line)
      with the template operand types.  */
 
   if (!(t = match_template (mnem_suffix)))
-    return;
+    {
+      const char *err_msg;
+
+      if (copy && !mnem_suffix)
+	{
+	  line = copy;
+	  copy = NULL;
+	  pass1_err = i.error;
+	  pass1_mnem = current_templates->start->name;
+	  goto retry;
+	}
+      free (copy);
+  match_error:
+      switch (pass1_mnem ? pass1_err : i.error)
+	{
+	default:
+	  abort ();
+	case operand_size_mismatch:
+	  err_msg = _("operand size mismatch");
+	  break;
+	case operand_type_mismatch:
+	  err_msg = _("operand type mismatch");
+	  break;
+	case register_type_mismatch:
+	  err_msg = _("register type mismatch");
+	  break;
+	case number_of_operands_mismatch:
+	  err_msg = _("number of operands mismatch");
+	  break;
+	case invalid_instruction_suffix:
+	  err_msg = _("invalid instruction suffix");
+	  break;
+	case bad_imm4:
+	  err_msg = _("constant doesn't fit in 4 bits");
+	  break;
+	case unsupported_with_intel_mnemonic:
+	  err_msg = _("unsupported with Intel mnemonic");
+	  break;
+	case unsupported_syntax:
+	  err_msg = _("unsupported syntax");
+	  break;
+	case unsupported:
+	  as_bad (_("unsupported instruction `%s'"),
+		  pass1_mnem ? pass1_mnem : current_templates->start->name);
+	  return;
+	case invalid_sib_address:
+	  err_msg = _("invalid SIB address");
+	  break;
+	case invalid_vsib_address:
+	  err_msg = _("invalid VSIB address");
+	  break;
+	case invalid_vector_register_set:
+	  err_msg = _("mask, index, and destination registers must be distinct");
+	  break;
+	case invalid_tmm_register_set:
+	  err_msg = _("all tmm registers must be distinct");
+	  break;
+	case invalid_dest_and_src_register_set:
+	  err_msg = _("destination and source registers must be distinct");
+	  break;
+	case unsupported_vector_index_register:
+	  err_msg = _("unsupported vector index register");
+	  break;
+	case unsupported_broadcast:
+	  err_msg = _("unsupported broadcast");
+	  break;
+	case broadcast_needed:
+	  err_msg = _("broadcast is needed for operand of such type");
+	  break;
+	case unsupported_masking:
+	  err_msg = _("unsupported masking");
+	  break;
+	case mask_not_on_destination:
+	  err_msg = _("mask not on destination operand");
+	  break;
+	case no_default_mask:
+	  err_msg = _("default mask isn't allowed");
+	  break;
+	case unsupported_rc_sae:
+	  err_msg = _("unsupported static rounding/sae");
+	  break;
+	case invalid_register_operand:
+	  err_msg = _("invalid register operand");
+	  break;
+	}
+      as_bad (_("%s for `%s'"), err_msg,
+	      pass1_mnem ? pass1_mnem : current_templates->start->name);
+      return;
+    }
+
+  free (copy);
 
   if (sse_check != check_none
       /* The opcode space check isn't strictly needed; it's there only to
@@ -5248,6 +5368,7 @@  parse_insn (const char *line, char *mnem
 {
   const char *l = line, *token_start = l;
   char *mnem_p;
+  bool pass1 = !current_templates;
   int supported;
   const insn_template *t;
   char *dot_p = NULL;
@@ -5417,8 +5538,10 @@  parse_insn (const char *line, char *mnem
       current_templates = (const templates *) str_hash_find (op_hash, mnemonic);
     }
 
-  if (!current_templates)
+  if (!current_templates || !pass1)
     {
+      current_templates = NULL;
+
     check_suffix:
       if (mnem_p > mnemonic)
 	{
@@ -5460,13 +5583,39 @@  parse_insn (const char *line, char *mnem
 		  current_templates
 		    = (const templates *) str_hash_find (op_hash, mnemonic);
 		}
+	      /* For compatibility reasons accept MOVSD and CMPSD without
+	         operands even in AT&T mode.  */
+	      else if (*l == END_OF_INSN
+		       || (is_space_char (*l) && l[1] == END_OF_INSN))
+		{
+		  mnem_p[-1] = '\0';
+		  current_templates
+		    = (const templates *) str_hash_find (op_hash, mnemonic);
+		  if (current_templates != NULL
+		      /* MOVS or CMPS */
+		      && (current_templates->start->base_opcode | 2) == 0xa6
+		      && current_templates->start->opcode_modifier.opcodespace
+			 == SPACE_BASE
+		      && mnem_p[-2] == 's')
+		    {
+		      as_warn (_("found `%sd'; assuming `%sl' was meant"),
+			       mnemonic, mnemonic);
+		      i.suffix = LONG_MNEM_SUFFIX;
+		    }
+		  else
+		    {
+		      current_templates = NULL;
+		      mnem_p[-1] = 'd';
+		    }
+		}
 	      break;
 	    }
 	}
 
       if (!current_templates)
 	{
-	  as_bad (_("no such instruction: `%s'"), token_start);
+	  if (pass1)
+	    as_bad (_("no such instruction: `%s'"), token_start);
 	  return NULL;
 	}
     }
@@ -6870,81 +7019,7 @@  match_template (char mnem_suffix)
   if (t == current_templates->end)
     {
       /* We found no match.  */
-      const char *err_msg;
-      switch (specific_error)
-	{
-	default:
-	  abort ();
-	case operand_size_mismatch:
-	  err_msg = _("operand size mismatch");
-	  break;
-	case operand_type_mismatch:
-	  err_msg = _("operand type mismatch");
-	  break;
-	case register_type_mismatch:
-	  err_msg = _("register type mismatch");
-	  break;
-	case number_of_operands_mismatch:
-	  err_msg = _("number of operands mismatch");
-	  break;
-	case invalid_instruction_suffix:
-	  err_msg = _("invalid instruction suffix");
-	  break;
-	case bad_imm4:
-	  err_msg = _("constant doesn't fit in 4 bits");
-	  break;
-	case unsupported_with_intel_mnemonic:
-	  err_msg = _("unsupported with Intel mnemonic");
-	  break;
-	case unsupported_syntax:
-	  err_msg = _("unsupported syntax");
-	  break;
-	case unsupported:
-	  as_bad (_("unsupported instruction `%s'"),
-		  current_templates->start->name);
-	  return NULL;
-	case invalid_sib_address:
-	  err_msg = _("invalid SIB address");
-	  break;
-	case invalid_vsib_address:
-	  err_msg = _("invalid VSIB address");
-	  break;
-	case invalid_vector_register_set:
-	  err_msg = _("mask, index, and destination registers must be distinct");
-	  break;
-	case invalid_tmm_register_set:
-	  err_msg = _("all tmm registers must be distinct");
-	  break;
-	case invalid_dest_and_src_register_set:
-	  err_msg = _("destination and source registers must be distinct");
-	  break;
-	case unsupported_vector_index_register:
-	  err_msg = _("unsupported vector index register");
-	  break;
-	case unsupported_broadcast:
-	  err_msg = _("unsupported broadcast");
-	  break;
-	case broadcast_needed:
-	  err_msg = _("broadcast is needed for operand of such type");
-	  break;
-	case unsupported_masking:
-	  err_msg = _("unsupported masking");
-	  break;
-	case mask_not_on_destination:
-	  err_msg = _("mask not on destination operand");
-	  break;
-	case no_default_mask:
-	  err_msg = _("default mask isn't allowed");
-	  break;
-	case unsupported_rc_sae:
-	  err_msg = _("unsupported static rounding/sae");
-	  break;
-	case invalid_register_operand:
-	  err_msg = _("invalid register operand");
-	  break;
-	}
-      as_bad (_("%s for `%s'"), err_msg,
-	      current_templates->start->name);
+      i.error = specific_error;
       return NULL;
     }
 
@@ -11347,49 +11422,6 @@  RC_SAE_immediate (const char *imm_start)
   return 1;
 }
 
-/* Only string instructions can have a second memory operand, so
-   reduce current_templates to just those if it contains any.  */
-static int
-maybe_adjust_templates (void)
-{
-  const insn_template *t;
-
-  gas_assert (i.mem_operands == 1);
-
-  for (t = current_templates->start; t < current_templates->end; ++t)
-    if (t->opcode_modifier.isstring)
-      break;
-
-  if (t < current_templates->end)
-    {
-      static templates aux_templates;
-      bool recheck;
-
-      aux_templates.start = t;
-      for (; t < current_templates->end; ++t)
-	if (!t->opcode_modifier.isstring)
-	  break;
-      aux_templates.end = t;
-
-      /* Determine whether to re-check the first memory operand.  */
-      recheck = (aux_templates.start != current_templates->start
-		 || t != current_templates->end);
-
-      current_templates = &aux_templates;
-
-      if (recheck)
-	{
-	  i.mem_operands = 0;
-	  if (i.memop1_string != NULL
-	      && i386_index_check (i.memop1_string) == 0)
-	    return 0;
-	  i.mem_operands = 1;
-	}
-    }
-
-  return 1;
-}
-
 static INLINE bool starts_memory_operand (char c)
 {
   return ISDIGIT (c)
@@ -11540,17 +11572,6 @@  i386_att_operand (char *operand_string)
       char *displacement_string_end;
 
     do_memory_reference:
-      if (i.mem_operands == 1 && !maybe_adjust_templates ())
-	return 0;
-      if ((i.mem_operands == 1
-	   && !current_templates->start->opcode_modifier.isstring)
-	  || i.mem_operands == 2)
-	{
-	  as_bad (_("too many memory references for `%s'"),
-		  current_templates->start->name);
-	  return 0;
-	}
-
       /* Check for base index form.  We detect the base index form by
 	 looking for an ')' at the end of the operand, searching
 	 for the '(' matching it, and finding a REGISTER_PREFIX or ','
@@ -11757,8 +11778,6 @@  i386_att_operand (char *operand_string)
       if (i386_index_check (operand_string) == 0)
 	return 0;
       i.flags[this_operand] |= Operand_Mem;
-      if (i.mem_operands == 0)
-	i.memop1_string = xstrdup (operand_string);
       i.mem_operands++;
     }
   else
--- a/gas/config/tc-i386-intel.c
+++ b/gas/config/tc-i386-intel.c
@@ -993,10 +993,7 @@  i386_intel_operand (char *operand_string
 	   || intel_state.is_mem)
     {
       /* Memory operand.  */
-      if (i.mem_operands == 1 && !maybe_adjust_templates ())
-	return 0;
-      if ((int) i.mem_operands
-	  >= 2 - !current_templates->start->opcode_modifier.isstring)
+      if (i.mem_operands)
 	{
 	  /* Handle
 
@@ -1041,10 +1038,6 @@  i386_intel_operand (char *operand_string
 		    }
 		}
 	    }
-
-	  as_bad (_("too many memory references for `%s'"),
-		  current_templates->start->name);
-	  return 0;
 	}
 
       /* Swap base and index in 16-bit memory operands like
@@ -1158,8 +1151,6 @@  i386_intel_operand (char *operand_string
 	return 0;
 
       i.flags[this_operand] |= Operand_Mem;
-      if (i.mem_operands == 0)
-	i.memop1_string = xstrdup (operand_string);
       ++i.mem_operands;
     }
   else
--- a/gas/testsuite/gas/i386/code16.d
+++ b/gas/testsuite/gas/i386/code16.d
@@ -1,5 +1,6 @@ 
 #objdump: -drw -mi8086
 #name: i386 with .code16
+#warning_output: code16.e
 
 .*: +file format .*
 
--- /dev/null
+++ b/gas/testsuite/gas/i386/code16.e
@@ -0,0 +1,3 @@ 
+.*: Assembler messages:
+.*:3: Warning: .* `movsd'.*`movsl'.*
+.*:4: Warning: .* `cmpsd'.*`cmpsl'.*
--- a/gas/testsuite/gas/i386/code16.s
+++ b/gas/testsuite/gas/i386/code16.s
@@ -2,8 +2,8 @@ 
 	.code16
 	rep; movsd
 	rep; cmpsd
-	rep movsd %ds:(%si),%es:(%di)
-	rep cmpsd %es:(%di),%ds:(%si)
+	rep movsl %ds:(%si),%es:(%di)
+	rep cmpsl %es:(%di),%ds:(%si)
 
 	mov	%cr2, %ecx
 	mov	%ecx, %cr2
--- a/gas/testsuite/gas/i386/intel16.d
+++ b/gas/testsuite/gas/i386/intel16.d
@@ -20,4 +20,12 @@  Disassembly of section .text:
   2c:	8d 02 [ 	]*lea    \(%bp,%si\),%ax
   2e:	8d 01 [ 	]*lea    \(%bx,%di\),%ax
   30:	8d 03 [ 	]*lea    \(%bp,%di\),%ax
-	...
+[ 	]*[0-9a-f]+:	67 f7 13[ 	]+notw[ 	]+\(%ebx\)
+[ 	]*[0-9a-f]+:	66 f7 17[ 	]+notl[ 	]+\(%bx\)
+[ 	]*[0-9a-f]+:	67 0f 1f 03[ 	]+nopw[ 	]+\(%ebx\)
+[ 	]*[0-9a-f]+:	66 0f 1f 07[ 	]+nopl[ 	]+\(%bx\)
+[ 	]*[0-9a-f]+:	67 83 03 05[ 	]+addw[ 	]+\$0x5,\(%ebx\)
+[ 	]*[0-9a-f]+:	66 83 07 05[ 	]+addl[ 	]+\$0x5,\(%bx\)
+[ 	]*[0-9a-f]+:	67 c7 03 05 00[ 	]+movw[ 	]+\$0x5,\(%ebx\)
+[ 	]*[0-9a-f]+:	66 c7 07 05 00 00 00[ 	]+movl[ 	]+\$0x5,\(%bx\)
+#pass
--- a/gas/testsuite/gas/i386/intel16.s
+++ b/gas/testsuite/gas/i386/intel16.s
@@ -18,4 +18,14 @@ 
  lea	ax, [di][bx]
  lea	ax, [di][bp]
 
- .p2align 4,0
+ notw	[ebx]
+ notd	[bx]
+
+ nopw	[ebx]
+ nopd	[bx]
+
+ addw	[ebx], 5
+ addd	[bx], 5
+
+ movw	[ebx], 5
+ movd	[bx], 5
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -135,34 +135,27 @@ 
 mov, 0xa0, None, CpuNo64, D|W|No_sSuf|No_qSuf|No_ldSuf, { Disp16|Disp32|Unspecified|Byte|Word|Dword, Acc|Byte|Word|Dword }
 mov, 0xa0, None, Cpu64, D|W|No_sSuf|No_ldSuf, { Disp64|Unspecified|Byte|Word|Dword|Qword, Acc|Byte|Word|Dword|Qword }
 movabs, 0xa0, None, Cpu64, D|W|No_sSuf|No_ldSuf, { Disp64|Unspecified|Byte|Word|Dword|Qword, Acc|Byte|Word|Dword|Qword }
-movq, 0xa1, None, Cpu64, D|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Disp64|Unspecified|Qword, Acc|Qword }
 mov, 0x88, None, 0, D|W|CheckRegSize|Modrm|No_sSuf|No_ldSuf|HLEPrefixRelease, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
-movq, 0x89, None, Cpu64, D|Modrm|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|HLEPrefixRelease, { Reg64, Reg64|Unspecified|Qword|BaseIndex }
 // In the 64bit mode the short form mov immediate is redefined to have
 // 64bit value.
 mov, 0xb0, None, 0, W|No_sSuf|No_qSuf|No_ldSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32 }
 mov, 0xc6, 0, 0, W|Modrm|No_sSuf|No_ldSuf|HLEPrefixRelease|Optimize, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
-movq, 0xc7, 0, Cpu64, Modrm|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|HLEPrefixRelease|Optimize, { Imm32S, Reg64|Qword|Unspecified|BaseIndex }
 mov, 0xb8, None, Cpu64, No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf|Optimize, { Imm64, Reg64 }
 movabs, 0xb8, None, Cpu64, No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf, { Imm64, Reg64 }
-movq, 0xb8, None, Cpu64, Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { Imm64, Reg64 }
 // The segment register moves accept WordReg so that a segment register
 // can be copied to a 32 bit register, and vice versa, without using a
 // size prefix.  When moving to a 32 bit register, the upper 16 bits
 // are set to an implementation defined value (on the Pentium Pro, the
 // implementation defined value is zero).
-mov, 0x8c, None, 0, RegMem|No_bSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { SReg, Reg16|Reg32|Reg64 }
+mov, 0x8c, None, 0, RegMem|No_bSuf|No_sSuf|No_ldSuf|NoRex64, { SReg, Reg16|Reg32|Reg64 }
 mov, 0x8c, None, 0, D|Modrm|IgnoreSize|No_bSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { SReg, Word|Unspecified|BaseIndex }
-movq, 0x8c, None, Cpu64, D|RegMem|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { SReg, Reg64 }
-mov, 0x8e, None, 0, Modrm|IgnoreSize|No_bSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { Reg16|Reg32|Reg64, SReg }
+mov, 0x8e, None, 0, Modrm|IgnoreSize|No_bSuf|No_sSuf|No_ldSuf|NoRex64, { Reg16|Reg32|Reg64, SReg }
 // Move to/from control debug registers.  In the 16 or 32bit modes
 // they are 32bit.  In the 64bit mode they are 64bit.
 mov, 0xf20, None, Cpu386|CpuNo64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf|No_ldSuf, { Control, Reg32 }
 mov, 0xf20, None, Cpu64, D|RegMem|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf|NoRex64, { Control, Reg64 }
-movq, 0xf20, None, Cpu64, D|RegMem|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { Control, Reg64 }
 mov, 0xf21, None, Cpu386|CpuNo64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf|No_ldSuf, { Debug, Reg32 }
 mov, 0xf21, None, Cpu64, D|RegMem|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_ldSuf|NoRex64, { Debug, Reg64 }
-movq, 0xf21, None, Cpu64, D|RegMem|Size64|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|NoRex64, { Debug, Reg64 }
 mov, 0xf24, None, Cpu386|CpuNo64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf|No_ldSuf, { Test, Reg32 }
 
 // Move after swapping the bytes
@@ -492,9 +485,6 @@  set<cc>, 0xf9<cc:opc>, 0, Cpu386, Modrm|
 // String manipulation.
 cmps, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
 cmps, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp0|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
-// Intel mode string compare.
-cmpsd, 0xa7, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsString|RepPrefixOk, {}
-cmpsd, 0xa7, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsStringEsOp0|RepPrefixOk, { Dword|Unspecified|BaseIndex, Dword|Unspecified|BaseIndex }
 scmp, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
 scmp, 0xa6, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp0|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 ins, 0x6c, None, Cpu186, W|No_sSuf|No_qSuf|No_ldSuf|IsString|RepPrefixOk, {}
@@ -509,9 +499,6 @@  slod, 0xac, None, 0, W|No_sSuf|No_ldSuf|
 slod, 0xac, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Acc|Byte|Word|Dword|Qword }
 movs, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
 movs, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp1|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
-// Intel mode string move.
-movsd, 0xa5, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsString|RepPrefixOk, {}
-movsd, 0xa5, None, Cpu386, Size32|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|IsStringEsOp1|RepPrefixOk, { Dword|Unspecified|BaseIndex, Dword|Unspecified|BaseIndex }
 smov, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
 smov, 0xa4, None, 0, W|No_sSuf|No_ldSuf|IsStringEsOp1|RepPrefixOk, { Byte|Word|Dword|Qword|Unspecified|BaseIndex, Byte|Word|Dword|Qword|Unspecified|BaseIndex }
 scas, 0xae, None, 0, W|No_sSuf|No_ldSuf|IsString|RepPrefixOk, {}
@@ -993,6 +980,7 @@  movd, 0x666e, None, CpuAVX, D|Modrm|Vex1
 movd, 0x666e, None, CpuAVX|Cpu64, D|Modrm|Vex=1|Space0F|VexW1|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64|SSE2AVX, { Reg64|BaseIndex, RegXMM }
 movd, 0x660f6e, None, CpuSSE2, D|Modrm|IgnoreSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Reg32|Unspecified|BaseIndex, RegXMM }
 movd, 0x660f6e, None, CpuSSE2|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|BaseIndex, RegXMM }
+// The MMX templates have to remain after at least the SSE2AVX ones.
 movd, 0xf6e, None, CpuMMX, D|Modrm|IgnoreSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Reg32|Unspecified|BaseIndex, RegMMX }
 movd, 0xf6e, None, CpuMMX|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|BaseIndex, RegMMX }
 movq, 0xf37e, None, CpuAVX, Load|Modrm|Vex=1|Space0F|VexWIG|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|SSE2AVX, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
@@ -1001,6 +989,7 @@  movq, 0x666e, None, CpuAVX|Cpu64, D|Modr
 movq, 0xf30f7e, None, CpuSSE2, Load|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Unspecified|Qword|BaseIndex|RegXMM, RegXMM }
 movq, 0x660fd6, None, CpuSSE2, Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM, Unspecified|Qword|BaseIndex|RegXMM }
 movq, 0x660f6e, None, CpuSSE2|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|Unspecified|BaseIndex, RegXMM }
+// The MMX templates have to remain after at least the SSE2AVX ones.
 movq, 0xf6f, None, CpuMMX, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Unspecified|Qword|BaseIndex|RegMMX, RegMMX }
 movq, 0xf6e, None, CpuMMX|Cpu64, D|Modrm|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Size64, { Reg64|Unspecified|BaseIndex, RegMMX }
 packssdw<mmx>, 0x<mmx:pfx>0f6b, None, <mmx:cpu>, Modrm|<mmx:attr>|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }