Detect bswap + rotate for byte permutation in pass_bswap.

Message ID 20230509070604.3466556-1-hongtao.liu@intel.com
State Accepted
Headers
Series Detect bswap + rotate for byte permutation in pass_bswap. |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

liuhongt May 9, 2023, 7:06 a.m. UTC
  The patch doesn't handle:
  1. cast64_to_32,
  2. memory source with rsize < range.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR middle-end/108938
	* gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New
	function, cut from original find_bswap_or_nop function.
	(find_bswap_or_nop): Add a new parameter, detect bswap +
	rotate and save rotate result in the new parameter.
	(bswap_replace): Add a new parameter to indicate rotate and
	generate rotate stmt if needed.
	(maybe_optimize_vector_constructor): Adjust for new rotate
	parameter in the upper 2 functions.
	(pass_optimize_bswap::execute): Ditto.
	(imm_store_chain_info::output_merged_store): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr108938-1.c: New test.
	* gcc.target/i386/pr108938-2.c: New test.
	* gcc.target/i386/pr108938-3.c: New test.
	* gcc.target/i386/pr108938-load-1.c: New test.
	* gcc.target/i386/pr108938-load-2.c: New test.
---
 gcc/gimple-ssa-store-merging.cc               | 130 ++++++++++++++----
 gcc/testsuite/gcc.target/i386/pr108938-1.c    |  79 +++++++++++
 gcc/testsuite/gcc.target/i386/pr108938-2.c    |  35 +++++
 gcc/testsuite/gcc.target/i386/pr108938-3.c    |  26 ++++
 .../gcc.target/i386/pr108938-load-1.c         |  69 ++++++++++
 .../gcc.target/i386/pr108938-load-2.c         |  30 ++++
 6 files changed, 342 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c
  

Comments

Richard Biener May 30, 2023, 9:36 a.m. UTC | #1
On Tue, May 9, 2023 at 9:06 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The patch doesn't handle:
>   1. cast64_to_32,
>   2. memory source with rsize < range.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

OK and sorry for the delay.

Richard.

> gcc/ChangeLog:
>
>         PR middle-end/108938
>         * gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New
>         function, cut from original find_bswap_or_nop function.
>         (find_bswap_or_nop): Add a new parameter, detect bswap +
>         rotate and save rotate result in the new parameter.
>         (bswap_replace): Add a new parameter to indicate rotate and
>         generate rotate stmt if needed.
>         (maybe_optimize_vector_constructor): Adjust for new rotate
>         parameter in the upper 2 functions.
>         (pass_optimize_bswap::execute): Ditto.
>         (imm_store_chain_info::output_merged_store): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr108938-1.c: New test.
>         * gcc.target/i386/pr108938-2.c: New test.
>         * gcc.target/i386/pr108938-3.c: New test.
>         * gcc.target/i386/pr108938-load-1.c: New test.
>         * gcc.target/i386/pr108938-load-2.c: New test.
> ---
>  gcc/gimple-ssa-store-merging.cc               | 130 ++++++++++++++----
>  gcc/testsuite/gcc.target/i386/pr108938-1.c    |  79 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr108938-2.c    |  35 +++++
>  gcc/testsuite/gcc.target/i386/pr108938-3.c    |  26 ++++
>  .../gcc.target/i386/pr108938-load-1.c         |  69 ++++++++++
>  .../gcc.target/i386/pr108938-load-2.c         |  30 ++++
>  6 files changed, 342 insertions(+), 27 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c
>
> diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
> index df7afd2fd78..9cb574fa315 100644
> --- a/gcc/gimple-ssa-store-merging.cc
> +++ b/gcc/gimple-ssa-store-merging.cc
> @@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
>    n->range *= BITS_PER_UNIT;
>  }
>
> +/* Helper function for find_bswap_or_nop.
> +   Return true if N is a bswap or nop with MASK.  */
> +static bool
> +is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg,
> +                  uint64_t cmpnop, uint64_t* mask,
> +                  bool* bswap)
> +{
> +  *mask = ~(uint64_t) 0;
> +  if (n == cmpnop)
> +    *bswap = false;
> +  else if (n == cmpxchg)
> +    *bswap = true;
> +  else
> +    {
> +      int set = 0;
> +      for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
> +       if ((n & msk) == 0)
> +         *mask &= ~msk;
> +       else if ((n & msk) == (cmpxchg & msk))
> +         set++;
> +       else
> +         return false;
> +
> +      if (set < 2)
> +       return false;
> +      *bswap = true;
> +    }
> +  return true;
> +}
> +
> +
>  /* Check if STMT completes a bswap implementation or a read in a given
>     endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
>     accordingly.  It also sets N to represent the kind of operations
> @@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
>
>  gimple *
>  find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
> -                  bool *cast64_to_32, uint64_t *mask)
> +                  bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate)
>  {
>    tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
>    if (!tree_fits_uhwi_p (type_size))
> @@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
>      }
>
>    uint64_t cmpxchg, cmpnop;
> +  uint64_t orig_range = n->range * BITS_PER_UNIT;
>    find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32);
>
>    /* A complete byte swap should make the symbolic number to start with
>       the largest digit in the highest order byte. Unchanged symbolic
>       number indicates a read with same endianness as target architecture.  */
> -  *mask = ~(uint64_t) 0;
> -  if (n->n == cmpnop)
> -    *bswap = false;
> -  else if (n->n == cmpxchg)
> -    *bswap = true;
> -  else
> +  *l_rotate = 0;
> +  uint64_t tmp_n = n->n;
> +  if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
>      {
> -      int set = 0;
> -      for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
> -       if ((n->n & msk) == 0)
> -         *mask &= ~msk;
> -       else if ((n->n & msk) == (cmpxchg & msk))
> -         set++;
> -       else
> -         return NULL;
> -      if (set < 2)
> +      /* Try bswap + lrotate.  */
> +      /* TODO, handle cast64_to_32 and big/little_endian memory
> +        source when rsize < range.  */
> +      if (n->range == orig_range
> +         && ((orig_range == 32
> +              && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
> +             || (orig_range == 64
> +                 && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing))
> +         && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT)
> +       {
> +         uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER;
> +         uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER;
> +         /* i.e. handle 0x203040506070800 when lower byte is zero.  */
> +         if (!count)
> +           {
> +             for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++)
> +               {
> +                 count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK;
> +                 if (count)
> +                   {
> +                     /* Count should be meaningful not 0xff.  */
> +                     if (count <= range / BITS_PER_MARKER)
> +                       {
> +                         count = (count + i) * BITS_PER_MARKER % range;
> +                         break;
> +                       }
> +                     else
> +                       return NULL;
> +                   }
> +               }
> +           }
> +         tmp_n = tmp_n >> count | tmp_n << (range - count);
> +         if (orig_range == 32)
> +           tmp_n &= (1ULL << 32) - 1;
> +         if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
> +           return NULL;
> +         *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT;
> +         gcc_assert (*bswap);
> +       }
> +      else
>         return NULL;
> -      *bswap = true;
>      }
>
>    /* Useless bit manipulation performed by code.  */
> @@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val,
>  tree
>  bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
>                tree bswap_type, tree load_type, struct symbolic_number *n,
> -              bool bswap, uint64_t mask)
> +              bool bswap, uint64_t mask, uint64_t l_rotate)
>  {
>    tree src, tmp, tgt = NULL_TREE;
> -  gimple *bswap_stmt, *mask_stmt = NULL;
> +  gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL;
>    tree_code conv_code = NOP_EXPR;
>
>    gimple *cur_stmt = gsi_stmt (gsi);
> @@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
>        tmp = tgt;
>      }
>
> +  if (l_rotate)
> +    {
> +      tree m = build_int_cst (bswap_type, l_rotate);
> +      tmp = make_temp_ssa_name (bswap_type, NULL,
> +                               mask_stmt ? "bswapmaskdst" : "bswapdst");
> +      gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
> +      rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m);
> +      tmp = tgt;
> +    }
> +
>    /* Convert the result if necessary.  */
>    if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
>      {
> @@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
>        gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT);
>      }
>
> -  gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
> +  gimple_set_lhs (rotl_stmt ? rotl_stmt
> +                 : mask_stmt ? mask_stmt : bswap_stmt, tmp);
>
>    if (dump_file)
>      {
> @@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
>
>    if (cur_stmt)
>      {
> +      if (rotl_stmt)
> +       gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
>        if (mask_stmt)
>         gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT);
>        gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
> @@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
>        gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
>        if (mask_stmt)
>         gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT);
> +      if (rotl_stmt)
> +       gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
>      }
>    return tgt;
>  }
> @@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
>      }
>
>    bool cast64_to_32;
> -  uint64_t mask;
> +  uint64_t mask, l_rotate;
>    gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
> -                                       &cast64_to_32, &mask);
> +                                       &cast64_to_32, &mask, &l_rotate);
>    if (!ins_stmt
>        || n.range != (unsigned HOST_WIDE_INT) sz
>        || cast64_to_32
> @@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
>    memset (&nop_stats, 0, sizeof (nop_stats));
>    memset (&bswap_stats, 0, sizeof (bswap_stats));
>    return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
> -                       bswap_type, load_type, &n, bswap, mask) != NULL_TREE;
> +                       bswap_type, load_type, &n, bswap, mask,
> +                       l_rotate) != NULL_TREE;
>  }
>
>  /* Find manual byte swap implementations as well as load in a given
> @@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun)
>           enum tree_code code;
>           struct symbolic_number n;
>           bool bswap, cast64_to_32;
> -         uint64_t mask;
> +         uint64_t mask, l_rotate;
>
>           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
>              might be moved to a different basic block by bswap_replace and gsi
> @@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun)
>             }
>
>           ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
> -                                       &cast64_to_32, &mask);
> +                                       &cast64_to_32, &mask, &l_rotate);
>
>           if (!ins_stmt)
>             continue;
> @@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun)
>             continue;
>
>           if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
> -                            bswap_type, load_type, &n, bswap, mask))
> +                            bswap_type, load_type, &n, bswap, mask,
> +                            l_rotate))
>             changed = true;
>         }
>      }
> @@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group)
>         }
>        bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
>                                  bswap_type, load_type, n, bswap,
> -                                ~(uint64_t) 0);
> +                                ~(uint64_t) 0, 0);
>        gcc_assert (bswap_res);
>      }
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c
> new file mode 100644
> index 00000000000..67f245d1441
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c
> @@ -0,0 +1,79 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-movbe" } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */
> +
> +#include<stdint.h>
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64 (uint64_t x)
> +{
> +  return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
> +         (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
> +         (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
> +         (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) <<  16) |
> +         (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >>  0) |
> +         (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
> +         (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
> +         (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_1 (uint64_t x)
> +{
> +  return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) |
> +         (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
> +         (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
> +         (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) <<  16) |
> +         (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >>  0) |
> +         (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
> +         (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
> +         (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_2 (uint64_t x)
> +{
> +  return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
> +         (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
> +         (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
> +         (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) <<  16) |
> +         (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >>  0) |
> +         (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
> +         (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
> +         (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48));
> +}
> +
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32 (uint32_t x)
> +{
> +  return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
> +         (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
> +         (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
> +         (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
> +}
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32_mask_1 (uint32_t x)
> +{
> +  return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) |
> +         (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
> +         (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
> +         (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
> +}
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32_mask_2 (uint32_t x)
> +{
> +  return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
> +         (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) |
> +         (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
> +         (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c
> new file mode 100644
> index 00000000000..47a2c89f1c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#include "pr108938-1.c"
> +
> +int main ()
> +{
> +  uint64_t a = 0x0807060504030201ULL;
> +  uint64_t res = swap_rotate_64 (a);
> +  if (res != 0x0203040506070801ULL)
> +    __builtin_abort ();
> +
> +  res = swap_rotate_64_mask_1 (a);
> +  if (res != 0x0203040506070800ULL)
> +    __builtin_abort ();
> +
> +  res = swap_rotate_64_mask_2 (a);
> +  if (res != 0x0203040506070001ULL)
> +    __builtin_abort ();
> +
> +  uint32_t b = 0x04030201;
> +  uint32_t res2 = swap_rotate_32 (b);
> +  if (res2 != 0x03040102)
> +    __builtin_abort ();
> +
> +  res2 = swap_rotate_32_mask_1 (b);
> +  if (res2 != 0x03040002)
> +    __builtin_abort ();
> +
> +  res2 = swap_rotate_32_mask_2 (b);
> +  if (res2 != 0x03040100)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c
> new file mode 100644
> index 00000000000..32ac544c7ed
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */
> +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */
> +
> +void
> +foo1 (char* a, unsigned int* __restrict b)
> +{
> +  a[0] = b[0] >> 24;
> +  a[1] = b[0] >> 16;
> +  a[2] = b[0] >> 8;
> +  a[3] = b[0];
> +  a[4] = b[1] >> 24;
> +  a[5] = b[1] >> 16;
> +  a[6] = b[1] >> 8;
> +  a[7] = b[1];
> +}
> +
> +void
> +foo2 (char* a, short* __restrict b)
> +{
> +  a[0] = b[0] >> 8;
> +  a[1] = b[0];
> +  a[2] = b[1] >> 8;
> +  a[3] = b[1];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
> new file mode 100644
> index 00000000000..50d3a505c81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
> @@ -0,0 +1,69 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-movbe" } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */
> +
> +#include<stdint.h>
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64 (unsigned char* x)
> +{
> +  return ((uint64_t)(x[0]) << 0 |
> +         (uint64_t)(x[1]) << 56 |
> +         (uint64_t)(x[2]) << 48 |
> +         (uint64_t)(x[3]) << 40 |
> +         (uint64_t)(x[4]) << 32 |
> +         (uint64_t)(x[5]) << 24 |
> +         (uint64_t)(x[6]) << 16 |
> +         (uint64_t)(x[7]) << 8);
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_1 (unsigned char* x)
> +{
> +  return ((uint64_t)(x[0]) << 24 |
> +         (uint64_t)(x[1]) << 16 |
> +         (uint64_t)(x[2]) << 8 |
> +         (uint64_t)(0) << 0 |
> +         (uint64_t)(x[4]) << 56 |
> +         (uint64_t)(x[5]) << 48 |
> +         (uint64_t)(x[6]) << 40 |
> +         (uint64_t)(x[7]) << 32);
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_2 (unsigned char* x)
> +{
> +  return ((uint64_t)(x[0]) << 0 |
> +         (uint64_t)(x[1]) << 56 |
> +         (uint64_t)(x[2]) << 48 |
> +         (uint64_t)(0) << 40 |
> +         (uint64_t)(x[4]) << 32 |
> +         (uint64_t)(x[5]) << 24 |
> +         (uint64_t)(x[6]) << 16 |
> +         (uint64_t)(x[7]) << 8);
> +}
> +
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32 (unsigned char* x)
> +{
> +  return ((uint64_t)(x[0]) << 8 |
> +         (uint64_t)(x[1]) << 0 |
> +         (uint64_t)(x[2]) << 24 |
> +         (uint64_t)(x[3]) << 16);
> +}
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32_mask_1 (unsigned char* x)
> +{
> +  return ((uint64_t)(x[0]) << 8 |
> +         (uint64_t)(0) << 0 |
> +         (uint64_t)(x[2]) << 24 |
> +         (uint64_t)(x[3]) << 16);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
> new file mode 100644
> index 00000000000..51a8102f995
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#include "pr108938-load-1.c"
> +
> +int main ()
> +{
> +  unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
> +  uint64_t res = swap_rotate_64 (a);
> +  if (res != 0x0203040506070801ULL)
> +    __builtin_abort ();
> +
> +  res = swap_rotate_64_mask_1 (a);
> +  if (res != 0x0506070801020300ULL)
> +    __builtin_abort ();
> +
> +  res = swap_rotate_64_mask_2 (a);
> +  if (res != 0x0203000506070801ULL)
> +    __builtin_abort ();
> +
> +  uint32_t res2 = swap_rotate_32 (a);
> +  if (res2 != 0x03040102)
> +    __builtin_abort ();
> +
> +  res2 = swap_rotate_32_mask_1 (a);
> +  if (res2 != 0x03040100)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> --
> 2.39.1.388.g2fc9e9ca3c
>
  

Patch

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index df7afd2fd78..9cb574fa315 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -893,6 +893,37 @@  find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
   n->range *= BITS_PER_UNIT;
 }
 
+/* Helper function for find_bswap_or_nop.
+   Return true if N is a bswap or nop with MASK.  */
+static bool
+is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg,
+		   uint64_t cmpnop, uint64_t* mask,
+		   bool* bswap)
+{
+  *mask = ~(uint64_t) 0;
+  if (n == cmpnop)
+    *bswap = false;
+  else if (n == cmpxchg)
+    *bswap = true;
+  else
+    {
+      int set = 0;
+      for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
+	if ((n & msk) == 0)
+	  *mask &= ~msk;
+	else if ((n & msk) == (cmpxchg & msk))
+	  set++;
+	else
+	  return false;
+
+      if (set < 2)
+	return false;
+      *bswap = true;
+    }
+  return true;
+}
+
+
 /* Check if STMT completes a bswap implementation or a read in a given
    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
    accordingly.  It also sets N to represent the kind of operations
@@ -903,7 +934,7 @@  find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
 
 gimple *
 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
-		   bool *cast64_to_32, uint64_t *mask)
+		   bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate)
 {
   tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
   if (!tree_fits_uhwi_p (type_size))
@@ -984,29 +1015,57 @@  find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
     }
 
   uint64_t cmpxchg, cmpnop;
+  uint64_t orig_range = n->range * BITS_PER_UNIT;
   find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32);
 
   /* A complete byte swap should make the symbolic number to start with
      the largest digit in the highest order byte. Unchanged symbolic
      number indicates a read with same endianness as target architecture.  */
-  *mask = ~(uint64_t) 0;
-  if (n->n == cmpnop)
-    *bswap = false;
-  else if (n->n == cmpxchg)
-    *bswap = true;
-  else
+  *l_rotate = 0;
+  uint64_t tmp_n = n->n;
+  if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
     {
-      int set = 0;
-      for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
-	if ((n->n & msk) == 0)
-	  *mask &= ~msk;
-	else if ((n->n & msk) == (cmpxchg & msk))
-	  set++;
-	else
-	  return NULL;
-      if (set < 2)
+      /* Try bswap + lrotate.  */
+      /* TODO, handle cast64_to_32 and big/little_endian memory
+	 source when rsize < range.  */
+      if (n->range == orig_range
+	  && ((orig_range == 32
+	       && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
+	      || (orig_range == 64
+		  && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing))
+	  && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT)
+	{
+	  uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER;
+	  uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER;
+	  /* i.e. handle 0x203040506070800 when lower byte is zero.  */
+	  if (!count)
+	    {
+	      for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++)
+		{
+		  count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK;
+		  if (count)
+		    {
+		      /* Count should be meaningful not 0xff.  */
+		      if (count <= range / BITS_PER_MARKER)
+			{
+			  count = (count + i) * BITS_PER_MARKER % range;
+			  break;
+			}
+		      else
+			return NULL;
+		    }
+		}
+	    }
+	  tmp_n = tmp_n >> count | tmp_n << (range - count);
+	  if (orig_range == 32)
+	    tmp_n &= (1ULL << 32) - 1;
+	  if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
+	    return NULL;
+	  *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT;
+	  gcc_assert (*bswap);
+	}
+      else
 	return NULL;
-      *bswap = true;
     }
 
   /* Useless bit manipulation performed by code.  */
@@ -1099,10 +1158,10 @@  bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val,
 tree
 bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
 	       tree bswap_type, tree load_type, struct symbolic_number *n,
-	       bool bswap, uint64_t mask)
+	       bool bswap, uint64_t mask, uint64_t l_rotate)
 {
   tree src, tmp, tgt = NULL_TREE;
-  gimple *bswap_stmt, *mask_stmt = NULL;
+  gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL;
   tree_code conv_code = NOP_EXPR;
 
   gimple *cur_stmt = gsi_stmt (gsi);
@@ -1332,6 +1391,16 @@  bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
       tmp = tgt;
     }
 
+  if (l_rotate)
+    {
+      tree m = build_int_cst (bswap_type, l_rotate);
+      tmp = make_temp_ssa_name (bswap_type, NULL,
+				mask_stmt ? "bswapmaskdst" : "bswapdst");
+      gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
+      rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m);
+      tmp = tgt;
+    }
+
   /* Convert the result if necessary.  */
   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
     {
@@ -1344,7 +1413,8 @@  bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
       gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT);
     }
 
-  gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
+  gimple_set_lhs (rotl_stmt ? rotl_stmt
+		  : mask_stmt ? mask_stmt : bswap_stmt, tmp);
 
   if (dump_file)
     {
@@ -1361,6 +1431,8 @@  bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
 
   if (cur_stmt)
     {
+      if (rotl_stmt)
+	gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
       if (mask_stmt)
 	gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT);
       gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
@@ -1371,6 +1443,8 @@  bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
       gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
       if (mask_stmt)
 	gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT);
+      if (rotl_stmt)
+	gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
     }
   return tgt;
 }
@@ -1432,9 +1506,9 @@  maybe_optimize_vector_constructor (gimple *cur_stmt)
     }
 
   bool cast64_to_32;
-  uint64_t mask;
+  uint64_t mask, l_rotate;
   gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
-					&cast64_to_32, &mask);
+					&cast64_to_32, &mask, &l_rotate);
   if (!ins_stmt
       || n.range != (unsigned HOST_WIDE_INT) sz
       || cast64_to_32
@@ -1447,7 +1521,8 @@  maybe_optimize_vector_constructor (gimple *cur_stmt)
   memset (&nop_stats, 0, sizeof (nop_stats));
   memset (&bswap_stats, 0, sizeof (bswap_stats));
   return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
-			bswap_type, load_type, &n, bswap, mask) != NULL_TREE;
+			bswap_type, load_type, &n, bswap, mask,
+			l_rotate) != NULL_TREE;
 }
 
 /* Find manual byte swap implementations as well as load in a given
@@ -1502,7 +1577,7 @@  pass_optimize_bswap::execute (function *fun)
 	  enum tree_code code;
 	  struct symbolic_number n;
 	  bool bswap, cast64_to_32;
-	  uint64_t mask;
+	  uint64_t mask, l_rotate;
 
 	  /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
 	     might be moved to a different basic block by bswap_replace and gsi
@@ -1542,7 +1617,7 @@  pass_optimize_bswap::execute (function *fun)
 	    }
 
 	  ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
-					&cast64_to_32, &mask);
+					&cast64_to_32, &mask, &l_rotate);
 
 	  if (!ins_stmt)
 	    continue;
@@ -1579,7 +1654,8 @@  pass_optimize_bswap::execute (function *fun)
 	    continue;
 
 	  if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
-			     bswap_type, load_type, &n, bswap, mask))
+			     bswap_type, load_type, &n, bswap, mask,
+			     l_rotate))
 	    changed = true;
 	}
     }
@@ -4271,7 +4347,7 @@  imm_store_chain_info::output_merged_store (merged_store_group *group)
 	}
       bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
 				 bswap_type, load_type, n, bswap,
-				 ~(uint64_t) 0);
+				 ~(uint64_t) 0, 0);
       gcc_assert (bswap_res);
     }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c
new file mode 100644
index 00000000000..67f245d1441
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c
@@ -0,0 +1,79 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */
+
+#include<stdint.h>
+/* Byte 0 of X stays put; byte i (1 <= i <= 7) moves to byte 8 - i.  */
+uint64_t
+__attribute__((noipa))
+swap_rotate_64 (uint64_t x)
+{
+  return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
+	  (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+	  (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+	  (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) <<  16) |
+	  (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >>  0) |
+	  (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+	  (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+	  (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+/* As swap_rotate_64, but the byte-0 term is taken from constant 0, not X.  */
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_1 (uint64_t x)
+{
+  return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) |
+	  (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+	  (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+	  (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) <<  16) |
+	  (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >>  0) |
+	  (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+	  (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+	  (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+/* As swap_rotate_64, but the byte-7 term (result byte 1) is constant 0.  */
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_2 (uint64_t x)
+{
+  return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
+	  (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+	  (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+	  (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) <<  16) |
+	  (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >>  0) |
+	  (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+	  (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+	  (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+/* Swap the two bytes within each 16-bit half of X: 0 <-> 1 and 2 <-> 3.  */
+uint32_t
+__attribute__((noipa))
+swap_rotate_32 (uint32_t x)
+{
+  return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
+	  (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+	  (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+	  (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
+/* As swap_rotate_32, but the byte-0 term (result byte 1) is constant 0.  */
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_1 (uint32_t x)
+{
+  return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) |
+	  (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+	  (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+	  (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
+/* As swap_rotate_32, but the byte-1 term (result byte 0) is constant 0.  */
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_2 (uint32_t x)
+{
+  return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
+	  (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+	  (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+	  (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c
new file mode 100644
index 00000000000..47a2c89f1c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c
@@ -0,0 +1,35 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr108938-1.c"
+/* Run each pr108938-1.c permutation on a fixed input; abort on mismatch.  */
+int main ()
+{
+  uint64_t a = 0x0807060504030201ULL;
+  uint64_t res = swap_rotate_64 (a);
+  if (res != 0x0203040506070801ULL)
+    __builtin_abort ();
+
+  res = swap_rotate_64_mask_1 (a);
+  if (res != 0x0203040506070800ULL)
+    __builtin_abort ();
+
+  res = swap_rotate_64_mask_2 (a);
+  if (res != 0x0203040506070001ULL)
+    __builtin_abort ();
+
+  uint32_t b = 0x04030201;
+  uint32_t res2 = swap_rotate_32 (b);
+  if (res2 != 0x03040102)
+    __builtin_abort ();
+
+  res2 = swap_rotate_32_mask_1 (b);
+  if (res2 != 0x03040002)
+    __builtin_abort ();
+
+  res2 = swap_rotate_32_mask_2 (b);
+  if (res2 != 0x03040100)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c
new file mode 100644
index 00000000000..32ac544c7ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */
+/* Store B[0] and B[1] into A as four bytes each, most significant first.  */
+void
+foo1 (char* a, unsigned int* __restrict b)
+{
+  a[0] = b[0] >> 24;
+  a[1] = b[0] >> 16;
+  a[2] = b[0] >> 8;
+  a[3] = b[0];
+  a[4] = b[1] >> 24;
+  a[5] = b[1] >> 16;
+  a[6] = b[1] >> 8;
+  a[7] = b[1];
+}
+/* Store B[0] and B[1] into A as two bytes each, most significant first.  */
+void
+foo2 (char* a, short* __restrict b)
+{
+  a[0] = b[0] >> 8;
+  a[1] = b[0];
+  a[2] = b[1] >> 8;
+  a[3] = b[1];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
new file mode 100644
index 00000000000..50d3a505c81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
@@ -0,0 +1,69 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */
+
+#include<stdint.h>
+/* X[0] fills result byte 0; X[i] (1 <= i <= 7) fills result byte 8 - i.  */
+uint64_t
+__attribute__((noipa))
+swap_rotate_64 (unsigned char* x)
+{
+  return ((uint64_t)(x[0]) << 0 |
+	  (uint64_t)(x[1]) << 56 |
+	  (uint64_t)(x[2]) << 48 |
+	  (uint64_t)(x[3]) << 40 |
+	  (uint64_t)(x[4]) << 32 |
+	  (uint64_t)(x[5]) << 24 |
+	  (uint64_t)(x[6]) << 16 |
+	  (uint64_t)(x[7]) << 8);
+}
+/* Byte-reverse each 4-byte half of X; the X[3] slot (result byte 0) is 0.  */
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_1 (unsigned char* x)
+{
+  return ((uint64_t)(x[0]) << 24 |
+	  (uint64_t)(x[1]) << 16 |
+	  (uint64_t)(x[2]) << 8 |
+	  (uint64_t)(0) << 0 |
+	  (uint64_t)(x[4]) << 56 |
+	  (uint64_t)(x[5]) << 48 |
+	  (uint64_t)(x[6]) << 40 |
+	  (uint64_t)(x[7]) << 32);
+}
+/* As swap_rotate_64, but the X[3] slot (result byte 5) is constant 0.  */
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_2 (unsigned char* x)
+{
+  return ((uint64_t)(x[0]) << 0 |
+	  (uint64_t)(x[1]) << 56 |
+	  (uint64_t)(x[2]) << 48 |
+	  (uint64_t)(0) << 40 |
+	  (uint64_t)(x[4]) << 32 |
+	  (uint64_t)(x[5]) << 24 |
+	  (uint64_t)(x[6]) << 16 |
+	  (uint64_t)(x[7]) << 8);
+}
+
+/* Result bytes 0..3 are X[1], X[0], X[3], X[2] respectively.  */
+uint32_t
+__attribute__((noipa))
+swap_rotate_32 (unsigned char* x)
+{
+  return ((uint64_t)(x[0]) << 8 |
+	  (uint64_t)(x[1]) << 0 |
+	  (uint64_t)(x[2]) << 24 |
+	  (uint64_t)(x[3]) << 16);
+}
+/* As swap_rotate_32, but the X[1] slot (result byte 0) is constant 0.  */
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_1 (unsigned char* x)
+{
+  return ((uint64_t)(x[0]) << 8 |
+	  (uint64_t)(0) << 0 |
+	  (uint64_t)(x[2]) << 24 |
+	  (uint64_t)(x[3]) << 16);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
new file mode 100644
index 00000000000..51a8102f995
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr108938-load-1.c"
+/* Run each pr108938-load-1.c function on bytes 1..8; abort on mismatch.  */
+int main ()
+{
+  unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+  uint64_t res = swap_rotate_64 (a);
+  if (res != 0x0203040506070801ULL)
+    __builtin_abort ();
+
+  res = swap_rotate_64_mask_1 (a);
+  if (res != 0x0506070801020300ULL)
+    __builtin_abort ();
+
+  res = swap_rotate_64_mask_2 (a);
+  if (res != 0x0203000506070801ULL)
+    __builtin_abort ();
+
+  uint32_t res2 = swap_rotate_32 (a);
+  if (res2 != 0x03040102)
+    __builtin_abort ();
+
+  res2 = swap_rotate_32_mask_1 (a);
+  if (res2 != 0x03040100)
+    __builtin_abort ();
+
+  return 0;
+}