Detect bswap + rotate for byte permutation in pass_bswap.
Checks
Commit Message
The patch doesn't handle:
1. cast64_to_32,
2. memory source with rsize < range.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR middle-end/108938
* gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New
function, cut from original find_bswap_or_nop function.
(find_bswap_or_nop): Add a new parameter, detect bswap +
rotate and save rotate result in the new parameter.
(bswap_replace): Add a new parameter to indicate rotate and
generate rotate stmt if needed.
(maybe_optimize_vector_constructor): Adjust for new rotate
parameter in the upper 2 functions.
(pass_optimize_bswap::execute): Ditto.
(imm_store_chain_info::output_merged_store): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr108938-1.c: New test.
* gcc.target/i386/pr108938-2.c: New test.
* gcc.target/i386/pr108938-3.c: New test.
* gcc.target/i386/pr108938-load-1.c: New test.
* gcc.target/i386/pr108938-load-2.c: New test.
---
gcc/gimple-ssa-store-merging.cc | 130 ++++++++++++++----
gcc/testsuite/gcc.target/i386/pr108938-1.c | 79 +++++++++++
gcc/testsuite/gcc.target/i386/pr108938-2.c | 35 +++++
gcc/testsuite/gcc.target/i386/pr108938-3.c | 26 ++++
.../gcc.target/i386/pr108938-load-1.c | 69 ++++++++++
.../gcc.target/i386/pr108938-load-2.c | 30 ++++
6 files changed, 342 insertions(+), 27 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c
Comments
On Tue, May 9, 2023 at 9:06 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The patch doesn't handle:
> 1. cast64_to_32,
> 2. memory source with rsize < range.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
OK and sorry for the delay.
Richard.
> gcc/ChangeLog:
>
> PR middle-end/108938
> * gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New
> function, cut from original find_bswap_or_nop function.
> (find_bswap_or_nop): Add a new parameter, detect bswap +
> rotate and save rotate result in the new parameter.
> (bswap_replace): Add a new parameter to indicate rotate and
> generate rotate stmt if needed.
> (maybe_optimize_vector_constructor): Adjust for new rotate
> parameter in the upper 2 functions.
> (pass_optimize_bswap::execute): Ditto.
> (imm_store_chain_info::output_merged_store): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr108938-1.c: New test.
> * gcc.target/i386/pr108938-2.c: New test.
> * gcc.target/i386/pr108938-3.c: New test.
> * gcc.target/i386/pr108938-load-1.c: New test.
> * gcc.target/i386/pr108938-load-2.c: New test.
> ---
> gcc/gimple-ssa-store-merging.cc | 130 ++++++++++++++----
> gcc/testsuite/gcc.target/i386/pr108938-1.c | 79 +++++++++++
> gcc/testsuite/gcc.target/i386/pr108938-2.c | 35 +++++
> gcc/testsuite/gcc.target/i386/pr108938-3.c | 26 ++++
> .../gcc.target/i386/pr108938-load-1.c | 69 ++++++++++
> .../gcc.target/i386/pr108938-load-2.c | 30 ++++
> 6 files changed, 342 insertions(+), 27 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c
>
> diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
> index df7afd2fd78..9cb574fa315 100644
> --- a/gcc/gimple-ssa-store-merging.cc
> +++ b/gcc/gimple-ssa-store-merging.cc
> @@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
> n->range *= BITS_PER_UNIT;
> }
>
> +/* Helper function for find_bswap_or_nop,
> + Return true if N is a swap or nop with MASK. */
> +static bool
> +is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg,
> + uint64_t cmpnop, uint64_t* mask,
> + bool* bswap)
> +{
> + *mask = ~(uint64_t) 0;
> + if (n == cmpnop)
> + *bswap = false;
> + else if (n == cmpxchg)
> + *bswap = true;
> + else
> + {
> + int set = 0;
> + for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
> + if ((n & msk) == 0)
> + *mask &= ~msk;
> + else if ((n & msk) == (cmpxchg & msk))
> + set++;
> + else
> + return false;
> +
> + if (set < 2)
> + return false;
> + *bswap = true;
> + }
> + return true;
> +}
> +
> +
> /* Check if STMT completes a bswap implementation or a read in a given
> endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
> accordingly. It also sets N to represent the kind of operations
> @@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
>
> gimple *
> find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
> - bool *cast64_to_32, uint64_t *mask)
> + bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate)
> {
> tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
> if (!tree_fits_uhwi_p (type_size))
> @@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
> }
>
> uint64_t cmpxchg, cmpnop;
> + uint64_t orig_range = n->range * BITS_PER_UNIT;
> find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32);
>
> /* A complete byte swap should make the symbolic number to start with
> the largest digit in the highest order byte. Unchanged symbolic
> number indicates a read with same endianness as target architecture. */
> - *mask = ~(uint64_t) 0;
> - if (n->n == cmpnop)
> - *bswap = false;
> - else if (n->n == cmpxchg)
> - *bswap = true;
> - else
> + *l_rotate = 0;
> + uint64_t tmp_n = n->n;
> + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
> {
> - int set = 0;
> - for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
> - if ((n->n & msk) == 0)
> - *mask &= ~msk;
> - else if ((n->n & msk) == (cmpxchg & msk))
> - set++;
> - else
> - return NULL;
> - if (set < 2)
> + /* Try bswap + lrotate. */
> + /* TODO, handle cast64_to_32 and big/litte_endian memory
> + source when rsize < range. */
> + if (n->range == orig_range
> + && ((orig_range == 32
> + && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
> + || (orig_range == 64
> + && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing))
> + && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT)
> + {
> + uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER;
> + uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER;
> + /* .i.e. hanlde 0x203040506070800 when lower byte is zero. */
> + if (!count)
> + {
> + for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++)
> + {
> + count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK;
> + if (count)
> + {
> + /* Count should be meaningful not 0xff. */
> + if (count <= range / BITS_PER_MARKER)
> + {
> + count = (count + i) * BITS_PER_MARKER % range;
> + break;
> + }
> + else
> + return NULL;
> + }
> + }
> + }
> + tmp_n = tmp_n >> count | tmp_n << (range - count);
> + if (orig_range == 32)
> + tmp_n &= (1ULL << 32) - 1;
> + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
> + return NULL;
> + *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT;
> + gcc_assert (*bswap);
> + }
> + else
> return NULL;
> - *bswap = true;
> }
>
> /* Useless bit manipulation performed by code. */
> @@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val,
> tree
> bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
> tree bswap_type, tree load_type, struct symbolic_number *n,
> - bool bswap, uint64_t mask)
> + bool bswap, uint64_t mask, uint64_t l_rotate)
> {
> tree src, tmp, tgt = NULL_TREE;
> - gimple *bswap_stmt, *mask_stmt = NULL;
> + gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL;
> tree_code conv_code = NOP_EXPR;
>
> gimple *cur_stmt = gsi_stmt (gsi);
> @@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
> tmp = tgt;
> }
>
> + if (l_rotate)
> + {
> + tree m = build_int_cst (bswap_type, l_rotate);
> + tmp = make_temp_ssa_name (bswap_type, NULL,
> + mask_stmt ? "bswapmaskdst" : "bswapdst");
> + gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
> + rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m);
> + tmp = tgt;
> + }
> +
> /* Convert the result if necessary. */
> if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
> {
> @@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
> gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT);
> }
>
> - gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
> + gimple_set_lhs (rotl_stmt ? rotl_stmt
> + : mask_stmt ? mask_stmt : bswap_stmt, tmp);
>
> if (dump_file)
> {
> @@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
>
> if (cur_stmt)
> {
> + if (rotl_stmt)
> + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
> if (mask_stmt)
> gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT);
> gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
> @@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
> gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
> if (mask_stmt)
> gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT);
> + if (rotl_stmt)
> + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
> }
> return tgt;
> }
> @@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
> }
>
> bool cast64_to_32;
> - uint64_t mask;
> + uint64_t mask, l_rotate;
> gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
> - &cast64_to_32, &mask);
> + &cast64_to_32, &mask, &l_rotate);
> if (!ins_stmt
> || n.range != (unsigned HOST_WIDE_INT) sz
> || cast64_to_32
> @@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
> memset (&nop_stats, 0, sizeof (nop_stats));
> memset (&bswap_stats, 0, sizeof (bswap_stats));
> return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
> - bswap_type, load_type, &n, bswap, mask) != NULL_TREE;
> + bswap_type, load_type, &n, bswap, mask,
> + l_rotate) != NULL_TREE;
> }
>
> /* Find manual byte swap implementations as well as load in a given
> @@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun)
> enum tree_code code;
> struct symbolic_number n;
> bool bswap, cast64_to_32;
> - uint64_t mask;
> + uint64_t mask, l_rotate;
>
> /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
> might be moved to a different basic block by bswap_replace and gsi
> @@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun)
> }
>
> ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
> - &cast64_to_32, &mask);
> + &cast64_to_32, &mask, &l_rotate);
>
> if (!ins_stmt)
> continue;
> @@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun)
> continue;
>
> if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
> - bswap_type, load_type, &n, bswap, mask))
> + bswap_type, load_type, &n, bswap, mask,
> + l_rotate))
> changed = true;
> }
> }
> @@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group)
> }
> bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
> bswap_type, load_type, n, bswap,
> - ~(uint64_t) 0);
> + ~(uint64_t) 0, 0);
> gcc_assert (bswap_res);
> }
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c
> new file mode 100644
> index 00000000000..67f245d1441
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c
> @@ -0,0 +1,79 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-movbe" } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */
> +
> +#include<stdint.h>
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64 (uint64_t x)
> +{
> + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
> + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
> + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
> + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
> + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
> + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
> + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
> + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_1 (uint64_t x)
> +{
> + return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) |
> + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
> + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
> + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
> + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
> + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
> + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
> + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_2 (uint64_t x)
> +{
> + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
> + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
> + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
> + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
> + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
> + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
> + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
> + (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48));
> +}
> +
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32 (uint32_t x)
> +{
> + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
> + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
> + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
> + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
> +}
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32_mask_1 (uint32_t x)
> +{
> + return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) |
> + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
> + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
> + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
> +}
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32_mask_2 (uint32_t x)
> +{
> + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
> + (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) |
> + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
> + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c
> new file mode 100644
> index 00000000000..47a2c89f1c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#include "pr108938-1.c"
> +
> +int main ()
> +{
> + uint64_t a = 0x0807060504030201ULL;
> + uint64_t res = swap_rotate_64 (a);
> + if (res != 0x0203040506070801ULL)
> + __builtin_abort ();
> +
> + res = swap_rotate_64_mask_1 (a);
> + if (res != 0x0203040506070800ULL)
> + __builtin_abort ();
> +
> + res = swap_rotate_64_mask_2 (a);
> + if (res != 0x0203040506070001ULL)
> + __builtin_abort ();
> +
> + uint32_t b = 0x04030201;
> + uint32_t res2 = swap_rotate_32 (b);
> + if (res2 != 0x03040102)
> + __builtin_abort ();
> +
> + res2 = swap_rotate_32_mask_1 (b);
> + if (res2 != 0x03040002)
> + __builtin_abort ();
> +
> + res2 = swap_rotate_32_mask_2 (b);
> + if (res2 != 0x03040100)
> + __builtin_abort ();
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c
> new file mode 100644
> index 00000000000..32ac544c7ed
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */
> +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */
> +
> +void
> +foo1 (char* a, unsigned int* __restrict b)
> +{
> + a[0] = b[0] >> 24;
> + a[1] = b[0] >> 16;
> + a[2] = b[0] >> 8;
> + a[3] = b[0];
> + a[4] = b[1] >> 24;
> + a[5] = b[1] >> 16;
> + a[6] = b[1] >> 8;
> + a[7] = b[1];
> +}
> +
> +void
> +foo2 (char* a, short* __restrict b)
> +{
> + a[0] = b[0] >> 8;
> + a[1] = b[0];
> + a[2] = b[1] >> 8;
> + a[3] = b[1];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
> new file mode 100644
> index 00000000000..50d3a505c81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
> @@ -0,0 +1,69 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-movbe" } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */
> +
> +#include<stdint.h>
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64 (unsigned char* x)
> +{
> + return ((uint64_t)(x[0]) << 0 |
> + (uint64_t)(x[1]) << 56 |
> + (uint64_t)(x[2]) << 48 |
> + (uint64_t)(x[3]) << 40 |
> + (uint64_t)(x[4]) << 32 |
> + (uint64_t)(x[5]) << 24 |
> + (uint64_t)(x[6]) << 16 |
> + (uint64_t)(x[7]) << 8);
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_1 (unsigned char* x)
> +{
> + return ((uint64_t)(x[0]) << 24 |
> + (uint64_t)(x[1]) << 16 |
> + (uint64_t)(x[2]) << 8 |
> + (uint64_t)(0) << 0 |
> + (uint64_t)(x[4]) << 56 |
> + (uint64_t)(x[5]) << 48 |
> + (uint64_t)(x[6]) << 40 |
> + (uint64_t)(x[7]) << 32);
> +}
> +
> +uint64_t
> +__attribute__((noipa))
> +swap_rotate_64_mask_2 (unsigned char* x)
> +{
> + return ((uint64_t)(x[0]) << 0 |
> + (uint64_t)(x[1]) << 56 |
> + (uint64_t)(x[2]) << 48 |
> + (uint64_t)(0) << 40 |
> + (uint64_t)(x[4]) << 32 |
> + (uint64_t)(x[5]) << 24 |
> + (uint64_t)(x[6]) << 16 |
> + (uint64_t)(x[7]) << 8);
> +}
> +
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32 (unsigned char* x)
> +{
> + return ((uint64_t)(x[0]) << 8 |
> + (uint64_t)(x[1]) << 0 |
> + (uint64_t)(x[2]) << 24 |
> + (uint64_t)(x[3]) << 16);
> +}
> +
> +uint32_t
> +__attribute__((noipa))
> +swap_rotate_32_mask_1 (unsigned char* x)
> +{
> + return ((uint64_t)(x[0]) << 8 |
> + (uint64_t)(0) << 0 |
> + (uint64_t)(x[2]) << 24 |
> + (uint64_t)(x[3]) << 16);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
> new file mode 100644
> index 00000000000..51a8102f995
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#include "pr108938-load-1.c"
> +
> +int main ()
> +{
> + unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
> + uint64_t res = swap_rotate_64 (a);
> + if (res != 0x0203040506070801ULL)
> + __builtin_abort ();
> +
> + res = swap_rotate_64_mask_1 (a);
> + if (res != 0x0506070801020300ULL)
> + __builtin_abort ();
> +
> + res = swap_rotate_64_mask_2 (a);
> + if (res != 0x0203000506070801ULL)
> + __builtin_abort ();
> +
> + uint32_t res2 = swap_rotate_32 (a);
> + if (res2 != 0x03040102)
> + __builtin_abort ();
> +
> + res2 = swap_rotate_32_mask_1 (a);
> + if (res2 != 0x03040100)
> + __builtin_abort ();
> +
> + return 0;
> +}
> --
> 2.39.1.388.g2fc9e9ca3c
>
@@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
n->range *= BITS_PER_UNIT;
}
+/* Helper function for find_bswap_or_nop,
+ Return true if N is a swap or nop with MASK. */
+static bool
+is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg,
+ uint64_t cmpnop, uint64_t* mask,
+ bool* bswap)
+{
+ *mask = ~(uint64_t) 0;
+ if (n == cmpnop)
+ *bswap = false;
+ else if (n == cmpxchg)
+ *bswap = true;
+ else
+ {
+ int set = 0;
+ for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
+ if ((n & msk) == 0)
+ *mask &= ~msk;
+ else if ((n & msk) == (cmpxchg & msk))
+ set++;
+ else
+ return false;
+
+ if (set < 2)
+ return false;
+ *bswap = true;
+ }
+ return true;
+}
+
+
/* Check if STMT completes a bswap implementation or a read in a given
endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
accordingly. It also sets N to represent the kind of operations
@@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
gimple *
find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
- bool *cast64_to_32, uint64_t *mask)
+ bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate)
{
tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
if (!tree_fits_uhwi_p (type_size))
@@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
}
uint64_t cmpxchg, cmpnop;
+ uint64_t orig_range = n->range * BITS_PER_UNIT;
find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32);
/* A complete byte swap should make the symbolic number to start with
the largest digit in the highest order byte. Unchanged symbolic
number indicates a read with same endianness as target architecture. */
- *mask = ~(uint64_t) 0;
- if (n->n == cmpnop)
- *bswap = false;
- else if (n->n == cmpxchg)
- *bswap = true;
- else
+ *l_rotate = 0;
+ uint64_t tmp_n = n->n;
+ if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
{
- int set = 0;
- for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
- if ((n->n & msk) == 0)
- *mask &= ~msk;
- else if ((n->n & msk) == (cmpxchg & msk))
- set++;
- else
- return NULL;
- if (set < 2)
+ /* Try bswap + lrotate. */
+ /* TODO, handle cast64_to_32 and big/litte_endian memory
+ source when rsize < range. */
+ if (n->range == orig_range
+ && ((orig_range == 32
+ && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
+ || (orig_range == 64
+ && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing))
+ && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT)
+ {
+ uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER;
+ uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER;
+ /* .i.e. hanlde 0x203040506070800 when lower byte is zero. */
+ if (!count)
+ {
+ for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++)
+ {
+ count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK;
+ if (count)
+ {
+ /* Count should be meaningful not 0xff. */
+ if (count <= range / BITS_PER_MARKER)
+ {
+ count = (count + i) * BITS_PER_MARKER % range;
+ break;
+ }
+ else
+ return NULL;
+ }
+ }
+ }
+ tmp_n = tmp_n >> count | tmp_n << (range - count);
+ if (orig_range == 32)
+ tmp_n &= (1ULL << 32) - 1;
+ if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
+ return NULL;
+ *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT;
+ gcc_assert (*bswap);
+ }
+ else
return NULL;
- *bswap = true;
}
/* Useless bit manipulation performed by code. */
@@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val,
tree
bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
tree bswap_type, tree load_type, struct symbolic_number *n,
- bool bswap, uint64_t mask)
+ bool bswap, uint64_t mask, uint64_t l_rotate)
{
tree src, tmp, tgt = NULL_TREE;
- gimple *bswap_stmt, *mask_stmt = NULL;
+ gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL;
tree_code conv_code = NOP_EXPR;
gimple *cur_stmt = gsi_stmt (gsi);
@@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
tmp = tgt;
}
+ if (l_rotate)
+ {
+ tree m = build_int_cst (bswap_type, l_rotate);
+ tmp = make_temp_ssa_name (bswap_type, NULL,
+ mask_stmt ? "bswapmaskdst" : "bswapdst");
+ gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
+ rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m);
+ tmp = tgt;
+ }
+
/* Convert the result if necessary. */
if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
{
@@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT);
}
- gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
+ gimple_set_lhs (rotl_stmt ? rotl_stmt
+ : mask_stmt ? mask_stmt : bswap_stmt, tmp);
if (dump_file)
{
@@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
if (cur_stmt)
{
+ if (rotl_stmt)
+ gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
if (mask_stmt)
gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT);
gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
@@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
if (mask_stmt)
gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT);
+ if (rotl_stmt)
+ gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
}
return tgt;
}
@@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
}
bool cast64_to_32;
- uint64_t mask;
+ uint64_t mask, l_rotate;
gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
- &cast64_to_32, &mask);
+ &cast64_to_32, &mask, &l_rotate);
if (!ins_stmt
|| n.range != (unsigned HOST_WIDE_INT) sz
|| cast64_to_32
@@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
memset (&nop_stats, 0, sizeof (nop_stats));
memset (&bswap_stats, 0, sizeof (bswap_stats));
return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
- bswap_type, load_type, &n, bswap, mask) != NULL_TREE;
+ bswap_type, load_type, &n, bswap, mask,
+ l_rotate) != NULL_TREE;
}
/* Find manual byte swap implementations as well as load in a given
@@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun)
enum tree_code code;
struct symbolic_number n;
bool bswap, cast64_to_32;
- uint64_t mask;
+ uint64_t mask, l_rotate;
/* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
might be moved to a different basic block by bswap_replace and gsi
@@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun)
}
ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
- &cast64_to_32, &mask);
+ &cast64_to_32, &mask, &l_rotate);
if (!ins_stmt)
continue;
@@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun)
continue;
if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
- bswap_type, load_type, &n, bswap, mask))
+ bswap_type, load_type, &n, bswap, mask,
+ l_rotate))
changed = true;
}
}
@@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group)
}
bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
bswap_type, load_type, n, bswap,
- ~(uint64_t) 0);
+ ~(uint64_t) 0, 0);
gcc_assert (bswap_res);
}
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */
+
+#include<stdint.h>
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64 (uint64_t x)
+{
+ return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
+ (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+ (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+ (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
+ (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
+ (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+ (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+ (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_1 (uint64_t x)
+{
+ return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) |
+ (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+ (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+ (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
+ (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
+ (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+ (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+ (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_2 (uint64_t x)
+{
+ return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
+ (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+ (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+ (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
+ (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
+ (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+ (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+ (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32 (uint32_t x)
+{
+ return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+ (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_1 (uint32_t x)
+{
+ return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+ (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_2 (uint32_t x)
+{
+ return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
+ (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+ (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
new file mode 100644
@@ -0,0 +1,35 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr108938-1.c"
+
+int main ()
+{
+ uint64_t a = 0x0807060504030201ULL;
+ uint64_t res = swap_rotate_64 (a);
+ if (res != 0x0203040506070801ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_1 (a);
+ if (res != 0x0203040506070800ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_2 (a);
+ if (res != 0x0203040506070001ULL)
+ __builtin_abort ();
+
+ uint32_t b = 0x04030201;
+ uint32_t res2 = swap_rotate_32 (b);
+ if (res2 != 0x03040102)
+ __builtin_abort ();
+
+ res2 = swap_rotate_32_mask_1 (b);
+ if (res2 != 0x03040002)
+ __builtin_abort ();
+
+ res2 = swap_rotate_32_mask_2 (b);
+ if (res2 != 0x03040100)
+ __builtin_abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */
+
+void
+foo1 (char* a, unsigned int* __restrict b)
+{
+ a[0] = b[0] >> 24;
+ a[1] = b[0] >> 16;
+ a[2] = b[0] >> 8;
+ a[3] = b[0];
+ a[4] = b[1] >> 24;
+ a[5] = b[1] >> 16;
+ a[6] = b[1] >> 8;
+ a[7] = b[1];
+}
+
+void
+foo2 (char* a, short* __restrict b)
+{
+ a[0] = b[0] >> 8;
+ a[1] = b[0];
+ a[2] = b[1] >> 8;
+ a[3] = b[1];
+}
new file mode 100644
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */
+
+#include<stdint.h>
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 0 |
+ (uint64_t)(x[1]) << 56 |
+ (uint64_t)(x[2]) << 48 |
+ (uint64_t)(x[3]) << 40 |
+ (uint64_t)(x[4]) << 32 |
+ (uint64_t)(x[5]) << 24 |
+ (uint64_t)(x[6]) << 16 |
+ (uint64_t)(x[7]) << 8);
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_1 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 24 |
+ (uint64_t)(x[1]) << 16 |
+ (uint64_t)(x[2]) << 8 |
+ (uint64_t)(0) << 0 |
+ (uint64_t)(x[4]) << 56 |
+ (uint64_t)(x[5]) << 48 |
+ (uint64_t)(x[6]) << 40 |
+ (uint64_t)(x[7]) << 32);
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_2 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 0 |
+ (uint64_t)(x[1]) << 56 |
+ (uint64_t)(x[2]) << 48 |
+ (uint64_t)(0) << 40 |
+ (uint64_t)(x[4]) << 32 |
+ (uint64_t)(x[5]) << 24 |
+ (uint64_t)(x[6]) << 16 |
+ (uint64_t)(x[7]) << 8);
+}
+
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 8 |
+ (uint64_t)(x[1]) << 0 |
+ (uint64_t)(x[2]) << 24 |
+ (uint64_t)(x[3]) << 16);
+}
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_1 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 8 |
+ (uint64_t)(0) << 0 |
+ (uint64_t)(x[2]) << 24 |
+ (uint64_t)(x[3]) << 16);
+}
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr108938-load-1.c"
+
+int main ()
+{
+ unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+ uint64_t res = swap_rotate_64 (a);
+ if (res != 0x0203040506070801ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_1 (a);
+ if (res != 0x0506070801020300ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_2 (a);
+ if (res != 0x0203000506070801ULL)
+ __builtin_abort ();
+
+ uint32_t res2 = swap_rotate_32 (a);
+ if (res2 != 0x03040102)
+ __builtin_abort ();
+
+ res2 = swap_rotate_32_mask_1 (a);
+ if (res2 != 0x03040100)
+ __builtin_abort ();
+
+ return 0;
+}