[v2] -finline-stringops: check base blksize for memset [PR112778]
Checks
Commit Message
Scratch the previous one, the "slightly different version" I had before
it was not entirely broken due to unnecessary, suboptimal and incorrect
use of ctz. Here I have yet another implementation of that loop that
should perform better and even work correctly ;-)
This one has so far regstrapped on x86_64-linux-gnu (v1 failed in
regression testing, sorry), and bootstrapped with -finline-stringops on
ppc64le-linux-gnu (still ongoing on x86-64-linux-gnu and
aarch64-linux-gnu). Ok to install?
The recently-added logic for -finline-stringops=memset introduced an
assumption that doesn't necessarily hold, namely, that
can_store_by_pieces of a larger size implies can_store_by_pieces by
smaller sizes. Checks for all sizes the by-multiple-pieces machinery
might use before committing to an expansion pattern.
for gcc/ChangeLog
PR target/112778
* builtins.cc (can_store_by_multiple_pieces): New.
(try_store_by_multiple_pieces): Call it.
for gcc/testsuite/ChangeLog
PR target/112778
* gcc.dg/inline-mem-cmp-pr112778.c: New.
---
gcc/builtins.cc | 57 ++++++++++++++++++++----
gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c | 10 ++++
2 files changed, 58 insertions(+), 9 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
Comments
On Sat, Dec 9, 2023 at 8:05 AM Alexandre Oliva <oliva@adacore.com> wrote:
>
> Scratch the previous one, the "slightly different version" I had before
> it was not entirely broken due to unnecessary, suboptimal and incorrect
> use of ctz. Here I have yet another implementation of that loop that
> should perform better and even work correctly ;-)
>
>
> This one has so far regstrapped on x86_64-linux-gnu (v1 failed in
> regression testing, sorry), and bootstrapped with -finline-stringops on
> ppc64le-linux-gnu (still ongoing on x86-64-linux-gnu and
> aarch64-linux-gnu). Ok to install?
OK
>
> The recently-added logic for -finline-stringops=memset introduced an
> assumption that doesn't necessarily hold, namely, that
> can_store_by_pieces of a larger size implies can_store_by_pieces by
> smaller sizes. Checks for all sizes the by-multiple-pieces machinery
> might use before committing to an expansion pattern.
>
>
> for gcc/ChangeLog
>
> PR target/112778
> * builtins.cc (can_store_by_multiple_pieces): New.
> (try_store_by_multiple_pieces): Call it.
>
> for gcc/testsuite/ChangeLog
>
> PR target/112778
> * gcc.dg/inline-mem-cmp-pr112778.c: New.
> ---
> gcc/builtins.cc | 57 ++++++++++++++++++++----
> gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c | 10 ++++
> 2 files changed, 58 insertions(+), 9 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
>
> diff --git a/gcc/builtins.cc b/gcc/builtins.cc
> index 12a535d313f12..f6c96498f0783 100644
> --- a/gcc/builtins.cc
> +++ b/gcc/builtins.cc
> @@ -4284,6 +4284,40 @@ expand_builtin_memset (tree exp, rtx target, machine_mode mode)
> return expand_builtin_memset_args (dest, val, len, target, mode, exp);
> }
>
> +/* Check that store_by_pieces allows BITS + LEN (so that we don't
> + expand something too unreasonably long), and every power of 2 in
> + BITS. It is assumed that LEN has already been tested by
> + itself. */
> +static bool
> +can_store_by_multiple_pieces (unsigned HOST_WIDE_INT bits,
> + by_pieces_constfn constfun,
> + void *constfundata, unsigned int align,
> + bool memsetp,
> + unsigned HOST_WIDE_INT len)
> +{
> + if (bits
> + && !can_store_by_pieces (bits + len, constfun, constfundata,
> + align, memsetp))
> + return false;
> +
> + /* BITS set are expected to be generally in the low range and
> + contiguous. We do NOT want to repeat the test above in case BITS
> + has a single bit set, so we terminate the loop when BITS == BIT.
> + In the unlikely case that BITS has the MSB set, also terminate in
> + case BIT gets shifted out. */
> + for (unsigned HOST_WIDE_INT bit = 1; bit < bits && bit; bit <<= 1)
> + {
> + if ((bits & bit) == 0)
> + continue;
> +
> + if (!can_store_by_pieces (bit, constfun, constfundata,
> + align, memsetp))
> + return false;
> + }
> +
> + return true;
> +}
> +
> /* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO.
> Return TRUE if successful, FALSE otherwise. TO is assumed to be
> aligned at an ALIGN-bits boundary. LEN must be a multiple of
> @@ -4341,7 +4375,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
> else
> /* Huh, max_len < min_len? Punt. See pr100843.c. */
> return false;
> - if (min_len >= blksize)
> + if (min_len >= blksize
> + /* ??? Maybe try smaller fixed-prefix blksizes before
> + punting? */
> + && can_store_by_pieces (blksize, builtin_memset_read_str,
> + &valc, align, true))
> {
> min_len -= blksize;
> min_bits = floor_log2 (min_len);
> @@ -4367,8 +4405,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
> happen because of the way max_bits and blksize are related, but
> it doesn't hurt to test. */
> if (blksize > xlenest
> - || !can_store_by_pieces (xlenest, builtin_memset_read_str,
> - &valc, align, true))
> + || !can_store_by_multiple_pieces (xlenest - blksize,
> + builtin_memset_read_str,
> + &valc, align, true, blksize))
> {
> if (!(flag_inline_stringops & ILSOP_MEMSET))
> return false;
> @@ -4386,17 +4425,17 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
> of overflow. */
> if (max_bits < orig_max_bits
> && xlenest + blksize >= xlenest
> - && can_store_by_pieces (xlenest + blksize,
> - builtin_memset_read_str,
> - &valc, align, true))
> + && can_store_by_multiple_pieces (xlenest,
> + builtin_memset_read_str,
> + &valc, align, true, blksize))
> {
> max_loop = true;
> break;
> }
> if (blksize
> - && can_store_by_pieces (xlenest,
> - builtin_memset_read_str,
> - &valc, align, true))
> + && can_store_by_multiple_pieces (xlenest,
> + builtin_memset_read_str,
> + &valc, align, true, 0))
> {
> max_len += blksize;
> min_len += blksize;
> diff --git a/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
> new file mode 100644
> index 0000000000000..fdfc5b6f28c8e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-finline-stringops" } */
> +
> +char buf[3];
> +
> +int
> +f ()
> +{
> + __builtin_memset (buf, 'v', 3);
> +}
>
>
> --
> Alexandre Oliva, happy hacker https://FSFLA.org/blogs/lxo/
> Free Software Activist GNU Toolchain Engineer
> More tolerance and less prejudice are key for inclusion and diversity
> Excluding neuro-others for not behaving ""normal"" is *not* inclusive
@@ -4284,6 +4284,40 @@ expand_builtin_memset (tree exp, rtx target, machine_mode mode)
return expand_builtin_memset_args (dest, val, len, target, mode, exp);
}
+/* Check that store_by_pieces allows BITS + LEN (so that we don't
+ expand something too unreasonably long), and every power of 2 in
+ BITS. It is assumed that LEN has already been tested by
+ itself. */
+static bool
+can_store_by_multiple_pieces (unsigned HOST_WIDE_INT bits,
+ by_pieces_constfn constfun,
+ void *constfundata, unsigned int align,
+ bool memsetp,
+ unsigned HOST_WIDE_INT len)
+{
+ if (bits
+ && !can_store_by_pieces (bits + len, constfun, constfundata,
+ align, memsetp))
+ return false;
+
+ /* BITS set are expected to be generally in the low range and
+ contiguous. We do NOT want to repeat the test above in case BITS
+ has a single bit set, so we terminate the loop when BITS == BIT.
+ In the unlikely case that BITS has the MSB set, also terminate in
+ case BIT gets shifted out. */
+ for (unsigned HOST_WIDE_INT bit = 1; bit < bits && bit; bit <<= 1)
+ {
+ if ((bits & bit) == 0)
+ continue;
+
+ if (!can_store_by_pieces (bit, constfun, constfundata,
+ align, memsetp))
+ return false;
+ }
+
+ return true;
+}
+
/* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO.
Return TRUE if successful, FALSE otherwise. TO is assumed to be
aligned at an ALIGN-bits boundary. LEN must be a multiple of
@@ -4341,7 +4375,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
else
/* Huh, max_len < min_len? Punt. See pr100843.c. */
return false;
- if (min_len >= blksize)
+ if (min_len >= blksize
+ /* ??? Maybe try smaller fixed-prefix blksizes before
+ punting? */
+ && can_store_by_pieces (blksize, builtin_memset_read_str,
+ &valc, align, true))
{
min_len -= blksize;
min_bits = floor_log2 (min_len);
@@ -4367,8 +4405,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
happen because of the way max_bits and blksize are related, but
it doesn't hurt to test. */
if (blksize > xlenest
- || !can_store_by_pieces (xlenest, builtin_memset_read_str,
- &valc, align, true))
+ || !can_store_by_multiple_pieces (xlenest - blksize,
+ builtin_memset_read_str,
+ &valc, align, true, blksize))
{
if (!(flag_inline_stringops & ILSOP_MEMSET))
return false;
@@ -4386,17 +4425,17 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
of overflow. */
if (max_bits < orig_max_bits
&& xlenest + blksize >= xlenest
- && can_store_by_pieces (xlenest + blksize,
- builtin_memset_read_str,
- &valc, align, true))
+ && can_store_by_multiple_pieces (xlenest,
+ builtin_memset_read_str,
+ &valc, align, true, blksize))
{
max_loop = true;
break;
}
if (blksize
- && can_store_by_pieces (xlenest,
- builtin_memset_read_str,
- &valc, align, true))
+ && can_store_by_multiple_pieces (xlenest,
+ builtin_memset_read_str,
+ &valc, align, true, 0))
{
max_len += blksize;
min_len += blksize;
new file mode 100644
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops" } */
+
+char buf[3];
+
+int
+f ()
+{
+ __builtin_memset (buf, 'v', 3);
+}