expand: Convert cst - x into cst xor x.
Commit Message
Hi,
posting this separately from PR91213 now. I wrote an s390 test and most
likely it could also be done for x86 which will give it broader coverage.
Depending on the backend it might be better to convert
cst - x
into
cst xor x
if cst + 1 is a power of two and 0 <= x <= cst. This patch compares
both sequences and emits the less expensive one.
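For illustration (a standalone check, not part of the patch), the
identity behind the transformation can be verified exhaustively for
small masks:

#include <assert.h>

int
main (void)
{
  /* For mask = 2^k - 1 and 0 <= x <= mask no borrow can occur, so the
     subtraction just inverts the low k bits: mask - x == mask ^ x.  */
  for (unsigned int mask = 1; mask <= 0xffff; mask = (mask << 1) | 1)
    for (unsigned int x = 0; x <= mask; x++)
      assert (mask - x == (mask ^ x));
  return 0;
}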
Does this look like a viable approach? Bootstrapped and regtested on
s390[x], waited with x86 tests until a first round of comments.
Regards
Robin
gcc/ChangeLog:
PR middle-end/91213
* expr.cc (expand_expr_real_2): Call new function.
(maybe_optimize_cst_sub): New function.
* expr.h (maybe_optimize_cst_sub): Define.
gcc/testsuite/ChangeLog:
* gcc.target/s390/cst-minus-var.c: New test.
---
gcc/expr.cc | 79 +++++++++++++++++++
gcc/expr.h | 2 +
gcc/testsuite/gcc.target/s390/cst-minus-var.c | 55 +++++++++++++
3 files changed, 136 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/s390/cst-minus-var.c
Comments
On Tue, Sep 6, 2022 at 11:42 AM Robin Dapp <rdapp@linux.ibm.com> wrote:
>
> Hi,
>
> posting this separately from PR91213 now. I wrote an s390 test and most
> likely it could also be done for x86 which will give it broader coverage.
>
> Depending on the backend it might be better to convert
> cst - x
> into
> cst xor x
> if cst + 1 is a power of two and 0 <= x <= cst. This patch compares
> both sequences and emits the less expensive one.
>
> Does this look like a viable approach? Bootstrapped and regtested on
> s390[x], waited with x86 tests until a first round of comments.
cost might also depend on the context in case flag setting
behavior differs for xor vs sub (on x86 sub looks strictly more
powerful here). The same is probably true when looking for
a combination with another bitwise operation.
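For example (schematic x86, not from the thread), a later branch can
reuse the flags sub sets, while xor forces some of them to fixed values:

    subl  %esi, %eax   # CF/OF/ZF/SF reflect the subtraction; ja/jb can reuse them
    xorl  %esi, %eax   # CF and OF are always cleared; only ZF/SF/PF remain useful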
Btw, why not perform the optimization in expand_binop? That
for example already does
    if (binoptab == sub_optab && CONST_INT_P (op1))
      {
        op1 = negate_rtx (mode, op1);
        binoptab = add_optab;
      }
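A hypothetical sketch (not proposed in the thread) of how that spot
could host the transformation; the hard part is the 0 <= op1 <= op0
guard, since the range information lives on trees rather than RTL:

    /* Hypothetical sketch only: turn cst - x into cst ^ x when
       cst + 1 is a power of two and x is known to lie in [0, cst].  */
    if (binoptab == sub_optab && CONST_INT_P (op0)
        && INTVAL (op0) >= 0
        && exact_log2 (INTVAL (op0) + 1) >= 0
        && op1_known_le_p (op1, INTVAL (op0)))  /* hypothetical helper */
      binoptab = xor_optab;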
alternatively a target's expander can do the selection as well.
Richard.
> Regards
> Robin
>
> gcc/ChangeLog:
>
> PR middle-end/91213
> * expr.cc (expand_expr_real_2): Call new function.
> (maybe_optimize_cst_sub): New function.
> * expr.h (maybe_optimize_cst_sub): Define.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/s390/cst-minus-var.c: New test.
>
> ---
> gcc/expr.cc | 79 +++++++++++++++++++
> gcc/expr.h | 2 +
> gcc/testsuite/gcc.target/s390/cst-minus-var.c | 55 +++++++++++++
> 3 files changed, 136 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/s390/cst-minus-var.c
>
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 80bb1b8a4c5b..80f25720d7b6 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -9397,6 +9397,21 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
>            return simplify_gen_binary (MINUS, mode, op0, op1);
>          }
>
> +      /* Convert const - A to const xor A if integer_pow2p (const + 1)
> +         and 0 <= A <= const.  */
> +      if (code == MINUS_EXPR
> +          && SCALAR_INT_MODE_P (mode)
> +          && TREE_CODE (treeop0) == INTEGER_CST
> +          && TREE_CODE (TREE_TYPE (treeop1)) == INTEGER_TYPE
> +          && wi::exact_log2 (wi::to_widest (treeop0) + 1) != -1)
> +        {
> +          rtx res = maybe_optimize_cst_sub (code, treeop0, treeop1,
> +                                            mode, unsignedp, type,
> +                                            target, subtarget);
> +          if (res)
> +            return res;
> +        }
> +
>        /* No sense saving up arithmetic to be done
>           if it's all in the wrong mode to form part of an address.
>           And force_operand won't know whether to sign-extend or
> @@ -12692,6 +12707,70 @@ maybe_optimize_mod_cmp (enum tree_code code, tree *arg0, tree *arg1)
>    return code == EQ_EXPR ? LE_EXPR : GT_EXPR;
>  }
>
> +/* Convert const - A to const xor A if integer_pow2p (const + 1)
> +   and 0 <= A <= const.  */
> +
> +rtx
> +maybe_optimize_cst_sub (enum tree_code code, tree treeop0, tree treeop1,
> +                        machine_mode mode, int unsignedp, tree type,
> +                        rtx target, rtx subtarget)
> +{
> +  gcc_checking_assert (code == MINUS_EXPR);
> +  gcc_checking_assert (SCALAR_INT_MODE_P (mode));
> +  gcc_checking_assert (TREE_CODE (treeop0) == INTEGER_CST);
> +  gcc_checking_assert (TREE_CODE (TREE_TYPE (treeop1)) == INTEGER_TYPE);
> +  gcc_checking_assert (wi::exact_log2 (wi::to_widest (treeop0) + 1) != -1);
> +
> +  if (!optimize)
> +    return NULL_RTX;
> +
> +  optab this_optab;
> +  rtx op0, op1;
> +
> +  if (wi::leu_p (tree_nonzero_bits (treeop1), tree_nonzero_bits (treeop0)))
> +    {
> +      expand_operands (treeop0, treeop1, subtarget, &op0, &op1,
> +                       EXPAND_NORMAL);
> +      bool speed_p = optimize_insn_for_speed_p ();
> +      do_pending_stack_adjust ();
> +      start_sequence ();
> +      this_optab = optab_for_tree_code (MINUS_EXPR, type,
> +                                        optab_default);
> +      rtx subi = expand_binop (mode, this_optab, op0, op1, target,
> +                               unsignedp, OPTAB_LIB_WIDEN);
> +
> +      rtx_insn *sub_insns = get_insns ();
> +      end_sequence ();
> +      start_sequence ();
> +      this_optab = optab_for_tree_code (BIT_XOR_EXPR, type,
> +                                        optab_default);
> +      rtx xori = expand_binop (mode, this_optab, op0, op1, target,
> +                               unsignedp, OPTAB_LIB_WIDEN);
> +      rtx_insn *xor_insns = get_insns ();
> +      end_sequence ();
> +      unsigned sub_cost = seq_cost (sub_insns, speed_p);
> +      unsigned xor_cost = seq_cost (xor_insns, speed_p);
> +      /* If the costs are the same, use the other factor as a
> +         tie breaker.  */
> +      if (sub_cost == xor_cost)
> +        {
> +          sub_cost = seq_cost (sub_insns, !speed_p);
> +          xor_cost = seq_cost (xor_insns, !speed_p);
> +        }
> +
> +      if (sub_cost <= xor_cost)
> +        {
> +          emit_insn (sub_insns);
> +          return subi;
> +        }
> +
> +      emit_insn (xor_insns);
> +      return xori;
> +    }
> +
> +  return NULL_RTX;
> +}
> +
> /* Optimize x - y < 0 into x < 0 if x - y has undefined overflow. */
>
> void
> diff --git a/gcc/expr.h b/gcc/expr.h
> index 08b59b8d869a..43ea11042d26 100644
> --- a/gcc/expr.h
> +++ b/gcc/expr.h
> @@ -326,6 +326,8 @@ extern tree string_constant (tree, tree *, tree *, tree *);
>  extern tree byte_representation (tree, tree *, tree *, tree *);
>
>  extern enum tree_code maybe_optimize_mod_cmp (enum tree_code, tree *, tree *);
> +extern rtx maybe_optimize_cst_sub (enum tree_code, tree, tree,
> +                                   machine_mode, int, tree, rtx, rtx);
>  extern void maybe_optimize_sub_cmp_0 (enum tree_code, tree *, tree *);
>
> /* Two different ways of generating switch statements. */
> diff --git a/gcc/testsuite/gcc.target/s390/cst-minus-var.c b/gcc/testsuite/gcc.target/s390/cst-minus-var.c
> new file mode 100644
> index 000000000000..c713624a9784
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/cst-minus-var.c
> @@ -0,0 +1,55 @@
> +/* Check that we can convert const - x to const xor x if
> +   const + 1 is a power of two and 0 <= x <= const.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mzarch" } */
> +/* { dg-final { scan-assembler-times "xr\t" 8 { target { ! s390_z10_hw } } } } */
> +/* { dg-final { scan-assembler-times "xilf\t" 8 { target s390_z10_hw } } } */
> +
> +unsigned long long foo (unsigned long long a)
> +{
> +  if (a > 1) __builtin_unreachable();
> +  return 1 - a;
> +}
> +
> +unsigned long long bar (unsigned long long a)
> +{
> +  if (a > 65535) __builtin_unreachable();
> +  return 65535 - a;
> +}
> +
> +unsigned long long baz (unsigned long long a)
> +{
> +  if (a > 4294967295) __builtin_unreachable();
> +  return 4294967295 - a;
> +}
> +
> +int fooi (int a)
> +{
> +  if (a > 127 || a < 0) __builtin_unreachable();
> +  return 127 - a;
> +}
> +
> +int bari (int a)
> +{
> +  if (a > 65535 || a < 0) __builtin_unreachable();
> +  return 65535 - a;
> +}
> +
> +long bazl (long a)
> +{
> +  if (a > 4294967295 || a < 0) __builtin_unreachable();
> +  return 4294967295 - a;
> +}
> +
> +short foos (short a)
> +{
> +  if (a > 31 || a < 0) __builtin_unreachable();
> +  return 31 - a;
> +}
> +
> +short bars (int a)
> +{
> +  if (a > 65535 || a < 0) __builtin_unreachable();
> +  return 65535 - a;
> +}
> --
> 2.31.1
> cost might also depend on the context in case flag setting
> behavior differs for xor vs sub (on x86 sub looks strictly more
> powerful here). The same is probably true when looking for
> a combination with another bitwise operation.
>
> Btw, why not perform the optimization in expand_binop? That
> for example already does
>
>     if (binoptab == sub_optab && CONST_INT_P (op1))
>       {
>         op1 = negate_rtx (mode, op1);
>         binoptab = add_optab;
>       }
>
> alternatively a target's expander can do the selection as well.
I was under the impression optabs/expand_binop is only supposed to
"optimize" when it's clear that it is an optimization/canonicalization.
I didn't see other functions there trying two alternatives, and none
seems to use range information yet.
Regarding the proper costing (including the surroundings): is it even
possible to encompass everything in such a localized decision? A
target's expander decision would also not take this into account when
deciding. If so, should we not perform this conversion generically
rather than only in a specific target?
Regards
Robin
On Tue, Sep 6, 2022 at 4:01 PM Robin Dapp <rdapp@linux.ibm.com> wrote:
>
> > cost might also depend on the context in case flag setting
> > behavior differs for xor vs sub (on x86 sub looks strictly more
> > powerful here). The same is probably true when looking for
> > a combination with another bitwise operation.
> >
> > Btw, why not perform the optimization in expand_binop? That
> > for example already does
> >
> >     if (binoptab == sub_optab && CONST_INT_P (op1))
> >       {
> >         op1 = negate_rtx (mode, op1);
> >         binoptab = add_optab;
> >       }
> >
> > alternatively a target's expander can do the selection as well.
>
> I was under the impression optabs/expand_binop is only supposed to
> "optimize" when it's clear that it is an optimization/canonicalization.
> I didn't see other functions there trying two alternatives, and none
> seems to use range information yet.
>
> Regarding the proper costing (including the surroundings): is it even
> possible to encompass everything in such a localized decision? A
> target's expander decision would also not take this into account when
> deciding. If so, should we not perform this conversion generically
> rather than only in a specific target?
The question is really whether xor or sub is "better" statically. I can't
think of any reasons. On s390, why does xor end up "better"?
Richard.
>
> Regards
> Robin
> The question is really whether xor or sub is "better" statically. I can't
> think of any reasons. On s390, why does xor end up "better"?
There is an xor with immediate (as opposed to no "subtract from
immediate") which saves an instruction, usually. On x86, I think the
usual argument for xor is that it's shorter (if flags etc. are not needed).
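Roughly (schematic, not actual compiler output), for 65535 - a versus
65535 ^ a:

    # subtract: no subtract-from-immediate, so the constant is loaded first
    lgfi    %r1,65535
    sgrk    %r2,%r1,%r2
    # xor: a single xor-with-immediate suffices
    xilf    %r2,65535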
It's not that I don't want to implement it in the backend, just that I
understood the original PR in a way that it would make sense to have
this conversion available for more targets. If there are too many
confounding factors that prevent this situation from being statically
costed properly, then sure, not much use in implementing it generally.
Regards
Robin
On Wed, Sep 7, 2022 at 2:20 PM Robin Dapp <rdapp@linux.ibm.com> wrote:
>
> > The question is really whether xor or sub is "better" statically. I can't
> > think of any reasons. On s390, why does xor end up "better"?
>
> There is an xor with immediate (as opposed to no "subtract from
> immediate") which saves an instruction, usually. On x86, I think the
> usual argument for xor is that it's shorter (if flags etc. are not needed).
>
> It's not that I don't want to implement it in the backend, just that I
> understood the original PR in a way that it would make sense to have
> this conversion available for more targets. If there are too many
> confounding factors that prevent this situation from being statically
> costed properly, then sure, not much use in implementing it generally.
Do we have evidence that targets properly cost XOR vs SUB RTXen?
It might actually be a reload optimization - when the constant is
available in a register use 'sub', when it needs to be reloaded
use 'xor'?
That said, I wonder if the fallout of changing some SUB to XOR
is bigger than the benefit when we do it early (missed combines, etc.)?
> Regards
> Robin
On 9/7/2022 6:45 AM, Richard Biener via Gcc-patches wrote:
> On Wed, Sep 7, 2022 at 2:20 PM Robin Dapp <rdapp@linux.ibm.com> wrote:
>>> The question is really whether xor or sub is "better" statically. I can't
>>> think of any reasons. On s390, why does xor end up "better"?
>> There is an xor with immediate (as opposed to no "subtract from
>> immediate") which saves an instruction, usually. On x86, I think the
>> usual argument for xor is that it's shorter (if flags etc. are not needed).
>>
>> It's not that I don't want to implement it in the backend, just that I
>> understood the original PR in a way that it would make sense to have
>> this conversion available for more targets. If there are too many
>> confounding factors that prevent this situation from being statically
>> costed properly, then sure, not much use in implementing it generally.
> Do we have evidence that targets properly cost XOR vs SUB RTXen?
Probably not. And I have vague memories of some targets not having xor
with immediate but having more expressive sub instructions (so the
opposite of the situation on s390).
Checking costs seems like a very sensible thing to do though and if
targets need adjustment, then we can cope.
>
> It might actually be a reload optimization - when the constant is
> available in a register use 'sub', when it needs to be reloaded
> use 'xor'?
Perhaps. The question is finding out if the constant is actually
available in a register. We don't do a particularly good job at that.
>
> That said, I wonder if the fallout of changing some SUB to XOR
> is bigger than the benefit when we do it early (missed combines, etc.)?
Quite possibly -- though I doubt it matters in practice.
jeff
> Do we have evidence that targets properly cost XOR vs SUB RTXen?
>
> It might actually be a reload optimization - when the constant is
> available in a register use 'sub', when it needs to be reloaded
> use 'xor'?
>
> That said, I wonder if the fallout of changing some SUB to XOR
> is bigger than the benefit when we do it early (missed combines, etc.)?
Regarding fallout, I did a bootstrap and regtest for various backends
now. No change on Power9, s390x and aarch64. On x86 there is one
additional FAIL in pr78103-3.c:
unsigned long long
bar (unsigned int x)
{
  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
}
is supposed to become

        bsrl    %edi, %eax
        ret

but now is

        bsrl    %edi, %eax
        xorl    $31, %eax
        xorq    $31, %rax
        ret
The x86 backend has various splitters catching and simplifying something
like

        (xor (minus (const_int 63) (clz (match_operand))) (const_int 63))

to

        (bsr ...).
From a quick glance, there are several combinations of 31, 63, xor and
clz which would need to be duplicated(?) to match against the changed
patterns. Perhaps xor is always cheaper on x86 and a simple change from
(minus (const_int 63) (...)) to (xor (const_int 63) (...)) would be
sufficient, but this would still need to be reviewed separately.
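For instance, one counterpart (a hypothetical, untested sketch, not an
actual i386.md pattern) could canonicalize the xor form back so the
existing bsr patterns keep matching:

    ;; Hypothetical, untested sketch.
    (define_split
      [(set (match_operand:SI 0 "register_operand")
            (xor:SI (clz:SI (match_operand:SI 1 "register_operand"))
                    (const_int 31)))]
      ""
      [(set (match_dup 0)
            (minus:SI (const_int 31) (clz:SI (match_dup 1))))])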
Needing to keep both patterns (as neither minus nor xor can be
considered "more canonical" than the other) seems like an annoyance.
Regards
Robin