RISC-V: Add divmod instruction support
Checks
Commit Message
Hi all,
If we have division and remainder calculations with the same operands:
a = b / c;
d = b % c;
We can replace the calculation of remainder with multiplication +
subtraction, using the result from the previous division:
a = b / c;
d = a * c;
d = b - d;
Which will be faster.
Currently, it isn't done for RISC-V.
I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
Best regards,
Matevos.
gcc/ChangeLog:
* config/riscv/riscv.md: Added divmod expander.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/divmod.c: New testcase.
--- inline copy of the patch ---
Comments
On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
> Hi all,
> If we have division and remainder calculations with the same operands:
>
> a = b / c;
> d = b % c;
>
> We can replace the calculation of remainder with multiplication +
> subtraction, using the result from the previous division:
>
> a = b / c;
> d = a * c;
> d = b - d;
>
> Which will be faster.
Do you have any benchmarks that show that performance increase? The ISA
manual specifically says the suggested sequence is div+mod, and while
those suggestions don't always pan out for real hardware it's likely
that at least some implementations will end up with the ISA-suggested
fusions.
> Currently, it isn't done for RISC-V.
>
> I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
>
> Best regards,
> Matevos.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.md: Added divmod expander.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/riscv/divmod.c: New testcase.
>
> --- inline copy of the patch ---
>
> diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
> index f95dd405e12..d941483d9f1 100644
> --- a/gcc/config/riscv/iterators.md
> +++ b/gcc/config/riscv/iterators.md
> @@ -148,6 +148,11 @@
> ;; from the same template.
> (define_code_iterator any_mod [mod umod])
>
> +;; These code iterators allow unsigned and signed divmod to be generated
> +;; from the same template.
> +(define_code_iterator only_div [div udiv])
> +(define_code_attr paired_mod [(div "mod") (udiv "umod")])
> +
> ;; These code iterators allow the signed and unsigned scc operations to use
> ;; the same template.
> (define_code_iterator any_gt [gt gtu])
> @@ -175,7 +180,8 @@
> (gt "") (gtu "u")
> (ge "") (geu "u")
> (lt "") (ltu "u")
> - (le "") (leu "u")])
> + (le "") (leu "u")
> + (div "") (udiv "u")])
>
> ;; <su> is like <u>, but the signed form expands to "s" rather than "".
> (define_code_attr su [(sign_extend "s") (zero_extend "u")])
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index c8adc5af5d2..2d48ff3f8de 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -1044,6 +1044,22 @@
> [(set_attr "type" "idiv")
> (set_attr "mode" "DI")])
>
> +(define_expand "<u>divmod<mode>4"
> + [(parallel
> + [(set (match_operand:GPR 0 "register_operand")
> + (only_div:GPR (match_operand:GPR 1 "register_operand")
> + (match_operand:GPR 2 "register_operand")))
> + (set (match_operand:GPR 3 "register_operand")
> + (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
> + "TARGET_DIV"
> + {
> + rtx tmp = gen_reg_rtx (<MODE>mode);
> + emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
> operands[2]));
> + emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
> + emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
> + DONE;
> + })
> +
> (define_insn "*<optab>si3_extended"
> [(set (match_operand:DI 0 "register_operand" "=r")
> (sign_extend:DI
> diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
> b/gcc/testsuite/gcc.target/riscv/divmod.c
> new file mode 100644
> index 00000000000..254b25e654d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/divmod.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
> +
> +void
> +foo(int a, int b, int *c, int *d)
> +{
> + *c = a / b;
> + *d = a % b;
> +}
> +
> +/* { dg-final { scan-assembler-not "rem" } } */
> +/* { dg-final { scan-assembler-times "mul" 1 } } */
> +/* { dg-final { scan-assembler-times "sub" 1 } } */
On Sat, Feb 18, 2023 at 10:27 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
> > Hi all,
> > If we have division and remainder calculations with the same operands:
> >
> > a = b / c;
> > d = b % c;
> >
> > We can replace the calculation of remainder with multiplication +
> > subtraction, using the result from the previous division:
> >
> > a = b / c;
> > d = a * c;
> > d = b - d;
> >
> > Which will be faster.
>
> Do you have any benchmarks that show that performance increase? The ISA
> manual specifically says the suggested sequence is div+mod, and while
> those suggestions don't always pan out for real hardware it's likely
> that at least some implementations will end up with the ISA-suggested
> fusions.
I suspect I will be needing this kind of patch for the core that I am
going to be using.
If anything this should be under a tuning option.
Thanks,
Andrew Pinski
>
> > Currently, it isn't done for RISC-V.
> >
> > I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
> >
> > Best regards,
> > Matevos.
> >
> > gcc/ChangeLog:
> >
> > * config/riscv/riscv.md: Added divmod expander.
> >
> > gcc/testsuite/ChangeLog:
> > * gcc.target/riscv/divmod.c: New testcase.
> >
> > --- inline copy of the patch ---
> >
> > diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
> > index f95dd405e12..d941483d9f1 100644
> > --- a/gcc/config/riscv/iterators.md
> > +++ b/gcc/config/riscv/iterators.md
> > @@ -148,6 +148,11 @@
> > ;; from the same template.
> > (define_code_iterator any_mod [mod umod])
> >
> > +;; These code iterators allow unsigned and signed divmod to be generated
> > +;; from the same template.
> > +(define_code_iterator only_div [div udiv])
> > +(define_code_attr paired_mod [(div "mod") (udiv "umod")])
> > +
> > ;; These code iterators allow the signed and unsigned scc operations to use
> > ;; the same template.
> > (define_code_iterator any_gt [gt gtu])
> > @@ -175,7 +180,8 @@
> > (gt "") (gtu "u")
> > (ge "") (geu "u")
> > (lt "") (ltu "u")
> > - (le "") (leu "u")])
> > + (le "") (leu "u")
> > + (div "") (udiv "u")])
> >
> > ;; <su> is like <u>, but the signed form expands to "s" rather than "".
> > (define_code_attr su [(sign_extend "s") (zero_extend "u")])
> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > index c8adc5af5d2..2d48ff3f8de 100644
> > --- a/gcc/config/riscv/riscv.md
> > +++ b/gcc/config/riscv/riscv.md
> > @@ -1044,6 +1044,22 @@
> > [(set_attr "type" "idiv")
> > (set_attr "mode" "DI")])
> >
> > +(define_expand "<u>divmod<mode>4"
> > + [(parallel
> > + [(set (match_operand:GPR 0 "register_operand")
> > + (only_div:GPR (match_operand:GPR 1 "register_operand")
> > + (match_operand:GPR 2 "register_operand")))
> > + (set (match_operand:GPR 3 "register_operand")
> > + (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
> > + "TARGET_DIV"
> > + {
> > + rtx tmp = gen_reg_rtx (<MODE>mode);
> > + emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
> > operands[2]));
> > + emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
> > + emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
> > + DONE;
> > + })
> > +
> > (define_insn "*<optab>si3_extended"
> > [(set (match_operand:DI 0 "register_operand" "=r")
> > (sign_extend:DI
> > diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
> > b/gcc/testsuite/gcc.target/riscv/divmod.c
> > new file mode 100644
> > index 00000000000..254b25e654d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/divmod.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
> > +
> > +void
> > +foo(int a, int b, int *c, int *d)
> > +{
> > + *c = a / b;
> > + *d = a % b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "rem" } } */
> > +/* { dg-final { scan-assembler-times "mul" 1 } } */
> > +/* { dg-final { scan-assembler-times "sub" 1 } } */
On Sat, 18 Feb 2023 10:42:51 PST (-0800), pinskia@gmail.com wrote:
> On Sat, Feb 18, 2023 at 10:27 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>
>> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>> > Hi all,
>> > If we have division and remainder calculations with the same operands:
>> >
>> > a = b / c;
>> > d = b % c;
>> >
>> > We can replace the calculation of remainder with multiplication +
>> > subtraction, using the result from the previous division:
>> >
>> > a = b / c;
>> > d = a * c;
>> > d = b - d;
>> >
>> > Which will be faster.
>>
>> Do you have any benchmarks that show that performance increase? The ISA
>> manual specifically says the suggested sequence is div+mod, and while
>> those suggestions don't always pan out for real hardware it's likely
>> that at least some implementations will end up with the ISA-suggested
>> fusions.
>
> I suspect I will be needing this kind of patch for the core that I am
> going to be using.
OK, good to know. Presumably you guys aren't ready to show benchmarks,
though?
> If anything this should be under a tuning option.
That seems likely, as IIRC the SiFive cores do this fusion. It
generally seems like we're going to end up with implementations all over
the place when it comes to what's fused, so I bet we'll have a lot of
these differences between cores.
> Thanks,
> Andrew Pinski
>
>
>>
>> > Currently, it isn't done for RISC-V.
>> >
>> > I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
>> >
>> > Best regards,
>> > Matevos.
>> >
>> > gcc/ChangeLog:
>> >
>> > * config/riscv/riscv.md: Added divmod expander.
>> >
>> > gcc/testsuite/ChangeLog:
>> > * gcc.target/riscv/divmod.c: New testcase.
>> >
>> > --- inline copy of the patch ---
>> >
>> > diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
>> > index f95dd405e12..d941483d9f1 100644
>> > --- a/gcc/config/riscv/iterators.md
>> > +++ b/gcc/config/riscv/iterators.md
>> > @@ -148,6 +148,11 @@
>> > ;; from the same template.
>> > (define_code_iterator any_mod [mod umod])
>> >
>> > +;; These code iterators allow unsigned and signed divmod to be generated
>> > +;; from the same template.
>> > +(define_code_iterator only_div [div udiv])
>> > +(define_code_attr paired_mod [(div "mod") (udiv "umod")])
>> > +
>> > ;; These code iterators allow the signed and unsigned scc operations to use
>> > ;; the same template.
>> > (define_code_iterator any_gt [gt gtu])
>> > @@ -175,7 +180,8 @@
>> > (gt "") (gtu "u")
>> > (ge "") (geu "u")
>> > (lt "") (ltu "u")
>> > - (le "") (leu "u")])
>> > + (le "") (leu "u")
>> > + (div "") (udiv "u")])
>> >
>> > ;; <su> is like <u>, but the signed form expands to "s" rather than "".
>> > (define_code_attr su [(sign_extend "s") (zero_extend "u")])
>> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
>> > index c8adc5af5d2..2d48ff3f8de 100644
>> > --- a/gcc/config/riscv/riscv.md
>> > +++ b/gcc/config/riscv/riscv.md
>> > @@ -1044,6 +1044,22 @@
>> > [(set_attr "type" "idiv")
>> > (set_attr "mode" "DI")])
>> >
>> > +(define_expand "<u>divmod<mode>4"
>> > + [(parallel
>> > + [(set (match_operand:GPR 0 "register_operand")
>> > + (only_div:GPR (match_operand:GPR 1 "register_operand")
>> > + (match_operand:GPR 2 "register_operand")))
>> > + (set (match_operand:GPR 3 "register_operand")
>> > + (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
>> > + "TARGET_DIV"
>> > + {
>> > + rtx tmp = gen_reg_rtx (<MODE>mode);
>> > + emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
>> > operands[2]));
>> > + emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
>> > + emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
>> > + DONE;
>> > + })
>> > +
>> > (define_insn "*<optab>si3_extended"
>> > [(set (match_operand:DI 0 "register_operand" "=r")
>> > (sign_extend:DI
>> > diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
>> > b/gcc/testsuite/gcc.target/riscv/divmod.c
>> > new file mode 100644
>> > index 00000000000..254b25e654d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/riscv/divmod.c
>> > @@ -0,0 +1,14 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
>> > +
>> > +void
>> > +foo(int a, int b, int *c, int *d)
>> > +{
>> > + *c = a / b;
>> > + *d = a % b;
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-not "rem" } } */
>> > +/* { dg-final { scan-assembler-times "mul" 1 } } */
>> > +/* { dg-final { scan-assembler-times "sub" 1 } } */
On Sat, 18 Feb 2023, Andrew Pinski via Gcc-patches wrote:
> > > If we have division and remainder calculations with the same operands:
> > >
> > > a = b / c;
> > > d = b % c;
> > >
> > > We can replace the calculation of remainder with multiplication +
> > > subtraction, using the result from the previous division:
> > >
> > > a = b / c;
> > > d = a * c;
> > > d = b - d;
> > >
> > > Which will be faster.
> >
> > Do you have any benchmarks that show that performance increase? The ISA
> > manual specifically says the suggested sequence is div+mod, and while
> > those suggestions don't always pan out for real hardware it's likely
> > that at least some implementations will end up with the ISA-suggested
> > fusions.
>
> I suspect I will be needing this kind of patch for the core that I am
> going to be using.
> If anything this should be under a tuning option.
Barring the fusion case, which indeed asks for a dedicated `divmod'
pattern (and then I suppose a post-reload splitter or a peephole so that
where one of the two results produced has eventually turned out unused, we
have means to discard the unneeded machine instruction), isn't the generic
transformation something for the middle end to do based on RTX costs?
Maciej
On Sun, 19 Feb 2023 at 01:01, Maciej W. Rozycki <macro@orcam.me.uk> wrote:
>
> On Sat, 18 Feb 2023, Andrew Pinski via Gcc-patches wrote:
>
> > > > If we have division and remainder calculations with the same operands:
> > > >
> > > > a = b / c;
> > > > d = b % c;
> > > >
> > > > We can replace the calculation of remainder with multiplication +
> > > > subtraction, using the result from the previous division:
> > > >
> > > > a = b / c;
> > > > d = a * c;
> > > > d = b - d;
> > > >
> > > > Which will be faster.
> > >
> > > Do you have any benchmarks that show that performance increase? The ISA
> > > manual specifically says the suggested sequence is div+mod, and while
> > > those suggestions don't always pan out for real hardware it's likely
> > > that at least some implementations will end up with the ISA-suggested
> > > fusions.
> >
> > I suspect I will be needing this kind of patch for the core that I am
> > going to be using.
> > If anything this should be under a tuning option.
>
> Barring the fusion case, which indeed asks for a dedicated `divmod'
> pattern (and then I suppose a post-reload splitter or a peephole so that
> where one of the two results produced has eventually turned out unused, we
> have means to discard the unneeded machine instruction), isn't the generic
> transformation something for the middle end to do based on RTX costs?
Hi,
I am not sure if this answers your question, but we have target independent
div+mod -> divmod transform on GIMPLE in widening_mul pass if target
supports divmod insn.
It is implemented in tree-ssa-math-opts.cc:convert_to_divmod.
Thanks,
Prathamesh
>
> Maciej
On 2/18/23 11:26, Palmer Dabbelt wrote:
> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>> Hi all,
>> If we have division and remainder calculations with the same operands:
>>
>> a = b / c;
>> d = b % c;
>>
>> We can replace the calculation of remainder with multiplication +
>> subtraction, using the result from the previous division:
>>
>> a = b / c;
>> d = a * c;
>> d = b - d;
>>
>> Which will be faster.
>
> Do you have any benchmarks that show that performance increase? The ISA
> manual specifically says the suggested sequence is div+mod, and while
> those suggestions don't always pan out for real hardware it's likely
> that at least some implementations will end up with the ISA-suggested
> fusions.
It'll almost certainly be visible in mcf. Been there, done that. In
fact, that's why I asked the team Matevos works on to poke at this case
as I went through this issue on another processor.
It can also be run through LLVM's MCA to estimate counts if you've got a
pipeline description. THe div+rem will come out at around ~40c while a
div+mul+sub should weigh in around 25c for Veyron v1.
Jeff
On 2/18/23 12:31, Maciej W. Rozycki wrote:
> On Sat, 18 Feb 2023, Andrew Pinski via Gcc-patches wrote:
>
>>>> If we have division and remainder calculations with the same operands:
>>>>
>>>> a = b / c;
>>>> d = b % c;
>>>>
>>>> We can replace the calculation of remainder with multiplication +
>>>> subtraction, using the result from the previous division:
>>>>
>>>> a = b / c;
>>>> d = a * c;
>>>> d = b - d;
>>>>
>>>> Which will be faster.
>>>
>>> Do you have any benchmarks that show that performance increase? The ISA
>>> manual specifically says the suggested sequence is div+mod, and while
>>> those suggestions don't always pan out for real hardware it's likely
>>> that at least some implementations will end up with the ISA-suggested
>>> fusions.
>>
>> I suspect I will be needing this kind of patch for the core that I am
>> going to be using.
>> If anything this should be under a tuning option.
>
> Barring the fusion case, which indeed asks for a dedicated `divmod'
> pattern (and then I suppose a post-reload splitter or a peephole so that
> where one of the two results produced has eventually turned out unused, we
> have means to discard the unneeded machine instruction), isn't the generic
> transformation something for the middle end to do based on RTX costs?
I originally though the same way you did Maciej.
The problem is you don't see it as a divmod in expand_divmod unless you
expose a divmod optab. See tree-ssa-mathopts.cc's divmod handling.
jeff
On Sat, 18 Feb 2023 13:06:02 PST (-0800), jeffreyalaw@gmail.com wrote:
>
>
> On 2/18/23 11:26, Palmer Dabbelt wrote:
>> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>>> Hi all,
>>> If we have division and remainder calculations with the same operands:
>>>
>>> a = b / c;
>>> d = b % c;
>>>
>>> We can replace the calculation of remainder with multiplication +
>>> subtraction, using the result from the previous division:
>>>
>>> a = b / c;
>>> d = a * c;
>>> d = b - d;
>>>
>>> Which will be faster.
>>
>> Do you have any benchmarks that show that performance increase? The ISA
>> manual specifically says the suggested sequence is div+mod, and while
>> those suggestions don't always pan out for real hardware it's likely
>> that at least some implementations will end up with the ISA-suggested
>> fusions.
> It'll almost certainly be visible in mcf. Been there, done that. In
> fact, that's why I asked the team Matevos works on to poke at this case
> as I went through this issue on another processor.
>
> It can also be run through LLVM's MCA to estimate counts if you've got a
> pipeline description. THe div+rem will come out at around ~40c while a
> div+mul+sub should weigh in around 25c for Veyron v1.
Do you have a link to the patches somewhere? I couldn't find them
online, just the custom instruction support. Or even just some docs
describing what the pipeline does, as just basing one performance model
on another is kind of a double-edged sword.
That said, I think just knowing the processor doesn't do the div+mod
fusion is sufficient to turn something like this on for the mtune for
that processor. That's different than turning it on globally, though --
unless it turns out nobody is actually doing the fusion suggested in the
ISA manual, which wouldn't be super surprising.
Maybe some of the SiFive and T-Head folks can chime in on whether or not
their processors perform the fusion in question -- and if so, do the
instructions need to say back-to-back? It doesn't look like we're
really targeting the code sequences the ISA suggests as it stands, so
maybe it's OK to just switch the default over too?
It also brings up the question of mulh+mul fusions, which I don't think
we've really looked at (though maybe they're a lot less important for
rv64).
On 2/18/23 14:30, Palmer Dabbelt wrote:
> On Sat, 18 Feb 2023 13:06:02 PST (-0800), jeffreyalaw@gmail.com wrote:
>>
>>
>> On 2/18/23 11:26, Palmer Dabbelt wrote:
>>> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>>>> Hi all,
>>>> If we have division and remainder calculations with the same operands:
>>>>
>>>> a = b / c;
>>>> d = b % c;
>>>>
>>>> We can replace the calculation of remainder with multiplication +
>>>> subtraction, using the result from the previous division:
>>>>
>>>> a = b / c;
>>>> d = a * c;
>>>> d = b - d;
>>>>
>>>> Which will be faster.
>>>
>>> Do you have any benchmarks that show that performance increase? The ISA
>>> manual specifically says the suggested sequence is div+mod, and while
>>> those suggestions don't always pan out for real hardware it's likely
>>> that at least some implementations will end up with the ISA-suggested
>>> fusions.
>> It'll almost certainly be visible in mcf. Been there, done that. In
>> fact, that's why I asked the team Matevos works on to poke at this case
>> as I went through this issue on another processor.
>>
>> It can also be run through LLVM's MCA to estimate counts if you've got a
>> pipeline description. THe div+rem will come out at around ~40c while a
>> div+mul+sub should weigh in around 25c for Veyron v1.
>
> Do you have a link to the patches somewhere? I couldn't find them
> online, just the custom instruction support. Or even just some docs
> describing what the pipeline does, as just basing one performance model
> on another is kind of a double-edged sword.
It is. But div/rem is pretty simple. 20c each, not pipelined, using a
shared unit. There's some early out paths, but the compiler isn't going
to be able to model those as they depend on the number of bits on in the
inputs. Basically as long as we can do a mult+sub in < 20c, Matevos's
sequence is faster.
If we have implementations that support fusion at some point, then we
can twiddle the expander appropriately. Similarly we could easily
consider selecting on -Os as well since div+rem is smaller than
div+mul+sub. I'm sure Matevos is open to adjustments to that patch.
We haven't done a full eval on the pipeline modeling yet and with gcc in
stage4, it didn't seem advisable to try and push it through. Similarly
I don't think Matevos's patch should really be a gcc-13 thing, it really
should be gcc-14.
>
> That said, I think just knowing the processor doesn't do the div+mod
> fusion is sufficient to turn something like this on for the mtune for
> that processor. That's different than turning it on globally, though --
> unless it turns out nobody is actually doing the fusion suggested in the
> ISA manual, which wouldn't be super surprising.
I'm not aware of anyone doing fusion of divmod in the risc-v space.
For prior ports I've worked on, the hardware folks made is painfully
clear that the cost of adding another output port on the unit was a
non-starter. That port had a pretty fast divider with at least some
overlap and the div + mul + sub sequence was still better in general,
though the early out cases made it much harder to evaluate.
>
> Maybe some of the SiFive and T-Head folks can chime in on whether or not
> their processors perform the fusion in question -- and if so, do the
> instructions need to say back-to-back? It doesn't look like we're
> really targeting the code sequences the ISA suggests as it stands, so
> maybe it's OK to just switch the default over too?
Happy to take in their input. I suspect they'll ultimately prefer the
sequence Matevos is generating.
>
> It also brings up the question of mulh+mul fusions, which I don't think
> we've really looked at (though maybe they're a lot less important for
> rv64).
Not on our radar for V1 or V2.
jeff
On Sat, 18 Feb 2023, Jeff Law wrote:
> > Barring the fusion case, which indeed asks for a dedicated `divmod'
> > pattern (and then I suppose a post-reload splitter or a peephole so that
> > where one of the two results produced has eventually turned out unused, we
> > have means to discard the unneeded machine instruction), isn't the generic
> > transformation something for the middle end to do based on RTX costs?
> I originally though the same way you did Maciej.
>
> The problem is you don't see it as a divmod in expand_divmod unless you expose
> a divmod optab. See tree-ssa-mathopts.cc's divmod handling.
That's the kind of stuff I'd expect to happen at the tree level though,
before expand.
Maciej
On Sat, Feb 18, 2023 at 1:30 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Sat, 18 Feb 2023 13:06:02 PST (-0800), jeffreyalaw@gmail.com wrote:
> >
> >
> > On 2/18/23 11:26, Palmer Dabbelt wrote:
> >> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
> >>> Hi all,
> >>> If we have division and remainder calculations with the same operands:
> >>>
> >>> a = b / c;
> >>> d = b % c;
> >>>
> >>> We can replace the calculation of remainder with multiplication +
> >>> subtraction, using the result from the previous division:
> >>>
> >>> a = b / c;
> >>> d = a * c;
> >>> d = b - d;
> >>>
> >>> Which will be faster.
> >>
> >> Do you have any benchmarks that show that performance increase? The ISA
> >> manual specifically says the suggested sequence is div+mod, and while
> >> those suggestions don't always pan out for real hardware it's likely
> >> that at least some implementations will end up with the ISA-suggested
> >> fusions.
> > It'll almost certainly be visible in mcf. Been there, done that. In
> > fact, that's why I asked the team Matevos works on to poke at this case
> > as I went through this issue on another processor.
> >
> > It can also be run through LLVM's MCA to estimate counts if you've got a
> > pipeline description. THe div+rem will come out at around ~40c while a
> > div+mul+sub should weigh in around 25c for Veyron v1.
>
> Do you have a link to the patches somewhere? I couldn't find them
> online, just the custom instruction support. Or even just some docs
> describing what the pipeline does, as just basing one performance model
> on another is kind of a double-edged sword.
>
> That said, I think just knowing the processor doesn't do the div+mod
> fusion is sufficient to turn something like this on for the mtune for
> that processor. That's different than turning it on globally, though --
> unless it turns out nobody is actually doing the fusion suggested in the
> ISA manual, which wouldn't be super surprising.
>
> Maybe some of the SiFive and T-Head folks can chime in on whether or not
> their processors perform the fusion in question -- and if so, do the
> instructions need to say back-to-back?
AFAIK, the sequence with the multiplication will normally be faster on
SiFive cores when both the quotient and the remainder are needed.
> It doesn't look like we're
> really targeting the code sequences the ISA suggests as it stands, so
> maybe it's OK to just switch the default over too?
>
> It also brings up the question of mulh+mul fusions, which I don't think
> we've really looked at (though maybe they're a lot less important for
> rv64).
On Sun, Feb 19, 2023 at 2:15 AM Maciej W. Rozycki <macro@orcam.me.uk> wrote:
>
> On Sat, 18 Feb 2023, Jeff Law wrote:
>
> > > Barring the fusion case, which indeed asks for a dedicated `divmod'
> > > pattern (and then I suppose a post-reload splitter or a peephole so that
> > > where one of the two results produced has eventually turned out unused, we
> > > have means to discard the unneeded machine instruction), isn't the generic
> > > transformation something for the middle end to do based on RTX costs?
> > I originally though the same way you did Maciej.
> >
> > The problem is you don't see it as a divmod in expand_divmod unless you expose
> > a divmod optab. See tree-ssa-mathopts.cc's divmod handling.
>
> That's the kind of stuff I'd expect to happen at the tree level though,
> before expand.
The GIMPLE pass forming divmod could indeed choose to emit the
div + mul/sub sequence instead if an actual divmod pattern isn't available.
It could even generate some fake mul/sub/mod RTXen to cost the two
variants against each other but I seriously doubt any uarch that implements
division/modulo has a slower mul/sub.
Richard.
>
> Maciej
On Mon, 20 Feb 2023, Richard Biener via Gcc-patches wrote:
> On Sun, Feb 19, 2023 at 2:15 AM Maciej W. Rozycki <macro@orcam.me.uk> wrote:
> >
> > > The problem is you don't see it as a divmod in expand_divmod unless you expose
> > > a divmod optab. See tree-ssa-mathopts.cc's divmod handling.
> >
> > That's the kind of stuff I'd expect to happen at the tree level though,
> > before expand.
>
> The GIMPLE pass forming divmod could indeed choose to emit the
> div + mul/sub sequence instead if an actual divmod pattern isn't available.
> It could even generate some fake mul/sub/mod RTXen to cost the two
> variants against each other but I seriously doubt any uarch that implements
> division/modulo has a slower mul/sub.
Making a correct decision requires knowing to which degree the divider is
pipelined, and costs won't properly reflect that. If the divider accepts
a new div/mod instruction every couple of cycles, it's faster to just issue
a div followed by a mod with the same operands.
Therefore I think in this case it's fair for GIMPLE level to just check if
the divmod pattern is available, and let the target do the fine tuning via
the divmod expander.
It would make sense for tree-ssa-mathopts to emit div + mul/sub when neither
'divmod' nor 'mod' patterns are available, because RTL expansion will do the
same, just later, and we'll rely on RTL CSE to clean up the redundant div.
But RISC-V has both 'div' and 'mod', so as I tried to explain in the first
paragraph we should let the target decide.
Alexander
On Mon, 20 Feb 2023, Alexander Monakov wrote:
> > > That's the kind of stuff I'd expect to happen at the tree level though,
> > > before expand.
> >
> > The GIMPLE pass forming divmod could indeed choose to emit the
> > div + mul/sub sequence instead if an actual divmod pattern isn't available.
> > It could even generate some fake mul/sub/mod RTXen to cost the two
> > variants against each other but I seriously doubt any uarch that implements
> > division/modulo has a slower mul/sub.
>
> Making a correct decision requires knowing to which degree the divider is
> pipelined, and costs won't properly reflect that. If the divider accepts
> a new div/mod instruction every couple of cycles, it's faster to just issue
> a div followed by a mod with the same operands.
I guess there exist microarchitectures that have their divider pipelined,
but I haven't come across one. I think division is not an operation that
is commonly optimised for in hw RTL, whether integer or FP, not at least
in embedded applications, and given the nature of the operation I believe
it would be particularly costly in terms of silicon. I've seen latencies
and repeat rates of up to 50 quoted in CPU documentation even for 32-bit
integer division while multiplication had a latency of 5 and a repeat rate
of 1 (i.e. fully pipelined) for the same microarchitecture.
So (taking the data dependency into account) the latency of a DIV + MOD
operation would be 100 for said microarchitecture (and the same repeat
rate), while a DIV + MUL/SUB sequence would have a latency of 56 and a
repeat rate of 50. Quite an improvement.
> Therefore I think in this case it's fair for GIMPLE level to just check if
> the divmod pattern is available, and let the target do the fine tuning via
> the divmod expander.
Hmm, we have DFA scheduling available that is supposed to give latencies
and repeat rates for individual operations. Wouldn't it be possible to
get hold of this information?
If we cannot make use of this information at the GIMPLE level (which I'd
consider regrettable), then I think that maybe we need a target hook to
say division is cheap that would prevent a DIV + MOD to DIV + MUL/SUB
transformation from happening, perhaps off by default. I think it would
be regrettable too if every backend for targets/subtargets that do not
have a hardware DIVMOD operation (e.g. MIPS has one at certain ISA levels
only) or a pipelined division would have to code the transformation by
hand.
> It would make sense for tree-ssa-mathopts to emit div + mul/sub when neither
> 'divmod' nor 'mod' patterns are available, because RTL expansion will do the
> same, just later, and we'll rely on RTL CSE to clean up the redundant div.
That as well.
> But RISC-V has both 'div' and 'mod', so as I tried to explain in the first
> paragraph we should let the target decide.
Still I think it would be best if RISC-V ought to supply a divmod pattern
only where fused DIV/MOD execution is present in the microarchitecture.
Maciej
On 2/17/23 07:02, Matevos Mehrabyan via Gcc-patches wrote:
> Hi all,
> If we have division and remainder calculations with the same operands:
>
> a = b / c;
> d = b % c;
>
> We can replace the calculation of remainder with multiplication +
> subtraction, using the result from the previous division:
>
> a = b / c;
> d = a * c;
> d = b - d;
>
> Which will be faster.
> Currently, it isn't done for RISC-V.
>
> I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
>
> Best regards,
> Matevos.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.md: Added divmod expander.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/riscv/divmod.c: New testcase.
So here's an update to the patch that I think addresses the key concerns.
Specifically use of the divmod expander is now conditional on the tuning
info which allows for better control over using div+rem or div+mul+sub.
Given I don't know the right tunings for any implementation other than
Veyron V1, I left them as-is. So all the implementations will still use
the div+rem sequence. Obviously when we submit the Veyron V1 tunings,
it will use div+mul+sub. I expect other implementations will prefer
div+mul+sub as well.
The testcase is split into two tests. One to verify the old div+rem
sequence, the other to test for div+mul+sub. The latter test is
disabled for now. Once the first uarch has flipped tuning, we can add
the proper tune parameter and enable the test.
Attached is the patch I committed.
Thanks and sorry for the long delay.
jeff
commit 065be0ffbcd676b635d492f4679e635b6ece4fe4
Author: Matevos Mehrabyan <matevosmehrabyan@gmail.com>
Date: Fri Apr 28 14:01:30 2023 -0600
RISC-V: Add divmod expansion support
Hi all,
If we have division and remainder calculations with the same operands:
a = b / c;
d = b % c;
We can replace the calculation of remainder with multiplication +
subtraction, using the result from the previous division:
a = b / c;
d = a * c;
d = b - d;
Which will be faster.
Currently, it isn't done for RISC-V.
I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
Best regards,
Matevos.
gcc/ChangeLog:
* config/riscv/iterators.md (only_div, paired_mod): New iterators.
(u): Add div/udiv cases.
* config/riscv/riscv-protos.h (riscv_use_divmod_expander): Prototype.
* config/riscv/riscv.cc (struct riscv_tune_param): Add field for
divmod expansion.
(rocket_tune_info, sifive_7_tune_info): Initialize new field.
(thead_c906_tune_info): Likewise.
(optimize_size_tune_info): Likewise.
(riscv_use_divmod_expander): New function.
* config/riscv/riscv.md (<u>divmod<mode>4): New expander.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/divmod-1.c: New testcase.
* gcc.target/riscv/divmod-2.c: New testcase.
diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index 9b767038452..1d56324df03 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -152,6 +152,11 @@ (define_code_iterator any_div [div udiv mod umod])
;; from the same template.
(define_code_iterator any_mod [mod umod])
+;; These code iterators allow unsigned and signed divmod to be generated
+;; from the same template.
+(define_code_iterator only_div [div udiv])
+(define_code_attr paired_mod [(div "mod") (udiv "umod")])
+
;; These code iterators allow the signed and unsigned scc operations to use
;; the same template.
(define_code_iterator any_gt [gt gtu])
@@ -181,6 +186,7 @@ (define_code_attr u [(sign_extend "") (zero_extend "u")
(lt "") (ltu "u")
(le "") (leu "u")
(fix "") (unsigned_fix "u")
+ (div "") (udiv "u")
(float "") (unsigned_float "u")])
;; <su> is like <u>, but the signed form expands to "s" rather than "".
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index f87661bde2c..5a927bdf1b0 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -239,4 +239,5 @@ extern const char*
th_mempair_output_move (rtx[4], bool, machine_mode, RTX_CODE);
#endif
+extern bool riscv_use_divmod_expander (void);
#endif /* ! GCC_RISCV_PROTOS_H */
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 1529855a2b4..09a30dc260f 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -236,6 +236,7 @@ struct riscv_tune_param
unsigned short memory_cost;
unsigned short fmv_cost;
bool slow_unaligned_access;
+ bool use_divmod_expansion;
};
/* Information about one micro-arch we know about. */
@@ -323,6 +324,7 @@ static const struct riscv_tune_param rocket_tune_info = {
5, /* memory_cost */
8, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* use_divmod_expansion */
};
/* Costs to use when optimizing for Sifive 7 Series. */
@@ -337,6 +339,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
3, /* memory_cost */
8, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* use_divmod_expansion */
};
/* Costs to use when optimizing for T-HEAD c906. */
@@ -351,6 +354,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
5, /* memory_cost */
8, /* fmv_cost */
false, /* slow_unaligned_access */
+ false /* use_divmod_expansion */
};
/* Costs to use when optimizing for size. */
@@ -365,6 +369,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
2, /* memory_cost */
8, /* fmv_cost */
false, /* slow_unaligned_access */
+ false, /* use_divmod_expansion */
};
static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *);
@@ -7210,6 +7215,16 @@ riscv_lshift_subword (machine_mode mode, rtx value, rtx shift,
gen_lowpart (QImode, shift)));
}
+/* Return TRUE if we should use the divmod expander, FALSE otherwise. This
+ allows the behavior to be tuned for specific implementations as well as
+ when optimizing for size. */
+
+bool
+riscv_use_divmod_expander (void)
+{
+ return tune_param->use_divmod_expansion;
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 376a8831820..c508ee3ad89 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1063,6 +1063,22 @@ (define_insn "<optab>di3"
[(set_attr "type" "idiv")
(set_attr "mode" "DI")])
+(define_expand "<u>divmod<mode>4"
+ [(parallel
+ [(set (match_operand:GPR 0 "register_operand")
+ (only_div:GPR (match_operand:GPR 1 "register_operand")
+ (match_operand:GPR 2 "register_operand")))
+ (set (match_operand:GPR 3 "register_operand")
+ (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
+ "TARGET_DIV && riscv_use_divmod_expander ()"
+ {
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1], operands[2]));
+ emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
+ emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
+ DONE;
+ })
+
(define_insn "*<optab>si3_extended"
[(set (match_operand:DI 0 "register_operand" "=r")
(sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/riscv/divmod-1.c b/gcc/testsuite/gcc.target/riscv/divmod-1.c
new file mode 100644
index 00000000000..9706611b500
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/divmod-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+
+void
+foo(int a, int b, int *c, int *d)
+{
+ *c = a / b;
+ *d = a % b;
+}
+
+/* { dg-final { scan-assembler-times "\tdiv" 1 } } */
+/* { dg-final { scan-assembler-times "\trem" 1} } */
diff --git a/gcc/testsuite/gcc.target/riscv/divmod-2.c b/gcc/testsuite/gcc.target/riscv/divmod-2.c
new file mode 100644
index 00000000000..dfd319e52c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/divmod-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* Skip this everywhere for now. Once we have a target with
+ divmod enabled, only skip for -O0, -O1, -Og, -Oz, -Os. */
+/* { dg-skip-if "" { *-*-* } { } } */
+
+void
+foo(int a, int b, int *c, int *d)
+{
+ *c = a / b;
+ *d = a % b;
+}
+
+/* { dg-final { scan-assembler-not "\trem" } } */
+/* { dg-final { scan-assembler-times "\tdiv" 1 } } */
+/* { dg-final { scan-assembler-times "\tmul" 1 } } */
+/* { dg-final { scan-assembler-times "\tsub" 1 } } */
@@ -148,6 +148,11 @@
;; from the same template.
(define_code_iterator any_mod [mod umod])
+;; These code iterators allow unsigned and signed divmod to be generated
+;; from the same template.
+(define_code_iterator only_div [div udiv])
+(define_code_attr paired_mod [(div "mod") (udiv "umod")])
+
;; These code iterators allow the signed and unsigned scc operations to use
;; the same template.
(define_code_iterator any_gt [gt gtu])
@@ -175,7 +180,8 @@
(gt "") (gtu "u")
(ge "") (geu "u")
(lt "") (ltu "u")
- (le "") (leu "u")])
+ (le "") (leu "u")
+ (div "") (udiv "u")])
;; <su> is like <u>, but the signed form expands to "s" rather than "".
(define_code_attr su [(sign_extend "s") (zero_extend "u")])
@@ -1044,6 +1044,22 @@
[(set_attr "type" "idiv")
(set_attr "mode" "DI")])
+(define_expand "<u>divmod<mode>4"
+ [(parallel
+ [(set (match_operand:GPR 0 "register_operand")
+ (only_div:GPR (match_operand:GPR 1 "register_operand")
+ (match_operand:GPR 2 "register_operand")))
+ (set (match_operand:GPR 3 "register_operand")
+ (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
+ "TARGET_DIV"
+ {
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
operands[2]));
+ emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
+ emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
+ DONE;
+ })
+
(define_insn "*<optab>si3_extended"
[(set (match_operand:DI 0 "register_operand" "=r")
(sign_extend:DI
b/gcc/testsuite/gcc.target/riscv/divmod.c
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
+
+void
+foo(int a, int b, int *c, int *d)
+{
+ *c = a / b;
+ *d = a % b;
+}
+
+/* { dg-final { scan-assembler-not "rem" } } */
+/* { dg-final { scan-assembler-times "mul" 1 } } */
+/* { dg-final { scan-assembler-times "sub" 1 } } */