diff mbox series

RISC-V: Add divmod instruction support

Message ID	CALXbNshUoKPUqJiL2Nit7t4Hahg0wYMPMxcWRrwWZf8=7BADng@mail.gmail.com
State	Corrupt patch
Headers	Received-SPF: pass (google.com: domain of gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org designates 8.43.85.97 as permitted sender) client-ip=8.43.85.97; DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 3728E382E6B8 MIME-Version: 1.0 Date: Fri, 17 Feb 2023 18:02:40 +0400 Message-ID: <CALXbNshUoKPUqJiL2Nit7t4Hahg0wYMPMxcWRrwWZf8=7BADng@mail.gmail.com> Subject: RISC-V: Add divmod instruction support To: gcc-patches@gcc.gnu.org Content-Type: text/plain; charset="UTF-8" Precedence: list From: Matevos Mehrabyan via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Matevos Mehrabyan <matevosmehrabyan@gmail.com> Errors-To: gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org> X-getmail-retrieved-from-mailbox: =?utf-8?q?INBOX?=
Series	RISC-V: Add divmod instruction support \| RISC-V: Add divmod instruction support

Checks

Context	Check	Description
snail/gcc-patch-check	warning	Git am fail log

Commit Message

Matevos Mehrabyan Feb. 17, 2023, 2:02 p.m. UTC

  Hi all,
If we have division and remainder calculations with the same operands:

  a = b / c;
  d = b % c;

We can replace the calculation of remainder with multiplication +
subtraction, using the result from the previous division:

  a = b / c;
  d = a * c;
  d = b - d;

Which will be faster.
Currently, it isn't done for RISC-V.

I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.

Best regards,
Matevos.

gcc/ChangeLog:

            * config/riscv/riscv.md: Added divmod expander.

gcc/testsuite/ChangeLog:
            * gcc.target/riscv/divmod.c: New testcase.

--- inline copy of the patch ---

Comments

Palmer Dabbelt Feb. 18, 2023, 6:26 p.m. UTC | #1

On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
> Hi all,
> If we have division and remainder calculations with the same operands:
>
>   a = b / c;
>   d = b % c;
>
> We can replace the calculation of remainder with multiplication +
> subtraction, using the result from the previous division:
>
>   a = b / c;
>   d = a * c;
>   d = b - d;
>
> Which will be faster.

Do you have any benchmarks that show that performance increase?  The ISA 
manual specifically says the suggested sequence is div+mod, and while 
those suggestions don't always pan out for real hardware it's likely 
that at least some implementations will end up with the ISA-suggested 
fusions.

> Currently, it isn't done for RISC-V.
>
> I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
>
> Best regards,
> Matevos.
>
> gcc/ChangeLog:
>
>             * config/riscv/riscv.md: Added divmod expander.
>
> gcc/testsuite/ChangeLog:
>             * gcc.target/riscv/divmod.c: New testcase.
>
> --- inline copy of the patch ---
>
> diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
> index f95dd405e12..d941483d9f1 100644
> --- a/gcc/config/riscv/iterators.md
> +++ b/gcc/config/riscv/iterators.md
> @@ -148,6 +148,11 @@
>  ;; from the same template.
>  (define_code_iterator any_mod [mod umod])
>
> +;; These code iterators allow unsigned and signed divmod to be generated
> +;; from the same template.
> +(define_code_iterator only_div [div udiv])
> +(define_code_attr paired_mod [(div "mod") (udiv "umod")])
> +
>  ;; These code iterators allow the signed and unsigned scc operations to use
>  ;; the same template.
>  (define_code_iterator any_gt [gt gtu])
> @@ -175,7 +180,8 @@
>       (gt "") (gtu "u")
>       (ge "") (geu "u")
>       (lt "") (ltu "u")
> -     (le "") (leu "u")])
> +     (le "") (leu "u")
> +     (div "") (udiv "u")])
>
>  ;; <su> is like <u>, but the signed form expands to "s" rather than "".
>  (define_code_attr su [(sign_extend "s") (zero_extend "u")])
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index c8adc5af5d2..2d48ff3f8de 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -1044,6 +1044,22 @@
>    [(set_attr "type" "idiv")
>     (set_attr "mode" "DI")])
>
> +(define_expand "<u>divmod<mode>4"
> +  [(parallel
> +     [(set (match_operand:GPR 0 "register_operand")
> +           (only_div:GPR (match_operand:GPR 1 "register_operand")
> +                         (match_operand:GPR 2 "register_operand")))
> +      (set (match_operand:GPR 3 "register_operand")
> +           (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
> +  "TARGET_DIV"
> +  {
> +      rtx tmp = gen_reg_rtx (<MODE>mode);
> +      emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
> operands[2]));
> +      emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
> +      emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
> +      DONE;
> +  })
> +
>  (define_insn "*<optab>si3_extended"
>    [(set (match_operand:DI                 0 "register_operand" "=r")
>   (sign_extend:DI
> diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
> b/gcc/testsuite/gcc.target/riscv/divmod.c
> new file mode 100644
> index 00000000000..254b25e654d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/divmod.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
> +
> +void
> +foo(int a, int b, int *c, int *d)
> +{
> +   *c = a / b;
> +   *d = a % b;
> +}
> +
> +/* { dg-final { scan-assembler-not "rem" } } */
> +/* { dg-final { scan-assembler-times "mul" 1 } } */
> +/* { dg-final { scan-assembler-times "sub" 1 } } */

Andrew Pinski Feb. 18, 2023, 6:42 p.m. UTC | #2

On Sat, Feb 18, 2023 at 10:27 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
> > Hi all,
> > If we have division and remainder calculations with the same operands:
> >
> >   a = b / c;
> >   d = b % c;
> >
> > We can replace the calculation of remainder with multiplication +
> > subtraction, using the result from the previous division:
> >
> >   a = b / c;
> >   d = a * c;
> >   d = b - d;
> >
> > Which will be faster.
>
> Do you have any benchmarks that show that performance increase?  The ISA
> manual specifically says the suggested sequence is div+mod, and while
> those suggestions don't always pan out for real hardware it's likely
> that at least some implementations will end up with the ISA-suggested
> fusions.

I suspect I will be needing this kind of patch for the core that I am
going to be using.
If anything this should be under a tuning option.

Thanks,
Andrew Pinski


>
> > Currently, it isn't done for RISC-V.
> >
> > I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
> >
> > Best regards,
> > Matevos.
> >
> > gcc/ChangeLog:
> >
> >             * config/riscv/riscv.md: Added divmod expander.
> >
> > gcc/testsuite/ChangeLog:
> >             * gcc.target/riscv/divmod.c: New testcase.
> >
> > --- inline copy of the patch ---
> >
> > diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
> > index f95dd405e12..d941483d9f1 100644
> > --- a/gcc/config/riscv/iterators.md
> > +++ b/gcc/config/riscv/iterators.md
> > @@ -148,6 +148,11 @@
> >  ;; from the same template.
> >  (define_code_iterator any_mod [mod umod])
> >
> > +;; These code iterators allow unsigned and signed divmod to be generated
> > +;; from the same template.
> > +(define_code_iterator only_div [div udiv])
> > +(define_code_attr paired_mod [(div "mod") (udiv "umod")])
> > +
> >  ;; These code iterators allow the signed and unsigned scc operations to use
> >  ;; the same template.
> >  (define_code_iterator any_gt [gt gtu])
> > @@ -175,7 +180,8 @@
> >       (gt "") (gtu "u")
> >       (ge "") (geu "u")
> >       (lt "") (ltu "u")
> > -     (le "") (leu "u")])
> > +     (le "") (leu "u")
> > +     (div "") (udiv "u")])
> >
> >  ;; <su> is like <u>, but the signed form expands to "s" rather than "".
> >  (define_code_attr su [(sign_extend "s") (zero_extend "u")])
> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > index c8adc5af5d2..2d48ff3f8de 100644
> > --- a/gcc/config/riscv/riscv.md
> > +++ b/gcc/config/riscv/riscv.md
> > @@ -1044,6 +1044,22 @@
> >    [(set_attr "type" "idiv")
> >     (set_attr "mode" "DI")])
> >
> > +(define_expand "<u>divmod<mode>4"
> > +  [(parallel
> > +     [(set (match_operand:GPR 0 "register_operand")
> > +           (only_div:GPR (match_operand:GPR 1 "register_operand")
> > +                         (match_operand:GPR 2 "register_operand")))
> > +      (set (match_operand:GPR 3 "register_operand")
> > +           (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
> > +  "TARGET_DIV"
> > +  {
> > +      rtx tmp = gen_reg_rtx (<MODE>mode);
> > +      emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
> > operands[2]));
> > +      emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
> > +      emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
> > +      DONE;
> > +  })
> > +
> >  (define_insn "*<optab>si3_extended"
> >    [(set (match_operand:DI                 0 "register_operand" "=r")
> >   (sign_extend:DI
> > diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
> > b/gcc/testsuite/gcc.target/riscv/divmod.c
> > new file mode 100644
> > index 00000000000..254b25e654d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/divmod.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
> > +
> > +void
> > +foo(int a, int b, int *c, int *d)
> > +{
> > +   *c = a / b;
> > +   *d = a % b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "rem" } } */
> > +/* { dg-final { scan-assembler-times "mul" 1 } } */
> > +/* { dg-final { scan-assembler-times "sub" 1 } } */

Palmer Dabbelt Feb. 18, 2023, 7:26 p.m. UTC | #3

On Sat, 18 Feb 2023 10:42:51 PST (-0800), pinskia@gmail.com wrote:
> On Sat, Feb 18, 2023 at 10:27 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>
>> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>> > Hi all,
>> > If we have division and remainder calculations with the same operands:
>> >
>> >   a = b / c;
>> >   d = b % c;
>> >
>> > We can replace the calculation of remainder with multiplication +
>> > subtraction, using the result from the previous division:
>> >
>> >   a = b / c;
>> >   d = a * c;
>> >   d = b - d;
>> >
>> > Which will be faster.
>>
>> Do you have any benchmarks that show that performance increase?  The ISA
>> manual specifically says the suggested sequence is div+mod, and while
>> those suggestions don't always pan out for real hardware it's likely
>> that at least some implementations will end up with the ISA-suggested
>> fusions.
>
> I suspect I will be needing this kind of patch for the core that I am
> going to be using.

OK, good to know.  Presumably you guys aren't ready to show benchmarks, 
though? 

> If anything this should be under a tuning option.

That seems likely, as IIRC the SiFive cores do this fusion.  It 
generally seems like we're going to end up with implementations all over 
the place when it comes to what's fused, so I bet we'll have a lot of 
these differences between cores.

> Thanks,
> Andrew Pinski
>
>
>>
>> > Currently, it isn't done for RISC-V.
>> >
>> > I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
>> >
>> > Best regards,
>> > Matevos.
>> >
>> > gcc/ChangeLog:
>> >
>> >             * config/riscv/riscv.md: Added divmod expander.
>> >
>> > gcc/testsuite/ChangeLog:
>> >             * gcc.target/riscv/divmod.c: New testcase.
>> >
>> > --- inline copy of the patch ---
>> >
>> > diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
>> > index f95dd405e12..d941483d9f1 100644
>> > --- a/gcc/config/riscv/iterators.md
>> > +++ b/gcc/config/riscv/iterators.md
>> > @@ -148,6 +148,11 @@
>> >  ;; from the same template.
>> >  (define_code_iterator any_mod [mod umod])
>> >
>> > +;; These code iterators allow unsigned and signed divmod to be generated
>> > +;; from the same template.
>> > +(define_code_iterator only_div [div udiv])
>> > +(define_code_attr paired_mod [(div "mod") (udiv "umod")])
>> > +
>> >  ;; These code iterators allow the signed and unsigned scc operations to use
>> >  ;; the same template.
>> >  (define_code_iterator any_gt [gt gtu])
>> > @@ -175,7 +180,8 @@
>> >       (gt "") (gtu "u")
>> >       (ge "") (geu "u")
>> >       (lt "") (ltu "u")
>> > -     (le "") (leu "u")])
>> > +     (le "") (leu "u")
>> > +     (div "") (udiv "u")])
>> >
>> >  ;; <su> is like <u>, but the signed form expands to "s" rather than "".
>> >  (define_code_attr su [(sign_extend "s") (zero_extend "u")])
>> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
>> > index c8adc5af5d2..2d48ff3f8de 100644
>> > --- a/gcc/config/riscv/riscv.md
>> > +++ b/gcc/config/riscv/riscv.md
>> > @@ -1044,6 +1044,22 @@
>> >    [(set_attr "type" "idiv")
>> >     (set_attr "mode" "DI")])
>> >
>> > +(define_expand "<u>divmod<mode>4"
>> > +  [(parallel
>> > +     [(set (match_operand:GPR 0 "register_operand")
>> > +           (only_div:GPR (match_operand:GPR 1 "register_operand")
>> > +                         (match_operand:GPR 2 "register_operand")))
>> > +      (set (match_operand:GPR 3 "register_operand")
>> > +           (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
>> > +  "TARGET_DIV"
>> > +  {
>> > +      rtx tmp = gen_reg_rtx (<MODE>mode);
>> > +      emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
>> > operands[2]));
>> > +      emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
>> > +      emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
>> > +      DONE;
>> > +  })
>> > +
>> >  (define_insn "*<optab>si3_extended"
>> >    [(set (match_operand:DI                 0 "register_operand" "=r")
>> >   (sign_extend:DI
>> > diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
>> > b/gcc/testsuite/gcc.target/riscv/divmod.c
>> > new file mode 100644
>> > index 00000000000..254b25e654d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/riscv/divmod.c
>> > @@ -0,0 +1,14 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
>> > +
>> > +void
>> > +foo(int a, int b, int *c, int *d)
>> > +{
>> > +   *c = a / b;
>> > +   *d = a % b;
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-not "rem" } } */
>> > +/* { dg-final { scan-assembler-times "mul" 1 } } */
>> > +/* { dg-final { scan-assembler-times "sub" 1 } } */

Maciej W. Rozycki Feb. 18, 2023, 7:31 p.m. UTC | #4

On Sat, 18 Feb 2023, Andrew Pinski via Gcc-patches wrote:

> > > If we have division and remainder calculations with the same operands:
> > >
> > >   a = b / c;
> > >   d = b % c;
> > >
> > > We can replace the calculation of remainder with multiplication +
> > > subtraction, using the result from the previous division:
> > >
> > >   a = b / c;
> > >   d = a * c;
> > >   d = b - d;
> > >
> > > Which will be faster.
> >
> > Do you have any benchmarks that show that performance increase?  The ISA
> > manual specifically says the suggested sequence is div+mod, and while
> > those suggestions don't always pan out for real hardware it's likely
> > that at least some implementations will end up with the ISA-suggested
> > fusions.
> 
> I suspect I will be needing this kind of patch for the core that I am
> going to be using.
> If anything this should be under a tuning option.

 Barring the fusion case, which indeed asks for a dedicated `divmod' 
pattern (and then I suppose a post-reload splitter or a peephole so that 
where one of the two results produced has eventually turned out unused, we 
have means to discard the unneeded machine instruction), isn't the generic 
transformation something for the middle end to do based on RTX costs?

  Maciej

Prathamesh Kulkarni Feb. 18, 2023, 8:57 p.m. UTC | #5

On Sun, 19 Feb 2023 at 01:01, Maciej W. Rozycki <macro@orcam.me.uk> wrote:
>
> On Sat, 18 Feb 2023, Andrew Pinski via Gcc-patches wrote:
>
> > > > If we have division and remainder calculations with the same operands:
> > > >
> > > >   a = b / c;
> > > >   d = b % c;
> > > >
> > > > We can replace the calculation of remainder with multiplication +
> > > > subtraction, using the result from the previous division:
> > > >
> > > >   a = b / c;
> > > >   d = a * c;
> > > >   d = b - d;
> > > >
> > > > Which will be faster.
> > >
> > > Do you have any benchmarks that show that performance increase?  The ISA
> > > manual specifically says the suggested sequence is div+mod, and while
> > > those suggestions don't always pan out for real hardware it's likely
> > > that at least some implementations will end up with the ISA-suggested
> > > fusions.
> >
> > I suspect I will be needing this kind of patch for the core that I am
> > going to be using.
> > If anything this should be under a tuning option.
>
>  Barring the fusion case, which indeed asks for a dedicated `divmod'
> pattern (and then I suppose a post-reload splitter or a peephole so that
> where one of the two results produced has eventually turned out unused, we
> have means to discard the unneeded machine instruction), isn't the generic
> transformation something for the middle end to do based on RTX costs?
Hi,
I am not sure if this answers your question, but we have target independent
div+mod -> divmod transform on GIMPLE in widening_mul pass if target
supports divmod insn.
It is implemented in tree-ssa-math-opts.cc:convert_to_divmod.

Thanks,
Prathamesh
>
>   Maciej

Jeff Law Feb. 18, 2023, 9:06 p.m. UTC | #6

On 2/18/23 11:26, Palmer Dabbelt wrote:
> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>> Hi all,
>> If we have division and remainder calculations with the same operands:
>>
>>   a = b / c;
>>   d = b % c;
>>
>> We can replace the calculation of remainder with multiplication +
>> subtraction, using the result from the previous division:
>>
>>   a = b / c;
>>   d = a * c;
>>   d = b - d;
>>
>> Which will be faster.
> 
> Do you have any benchmarks that show that performance increase?  The ISA 
> manual specifically says the suggested sequence is div+mod, and while 
> those suggestions don't always pan out for real hardware it's likely 
> that at least some implementations will end up with the ISA-suggested 
> fusions.
It'll almost certainly be visible in mcf.  Been there, done that.  In 
fact, that's why I asked the team Matevos works on to poke at this case 
as I went through this issue on another processor.

It can also be run through LLVM's MCA to estimate counts if you've got a 
pipeline description.  THe div+rem will come out at around ~40c while a 
div+mul+sub should weigh in around 25c for Veyron v1.

Jeff

Jeff Law Feb. 18, 2023, 9:07 p.m. UTC | #7

On 2/18/23 12:31, Maciej W. Rozycki wrote:
> On Sat, 18 Feb 2023, Andrew Pinski via Gcc-patches wrote:
> 
>>>> If we have division and remainder calculations with the same operands:
>>>>
>>>>    a = b / c;
>>>>    d = b % c;
>>>>
>>>> We can replace the calculation of remainder with multiplication +
>>>> subtraction, using the result from the previous division:
>>>>
>>>>    a = b / c;
>>>>    d = a * c;
>>>>    d = b - d;
>>>>
>>>> Which will be faster.
>>>
>>> Do you have any benchmarks that show that performance increase?  The ISA
>>> manual specifically says the suggested sequence is div+mod, and while
>>> those suggestions don't always pan out for real hardware it's likely
>>> that at least some implementations will end up with the ISA-suggested
>>> fusions.
>>
>> I suspect I will be needing this kind of patch for the core that I am
>> going to be using.
>> If anything this should be under a tuning option.
> 
>   Barring the fusion case, which indeed asks for a dedicated `divmod'
> pattern (and then I suppose a post-reload splitter or a peephole so that
> where one of the two results produced has eventually turned out unused, we
> have means to discard the unneeded machine instruction), isn't the generic
> transformation something for the middle end to do based on RTX costs?
I originally though the same way you did Maciej.

The problem is you don't see it as a divmod in expand_divmod unless you 
expose a divmod optab.  See tree-ssa-mathopts.cc's divmod handling.

jeff

Palmer Dabbelt Feb. 18, 2023, 9:30 p.m. UTC | #8

On Sat, 18 Feb 2023 13:06:02 PST (-0800), jeffreyalaw@gmail.com wrote:
>
>
> On 2/18/23 11:26, Palmer Dabbelt wrote:
>> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>>> Hi all,
>>> If we have division and remainder calculations with the same operands:
>>>
>>>   a = b / c;
>>>   d = b % c;
>>>
>>> We can replace the calculation of remainder with multiplication +
>>> subtraction, using the result from the previous division:
>>>
>>>   a = b / c;
>>>   d = a * c;
>>>   d = b - d;
>>>
>>> Which will be faster.
>>
>> Do you have any benchmarks that show that performance increase?  The ISA
>> manual specifically says the suggested sequence is div+mod, and while
>> those suggestions don't always pan out for real hardware it's likely
>> that at least some implementations will end up with the ISA-suggested
>> fusions.
> It'll almost certainly be visible in mcf.  Been there, done that.  In
> fact, that's why I asked the team Matevos works on to poke at this case
> as I went through this issue on another processor.
>
> It can also be run through LLVM's MCA to estimate counts if you've got a
> pipeline description.  THe div+rem will come out at around ~40c while a
> div+mul+sub should weigh in around 25c for Veyron v1.

Do you have a link to the patches somewhere?  I couldn't find them 
online, just the custom instruction support.  Or even just some docs 
describing what the pipeline does, as just basing one performance model 
on another is kind of a double-edged sword.

That said, I think just knowing the processor doesn't do the div+mod 
fusion is sufficient to turn something like this on for the mtune for 
that processor.  That's different than turning it on globally, though -- 
unless it turns out nobody is actually doing the fusion suggested in the 
ISA manual, which wouldn't be super surprising.

Maybe some of the SiFive and T-Head folks can chime in on whether or not 
their processors perform the fusion in question -- and if so, do the 
instructions need to say back-to-back?  It doesn't look like we're 
really targeting the code sequences the ISA suggests as it stands, so 
maybe it's OK to just switch the default over too?

It also brings up the question of mulh+mul fusions, which I don't think 
we've really looked at (though maybe they're a lot less important for 
rv64).

Jeff Law Feb. 18, 2023, 9:57 p.m. UTC | #9

On 2/18/23 14:30, Palmer Dabbelt wrote:
> On Sat, 18 Feb 2023 13:06:02 PST (-0800), jeffreyalaw@gmail.com wrote:
>>
>>
>> On 2/18/23 11:26, Palmer Dabbelt wrote:
>>> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
>>>> Hi all,
>>>> If we have division and remainder calculations with the same operands:
>>>>
>>>>   a = b / c;
>>>>   d = b % c;
>>>>
>>>> We can replace the calculation of remainder with multiplication +
>>>> subtraction, using the result from the previous division:
>>>>
>>>>   a = b / c;
>>>>   d = a * c;
>>>>   d = b - d;
>>>>
>>>> Which will be faster.
>>>
>>> Do you have any benchmarks that show that performance increase?  The ISA
>>> manual specifically says the suggested sequence is div+mod, and while
>>> those suggestions don't always pan out for real hardware it's likely
>>> that at least some implementations will end up with the ISA-suggested
>>> fusions.
>> It'll almost certainly be visible in mcf.  Been there, done that.  In
>> fact, that's why I asked the team Matevos works on to poke at this case
>> as I went through this issue on another processor.
>>
>> It can also be run through LLVM's MCA to estimate counts if you've got a
>> pipeline description.  THe div+rem will come out at around ~40c while a
>> div+mul+sub should weigh in around 25c for Veyron v1.
> 
> Do you have a link to the patches somewhere?  I couldn't find them 
> online, just the custom instruction support.  Or even just some docs 
> describing what the pipeline does, as just basing one performance model 
> on another is kind of a double-edged sword.
It is.  But div/rem is pretty simple.  20c each, not pipelined, using a 
shared unit.  There's some early out paths, but the compiler isn't going 
to be able to model those as they depend on the number of bits on in the 
inputs.  Basically as long as we can do a mult+sub in < 20c, Matevos's 
sequence is faster.

If we have implementations that support fusion at some point, then we 
can twiddle the expander appropriately.  Similarly we could easily 
consider selecting on -Os as well since div+rem is smaller than 
div+mul+sub.  I'm sure Matevos is open to adjustments to that patch.

We haven't done a full eval on the pipeline modeling yet and with gcc in 
stage4, it didn't seem advisable to try and push it through.  Similarly 
I don't think Matevos's patch should really be a gcc-13 thing, it really 
should be gcc-14.

> 
> That said, I think just knowing the processor doesn't do the div+mod 
> fusion is sufficient to turn something like this on for the mtune for 
> that processor.  That's different than turning it on globally, though -- 
> unless it turns out nobody is actually doing the fusion suggested in the 
> ISA manual, which wouldn't be super surprising.
I'm not aware of anyone doing fusion of divmod in the risc-v space.

For prior ports I've worked on, the hardware folks made is painfully 
clear that the cost of adding another output port on the unit was a 
non-starter.  That port had a pretty fast divider with at least some 
overlap and the div + mul + sub sequence was still better in general, 
though the early out cases made it much harder to evaluate.




> 
> Maybe some of the SiFive and T-Head folks can chime in on whether or not 
> their processors perform the fusion in question -- and if so, do the 
> instructions need to say back-to-back?  It doesn't look like we're 
> really targeting the code sequences the ISA suggests as it stands, so 
> maybe it's OK to just switch the default over too?
Happy to take in their input.  I suspect they'll ultimately prefer the 
sequence Matevos is generating.


> 
> It also brings up the question of mulh+mul fusions, which I don't think 
> we've really looked at (though maybe they're a lot less important for 
> rv64).
Not on our radar for V1 or V2.
jeff

Maciej W. Rozycki Feb. 19, 2023, 1:14 a.m. UTC | #10

On Sat, 18 Feb 2023, Jeff Law wrote:

> >   Barring the fusion case, which indeed asks for a dedicated `divmod'
> > pattern (and then I suppose a post-reload splitter or a peephole so that
> > where one of the two results produced has eventually turned out unused, we
> > have means to discard the unneeded machine instruction), isn't the generic
> > transformation something for the middle end to do based on RTX costs?
> I originally though the same way you did Maciej.
> 
> The problem is you don't see it as a divmod in expand_divmod unless you expose
> a divmod optab.  See tree-ssa-mathopts.cc's divmod handling.

 That's the kind of stuff I'd expect to happen at the tree level though, 
before expand.

  Maciej

Andrew Waterman Feb. 20, 2023, 1:27 a.m. UTC | #11

On Sat, Feb 18, 2023 at 1:30 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Sat, 18 Feb 2023 13:06:02 PST (-0800), jeffreyalaw@gmail.com wrote:
> >
> >
> > On 2/18/23 11:26, Palmer Dabbelt wrote:
> >> On Fri, 17 Feb 2023 06:02:40 PST (-0800), gcc-patches@gcc.gnu.org wrote:
> >>> Hi all,
> >>> If we have division and remainder calculations with the same operands:
> >>>
> >>>   a = b / c;
> >>>   d = b % c;
> >>>
> >>> We can replace the calculation of remainder with multiplication +
> >>> subtraction, using the result from the previous division:
> >>>
> >>>   a = b / c;
> >>>   d = a * c;
> >>>   d = b - d;
> >>>
> >>> Which will be faster.
> >>
> >> Do you have any benchmarks that show that performance increase?  The ISA
> >> manual specifically says the suggested sequence is div+mod, and while
> >> those suggestions don't always pan out for real hardware it's likely
> >> that at least some implementations will end up with the ISA-suggested
> >> fusions.
> > It'll almost certainly be visible in mcf.  Been there, done that.  In
> > fact, that's why I asked the team Matevos works on to poke at this case
> > as I went through this issue on another processor.
> >
> > It can also be run through LLVM's MCA to estimate counts if you've got a
> > pipeline description.  THe div+rem will come out at around ~40c while a
> > div+mul+sub should weigh in around 25c for Veyron v1.
>
> Do you have a link to the patches somewhere?  I couldn't find them
> online, just the custom instruction support.  Or even just some docs
> describing what the pipeline does, as just basing one performance model
> on another is kind of a double-edged sword.
>
> That said, I think just knowing the processor doesn't do the div+mod
> fusion is sufficient to turn something like this on for the mtune for
> that processor.  That's different than turning it on globally, though --
> unless it turns out nobody is actually doing the fusion suggested in the
> ISA manual, which wouldn't be super surprising.
>
> Maybe some of the SiFive and T-Head folks can chime in on whether or not
> their processors perform the fusion in question -- and if so, do the
> instructions need to say back-to-back?

AFAIK, the sequence with the multiplication will normally be faster on
SiFive cores when both the quotient and the remainder are needed.

>  It doesn't look like we're
> really targeting the code sequences the ISA suggests as it stands, so
> maybe it's OK to just switch the default over too?
>
> It also brings up the question of mulh+mul fusions, which I don't think
> we've really looked at (though maybe they're a lot less important for
> rv64).

Richard Biener Feb. 20, 2023, 8:11 a.m. UTC | #12

On Sun, Feb 19, 2023 at 2:15 AM Maciej W. Rozycki <macro@orcam.me.uk> wrote:
>
> On Sat, 18 Feb 2023, Jeff Law wrote:
>
> > >   Barring the fusion case, which indeed asks for a dedicated `divmod'
> > > pattern (and then I suppose a post-reload splitter or a peephole so that
> > > where one of the two results produced has eventually turned out unused, we
> > > have means to discard the unneeded machine instruction), isn't the generic
> > > transformation something for the middle end to do based on RTX costs?
> > I originally though the same way you did Maciej.
> >
> > The problem is you don't see it as a divmod in expand_divmod unless you expose
> > a divmod optab.  See tree-ssa-mathopts.cc's divmod handling.
>
>  That's the kind of stuff I'd expect to happen at the tree level though,
> before expand.

The GIMPLE pass forming divmod could indeed choose to emit the
div + mul/sub sequence instead if an actual divmod pattern isn't available.
It could even generate some fake mul/sub/mod RTXen to cost the two
variants against each other but I seriously doubt any uarch that implements
division/modulo has a slower mul/sub.

Richard.

>
>   Maciej

Alexander Monakov Feb. 20, 2023, 1:32 p.m. UTC | #13

On Mon, 20 Feb 2023, Richard Biener via Gcc-patches wrote:

> On Sun, Feb 19, 2023 at 2:15 AM Maciej W. Rozycki <macro@orcam.me.uk> wrote:
> >
> > > The problem is you don't see it as a divmod in expand_divmod unless you expose
> > > a divmod optab.  See tree-ssa-mathopts.cc's divmod handling.
> >
> >  That's the kind of stuff I'd expect to happen at the tree level though,
> > before expand.
> 
> The GIMPLE pass forming divmod could indeed choose to emit the
> div + mul/sub sequence instead if an actual divmod pattern isn't available.
> It could even generate some fake mul/sub/mod RTXen to cost the two
> variants against each other but I seriously doubt any uarch that implements
> division/modulo has a slower mul/sub.

Making a correct decision requires knowing to which degree the divider is
pipelined, and costs won't properly reflect that. If the divider accepts
a new div/mod instruction every couple of cycles, it's faster to just issue
a div followed by a mod with the same operands.

Therefore I think in this case it's fair for GIMPLE level to just check if
the divmod pattern is available, and let the target do the fine tuning via
the divmod expander.

It would make sense for tree-ssa-mathopts to emit div + mul/sub when neither
'divmod' nor 'mod' patterns are available, because RTL expansion will do the
same, just later, and we'll rely on RTL CSE to clean up the redundant div.
But RISC-V has both 'div' and 'mod', so as I tried to explain in the first
paragraph we should let the target decide.

Alexander

Maciej W. Rozycki Feb. 28, 2023, 12:54 p.m. UTC | #14

On Mon, 20 Feb 2023, Alexander Monakov wrote:

> > >  That's the kind of stuff I'd expect to happen at the tree level though,
> > > before expand.
> > 
> > The GIMPLE pass forming divmod could indeed choose to emit the
> > div + mul/sub sequence instead if an actual divmod pattern isn't available.
> > It could even generate some fake mul/sub/mod RTXen to cost the two
> > variants against each other but I seriously doubt any uarch that implements
> > division/modulo has a slower mul/sub.
> 
> Making a correct decision requires knowing to which degree the divider is
> pipelined, and costs won't properly reflect that. If the divider accepts
> a new div/mod instruction every couple of cycles, it's faster to just issue
> a div followed by a mod with the same operands.

 I guess there exist microarchitectures that have their divider pipelined, 
but I haven't come across one.  I think division is not an operation that 
is commonly optimised for in hw RTL, whether integer or FP, not at least 
in embedded applications, and given the nature of the operation I believe 
it would be particularly costly in terms of silicon.  I've seen latencies 
and repeat rates of up to 50 quoted in CPU documentation even for 32-bit 
integer division while multiplication had a latency of 5 and a repeat rate 
of 1 (i.e. fully pipelined) for the same microarchitecture.

 So (taking the data dependency into account) the latency of a DIV + MOD 
operation would be 100 for said microarchitecture (and the same repeat 
rate), while a DIV + MUL/SUB sequence would have a latency of 56 and a 
repeat rate of 50.  Quite an improvement.

> Therefore I think in this case it's fair for GIMPLE level to just check if
> the divmod pattern is available, and let the target do the fine tuning via
> the divmod expander.

 Hmm, we have DFA scheduling available that is supposed to give latencies 
and repeat rates for individual operations.  Wouldn't it be possible to 
get hold of this information?

 If we cannot make use of this information at the GIMPLE level (which I'd 
consider regrettable), then I think that maybe we need a target hook to 
say division is cheap that would prevent a DIV + MOD to DIV + MUL/SUB 
transformation from happening, perhaps off by default.  I think it would 
be regrettable too if every backend for targets/subtargets that do not 
have a hardware DIVMOD operation (e.g. MIPS has one at certain ISA levels 
only) or a pipelined division would have to code the transformation by 
hand.

> It would make sense for tree-ssa-mathopts to emit div + mul/sub when neither
> 'divmod' nor 'mod' patterns are available, because RTL expansion will do the
> same, just later, and we'll rely on RTL CSE to clean up the redundant div.

 That as well.

> But RISC-V has both 'div' and 'mod', so as I tried to explain in the first
> paragraph we should let the target decide.

 Still I think it would be best if RISC-V ought to supply a divmod pattern 
only where fused DIV/MOD execution is present in the microarchitecture.

  Maciej

Jeff Law April 28, 2023, 8:09 p.m. UTC | #15

On 2/17/23 07:02, Matevos Mehrabyan via Gcc-patches wrote:
> Hi all,
> If we have division and remainder calculations with the same operands:
> 
>    a = b / c;
>    d = b % c;
> 
> We can replace the calculation of remainder with multiplication +
> subtraction, using the result from the previous division:
> 
>    a = b / c;
>    d = a * c;
>    d = b - d;
> 
> Which will be faster.
> Currently, it isn't done for RISC-V.
> 
> I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
> 
> Best regards,
> Matevos.
> 
> gcc/ChangeLog:
> 
>              * config/riscv/riscv.md: Added divmod expander.
> 
> gcc/testsuite/ChangeLog:
>              * gcc.target/riscv/divmod.c: New testcase.
So here's an update to the patch that I think addresses the key concerns.

Specifically use of the divmod expander is now conditional on the tuning 
info which allows for better control over using div+rem or div+mul+sub.

Given I don't know the right tunings for any implementation other than 
Veyron V1, I left them as-is.  So all the implementations will still use 
the div+rem sequence.  Obviously when we submit the Veyron V1 tunings, 
it will use div+mul+sub.  I expect other implementations will prefer 
div+mul+sub as well.

The testcase is split into two tests.  One to verify the old div+rem 
sequence, the other to test for div+mul+sub.  The latter test is 
disabled for now.  Once the first uarch has flipped tuning, we can add 
the proper tune parameter and enable the test.

Attached is the patch I committed.

Thanks and sorry for the long delay.


jeff
commit 065be0ffbcd676b635d492f4679e635b6ece4fe4
Author: Matevos Mehrabyan <matevosmehrabyan@gmail.com>
Date:   Fri Apr 28 14:01:30 2023 -0600

    RISC-V: Add divmod expansion support
    
    Hi all,
    If we have division and remainder calculations with the same operands:
    
      a = b / c;
      d = b % c;
    
    We can replace the calculation of remainder with multiplication +
    subtraction, using the result from the previous division:
    
      a = b / c;
      d = a * c;
      d = b - d;
    
    Which will be faster.
    Currently, it isn't done for RISC-V.
    
    I've added an expander for DIVMOD which replaces 'rem' with 'mul + sub'.
    
    Best regards,
    Matevos.
    
    gcc/ChangeLog:
    
            * config/riscv/iterators.md (only_div, paired_mod): New iterators.
            (u): Add div/udiv cases.
            * config/riscv/riscv-protos.h (riscv_use_divmod_expander): Prototype.
            * config/riscv/riscv.cc (struct riscv_tune_param): Add field for
            divmod expansion.
            (rocket_tune_info, sifive_7_tune_info): Initialize new field.
            (thead_c906_tune_info): Likewise.
            (optimize_size_tune_info): Likewise.
            (riscv_use_divmod_expander): New function.
            * config/riscv/riscv.md (<u>divmod<mode>4): New expander.
    
    gcc/testsuite/ChangeLog:
            * gcc.target/riscv/divmod-1.c: New testcase.
            * gcc.target/riscv/divmod-2.c: New testcase.

diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index 9b767038452..1d56324df03 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -152,6 +152,11 @@ (define_code_iterator any_div [div udiv mod umod])
 ;; from the same template.
 (define_code_iterator any_mod [mod umod])
 
+;; These code iterators allow unsigned and signed divmod to be generated
+;; from the same template.
+(define_code_iterator only_div [div udiv])
+(define_code_attr paired_mod [(div "mod") (udiv "umod")])
+
 ;; These code iterators allow the signed and unsigned scc operations to use
 ;; the same template.
 (define_code_iterator any_gt [gt gtu])
@@ -181,6 +186,7 @@ (define_code_attr u [(sign_extend "") (zero_extend "u")
 		     (lt "") (ltu "u")
 		     (le "") (leu "u")
 		     (fix "") (unsigned_fix "u")
+		     (div "") (udiv "u")
 		     (float "") (unsigned_float "u")])
 
 ;; <su> is like <u>, but the signed form expands to "s" rather than "".
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index f87661bde2c..5a927bdf1b0 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -239,4 +239,5 @@ extern const char*
 th_mempair_output_move (rtx[4], bool, machine_mode, RTX_CODE);
 #endif
 
+extern bool riscv_use_divmod_expander (void);
 #endif /* ! GCC_RISCV_PROTOS_H */
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 1529855a2b4..09a30dc260f 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -236,6 +236,7 @@ struct riscv_tune_param
   unsigned short memory_cost;
   unsigned short fmv_cost;
   bool slow_unaligned_access;
+  bool use_divmod_expansion;
 };
 
 /* Information about one micro-arch we know about.  */
@@ -323,6 +324,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   5,						/* memory_cost */
   8,						/* fmv_cost */
   true,						/* slow_unaligned_access */
+  false,					/* use_divmod_expansion */
 };
 
 /* Costs to use when optimizing for Sifive 7 Series.  */
@@ -337,6 +339,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   3,						/* memory_cost */
   8,						/* fmv_cost */
   true,						/* slow_unaligned_access */
+  false,					/* use_divmod_expansion */
 };
 
 /* Costs to use when optimizing for T-HEAD c906.  */
@@ -351,6 +354,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   5,            /* memory_cost */
   8,		/* fmv_cost */
   false,            /* slow_unaligned_access */
+  false		/* use_divmod_expansion */
 };
 
 /* Costs to use when optimizing for size.  */
@@ -365,6 +369,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
   2,						/* memory_cost */
   8,						/* fmv_cost */
   false,					/* slow_unaligned_access */
+  false,					/* use_divmod_expansion */
 };
 
 static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *);
@@ -7210,6 +7215,16 @@ riscv_lshift_subword (machine_mode mode, rtx value, rtx shift,
 						  gen_lowpart (QImode, shift)));
 }
 
+/* Return TRUE if we should use the divmod expander, FALSE otherwise.  This
+   allows the behavior to be tuned for specific implementations as well as
+   when optimizing for size.  */
+
+bool
+riscv_use_divmod_expander (void)
+{
+  return tune_param->use_divmod_expansion;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 376a8831820..c508ee3ad89 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1063,6 +1063,22 @@ (define_insn "<optab>di3"
   [(set_attr "type" "idiv")
    (set_attr "mode" "DI")])
 
+(define_expand "<u>divmod<mode>4"
+  [(parallel
+     [(set (match_operand:GPR 0 "register_operand")
+           (only_div:GPR (match_operand:GPR 1 "register_operand")
+                         (match_operand:GPR 2 "register_operand")))
+      (set (match_operand:GPR 3 "register_operand")
+           (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
+  "TARGET_DIV && riscv_use_divmod_expander ()"
+  {
+      rtx tmp = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1], operands[2]));
+      emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
+      emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
+      DONE;
+  })
+
 (define_insn "*<optab>si3_extended"
   [(set (match_operand:DI                 0 "register_operand" "=r")
 	(sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/riscv/divmod-1.c b/gcc/testsuite/gcc.target/riscv/divmod-1.c
new file mode 100644
index 00000000000..9706611b500
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/divmod-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+
+void
+foo(int a, int b, int *c, int *d)
+{
+   *c = a / b;
+   *d = a % b;
+}
+
+/* { dg-final { scan-assembler-times "\tdiv" 1 } } */
+/* { dg-final { scan-assembler-times "\trem" 1} } */
diff --git a/gcc/testsuite/gcc.target/riscv/divmod-2.c b/gcc/testsuite/gcc.target/riscv/divmod-2.c
new file mode 100644
index 00000000000..dfd319e52c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/divmod-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* Skip this everywhere for now.  Once we have a target with
+   divmod enabled, only skip for -O0, -O1, -Og, -Oz, -Os.  */
+/* { dg-skip-if "" { *-*-* } { } } */
+
+void
+foo(int a, int b, int *c, int *d)
+{
+   *c = a / b;
+   *d = a % b;
+}
+
+/* { dg-final { scan-assembler-not "\trem" } } */
+/* { dg-final { scan-assembler-times "\tdiv" 1 } } */
+/* { dg-final { scan-assembler-times "\tmul" 1 } } */
+/* { dg-final { scan-assembler-times "\tsub" 1 } } */

diff mbox series

Patch

diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index f95dd405e12..d941483d9f1 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -148,6 +148,11 @@ 
 ;; from the same template.
 (define_code_iterator any_mod [mod umod])

+;; These code iterators allow unsigned and signed divmod to be generated
+;; from the same template.
+(define_code_iterator only_div [div udiv])
+(define_code_attr paired_mod [(div "mod") (udiv "umod")])
+
 ;; These code iterators allow the signed and unsigned scc operations to use
 ;; the same template.
 (define_code_iterator any_gt [gt gtu])
@@ -175,7 +180,8 @@ 
      (gt "") (gtu "u")
      (ge "") (geu "u")
      (lt "") (ltu "u")
-     (le "") (leu "u")])
+     (le "") (leu "u")
+     (div "") (udiv "u")])

 ;; <su> is like <u>, but the signed form expands to "s" rather than "".
 (define_code_attr su [(sign_extend "s") (zero_extend "u")])
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index c8adc5af5d2..2d48ff3f8de 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1044,6 +1044,22 @@ 
   [(set_attr "type" "idiv")
    (set_attr "mode" "DI")])

+(define_expand "<u>divmod<mode>4"
+  [(parallel
+     [(set (match_operand:GPR 0 "register_operand")
+           (only_div:GPR (match_operand:GPR 1 "register_operand")
+                         (match_operand:GPR 2 "register_operand")))
+      (set (match_operand:GPR 3 "register_operand")
+           (<paired_mod>:GPR (match_dup 1) (match_dup 2)))])]
+  "TARGET_DIV"
+  {
+      rtx tmp = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_<u>div<GPR:mode>3 (operands[0], operands[1],
operands[2]));
+      emit_insn (gen_mul<GPR:mode>3 (tmp, operands[0], operands[2]));
+      emit_insn (gen_sub<GPR:mode>3 (operands[3], operands[1], tmp));
+      DONE;
+  })
+
 (define_insn "*<optab>si3_extended"
   [(set (match_operand:DI                 0 "register_operand" "=r")
  (sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/riscv/divmod.c
b/gcc/testsuite/gcc.target/riscv/divmod.c
new file mode 100644
index 00000000000..254b25e654d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/divmod.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */
+
+void
+foo(int a, int b, int *c, int *d)
+{
+   *c = a / b;
+   *d = a % b;
+}
+
+/* { dg-final { scan-assembler-not "rem" } } */
+/* { dg-final { scan-assembler-times "mul" 1 } } */
+/* { dg-final { scan-assembler-times "sub" 1 } } */