This patch implements scalar-to-vector (STV) support for DImode and SImode
rotations by constant bit counts. Scalar rotations are almost always
optimal on x86, requiring only one or two instructions, but it is also
possible to implement these efficiently with SSE2, requiring only one
or two instructions for SImode rotations and at most 3 instructions for
DImode rotations. This allows GCC to STV rotations with little or no
penalty if there are other (net) benefits to converting a chain. An
example of the benefits is shown below, which is based upon the BLAKE2
cryptographic hash function:
unsigned long long a,b,c,d;
unsigned long rot(unsigned long long x, int y)
{
return (x<<y) | (x>>(64-y));
}
void foo()
{
d = rot(d ^ a,32);
c = c + d;
b = rot(b ^ c,24);
a = a + b;
d = rot(d ^ a,16);
c = c + d;
b = rot(b ^ c,63);
}
where with -m32 -O2 -msse2
Before (59 insns, 247 bytes):
foo: pushl %edi
xorl %edx, %edx
pushl %esi
pushl %ebx
subl $16, %esp
movq a, %xmm1
movq d, %xmm0
movq b, %xmm2
pxor %xmm1, %xmm0
psrlq $32, %xmm0
movd %xmm0, %eax
movd %edx, %xmm0
movd %eax, %xmm3
punpckldq %xmm0, %xmm3
movq c, %xmm0
paddq %xmm3, %xmm0
pxor %xmm0, %xmm2
movd %xmm2, %ecx
psrlq $32, %xmm2
movd %xmm2, %ebx
movl %ecx, %eax
shldl $24, %ebx, %ecx
shldl $24, %eax, %ebx
movd %ebx, %xmm4
movd %ecx, %xmm2
punpckldq %xmm4, %xmm2
movdqa .LC0, %xmm4
pand %xmm4, %xmm2
paddq %xmm2, %xmm1
movq %xmm1, a
pxor %xmm3, %xmm1
movd %xmm1, %esi
psrlq $32, %xmm1
movd %xmm1, %edi
movl %esi, %eax
shldl $16, %edi, %esi
shldl $16, %eax, %edi
movd %esi, %xmm1
movd %edi, %xmm3
punpckldq %xmm3, %xmm1
pand %xmm4, %xmm1
movq %xmm1, d
paddq %xmm1, %xmm0
movq %xmm0, c
pxor %xmm2, %xmm0
movd %xmm0, 8(%esp)
psrlq $32, %xmm0
movl 8(%esp), %eax
movd %xmm0, 12(%esp)
movl 12(%esp), %edx
shrdl $1, %edx, %eax
xorl %edx, %edx
movl %eax, b
movl %edx, b+4
addl $16, %esp
popl %ebx
popl %esi
popl %edi
ret
After (32 insns, 165 bytes):
movq a, %xmm1
xorl %edx, %edx
movq d, %xmm0
movq b, %xmm2
movdqa .LC0, %xmm4
pxor %xmm1, %xmm0
psrlq $32, %xmm0
movd %xmm0, %eax
movd %edx, %xmm0
movd %eax, %xmm3
punpckldq %xmm0, %xmm3
movq c, %xmm0
paddq %xmm3, %xmm0
pxor %xmm0, %xmm2
pshufd $68, %xmm2, %xmm2
psrldq $5, %xmm2
pand %xmm4, %xmm2
paddq %xmm2, %xmm1
movq %xmm1, a
pxor %xmm3, %xmm1
pshuflw $147, %xmm1, %xmm1
pand %xmm4, %xmm1
movq %xmm1, d
paddq %xmm1, %xmm0
movq %xmm0, c
pxor %xmm2, %xmm0
pshufd $20, %xmm0, %xmm0
psrlq $1, %xmm0
pshufd $136, %xmm0, %xmm0
pand %xmm4, %xmm0
movq %xmm0, b
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline?
2023-06-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain): Provide
gains/costs for ROTATE and ROTATERT (by an integer constant).
(general_scalar_chain::convert_rotate): New helper function to
convert a DImode or SImode rotation by an integer constant into
SSE vector form.
(general_scalar_chain::convert_insn): Call the new convert_rotate
for ROTATE and ROTATERT.
(general_scalar_to_vector_candidate_p): Consider ROTATE and
ROTATERT to be candidates if the second operand is an integer
constant, valid for a rotation (or shift) in the given mode.
* config/i386/i386-features.h (general_scalar_chain): Add new
helper method convert_rotate.
gcc/testsuite/ChangeLog
* gcc.target/i386/rotate-6.c: New test case.
* gcc.target/i386/sse2-stv-1.c: Likewise.
Thanks in advance,
Roger
--
On Fri, Jun 30, 2023 at 9:29 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch implements scalar-to-vector (STV) support for DImode and SImode
> rotations by constant bit counts. Scalar rotations are almost always
> optimal on x86, requiring only one or two instructions, but it is also
> possible to implement these efficiently with SSE2, requiring only one
> or two instructions for SImode rotations and at most 3 instructions for
> DImode rotations. This allows GCC to STV rotations with a small or no
> penalty if there are other (net) benefits to converting a chain. An
> example of the benefits is shown below, which is based upon the BLAKE2
> cryptographic hash function:
>
> unsigned long long a,b,c,d;
>
> unsigned long rot(unsigned long long x, int y)
> {
> return (x<<y) | (x>>(64-y));
> }
>
> void foo()
> {
> d = rot(d ^ a,32);
> c = c + d;
> b = rot(b ^ c,24);
> a = a + b;
> d = rot(d ^ a,16);
> c = c + d;
> b = rot(b ^ c,63);
> }
>
> where with -m32 -O2 -msse2
>
> Before (59 insns, 247 bytes):
> foo: pushl %edi
> xorl %edx, %edx
> pushl %esi
> pushl %ebx
> subl $16, %esp
> movq a, %xmm1
> movq d, %xmm0
> movq b, %xmm2
> pxor %xmm1, %xmm0
> psrlq $32, %xmm0
> movd %xmm0, %eax
> movd %edx, %xmm0
> movd %eax, %xmm3
> punpckldq %xmm0, %xmm3
> movq c, %xmm0
> paddq %xmm3, %xmm0
> pxor %xmm0, %xmm2
> movd %xmm2, %ecx
> psrlq $32, %xmm2
> movd %xmm2, %ebx
> movl %ecx, %eax
> shldl $24, %ebx, %ecx
> shldl $24, %eax, %ebx
> movd %ebx, %xmm4
> movd %ecx, %xmm2
> punpckldq %xmm4, %xmm2
> movdqa .LC0, %xmm4
> pand %xmm4, %xmm2
> paddq %xmm2, %xmm1
> movq %xmm1, a
> pxor %xmm3, %xmm1
> movd %xmm1, %esi
> psrlq $32, %xmm1
> movd %xmm1, %edi
> movl %esi, %eax
> shldl $16, %edi, %esi
> shldl $16, %eax, %edi
> movd %esi, %xmm1
> movd %edi, %xmm3
> punpckldq %xmm3, %xmm1
> pand %xmm4, %xmm1
> movq %xmm1, d
> paddq %xmm1, %xmm0
> movq %xmm0, c
> pxor %xmm2, %xmm0
> movd %xmm0, 8(%esp)
> psrlq $32, %xmm0
> movl 8(%esp), %eax
> movd %xmm0, 12(%esp)
> movl 12(%esp), %edx
> shrdl $1, %edx, %eax
> xorl %edx, %edx
> movl %eax, b
> movl %edx, b+4
> addl $16, %esp
> popl %ebx
> popl %esi
> popl %edi
> ret
>
> After (32 insns, 165 bytes):
> movq a, %xmm1
> xorl %edx, %edx
> movq d, %xmm0
> movq b, %xmm2
> movdqa .LC0, %xmm4
> pxor %xmm1, %xmm0
> psrlq $32, %xmm0
> movd %xmm0, %eax
> movd %edx, %xmm0
> movd %eax, %xmm3
> punpckldq %xmm0, %xmm3
> movq c, %xmm0
> paddq %xmm3, %xmm0
> pxor %xmm0, %xmm2
> pshufd $68, %xmm2, %xmm2
> psrldq $5, %xmm2
> pand %xmm4, %xmm2
> paddq %xmm2, %xmm1
> movq %xmm1, a
> pxor %xmm3, %xmm1
> pshuflw $147, %xmm1, %xmm1
> pand %xmm4, %xmm1
> movq %xmm1, d
> paddq %xmm1, %xmm0
> movq %xmm0, c
> pxor %xmm2, %xmm0
> pshufd $20, %xmm0, %xmm0
> psrlq $1, %xmm0
> pshufd $136, %xmm0, %xmm0
> pand %xmm4, %xmm0
> movq %xmm0, b
> ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures. Ok for mainline?
>
>
> 2023-06-30 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/i386-features.cc (compute_convert_gain): Provide
> gains/costs for ROTATE and ROTATERT (by an integer constant).
> (general_scalar_chain::convert_rotate): New helper function to
> convert a DImode or SImode rotation by an integer constant into
> SSE vector form.
> (general_scalar_chain::convert_insn): Call the new convert_rotate
> for ROTATE and ROTATERT.
> (general_scalar_to_vector_candidate_p): Consider ROTATE and
> ROTATERT to be candidates if the second operand is an integer
> constant, valid for a rotation (or shift) in the given mode.
> * config/i386/i386-features.h (general_scalar_chain): Add new
> helper method convert_rotate.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/rotate-6.c: New test case.
> * gcc.target/i386/sse2-stv-1.c: Likewise.
LGTM.
Please note that AVX512VL provides VPROLD/VPROLQ and VPRORD/VPRORQ
native rotate instructions that can come in handy here.
Thanks,
Uros.
>
>
> Thanks in advance,
> Roger
> --
>
@@ -582,6 +582,25 @@ general_scalar_chain::compute_convert_gain ()
igain -= vector_const_cost (XEXP (src, 0));
break;
+ case ROTATE:
+ case ROTATERT:
+ igain += m * ix86_cost->shift_const;
+ if (smode == DImode)
+ {
+ int bits = INTVAL (XEXP (src, 1));
+ if ((bits & 0x0f) == 0)
+ igain -= ix86_cost->sse_op;
+ else if ((bits & 0x07) == 0)
+ igain -= 2 * ix86_cost->sse_op;
+ else
+ igain -= 3 * ix86_cost->sse_op;
+ }
+ else if (INTVAL (XEXP (src, 1)) == 16)
+ igain -= ix86_cost->sse_op;
+ else
+ igain -= 2 * ix86_cost->sse_op;
+ break;
+
case AND:
case IOR:
case XOR:
@@ -1154,6 +1173,95 @@ scalar_chain::convert_insn_common (rtx_insn *insn)
}
}
+/* Convert INSN which is an SImode or DImode rotation by a constant
+ to vector mode. CODE is either ROTATE or ROTATERT with operands
+ OP0 and OP1. Returns the SET_SRC of the last instruction in the
+ resulting sequence, which is emitted before INSN. */
+
+rtx
+general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
+ rtx_insn *insn)
+{
+ int bits = INTVAL (op1);
+ rtx pat, result;
+
+ convert_op (&op0, insn);
+ if (bits == 0)
+ return op0;
+
+ if (smode == DImode)
+ {
+ if (code == ROTATE)
+ bits = 64 - bits;
+ if (bits == 32)
+ {
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
+ GEN_INT (225));
+ emit_insn_before (pat, insn);
+ result = gen_lowpart (V2DImode, tmp1);
+ }
+ else if (bits == 16 || bits == 48)
+ {
+ rtx tmp1 = gen_reg_rtx (V8HImode);
+ pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
+ GEN_INT (bits == 16 ? 57 : 147));
+ emit_insn_before (pat, insn);
+ result = gen_lowpart (V2DImode, tmp1);
+ }
+ else if ((bits & 0x07) == 0)
+ {
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
+ GEN_INT (68));
+ emit_insn_before (pat, insn);
+ rtx tmp2 = gen_reg_rtx (V1TImode);
+ pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
+ GEN_INT (bits));
+ emit_insn_before (pat, insn);
+ result = gen_lowpart (V2DImode, tmp2);
+ }
+ else
+ {
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
+ GEN_INT (20));
+ emit_insn_before (pat, insn);
+ rtx tmp2 = gen_reg_rtx (V2DImode);
+ pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
+ GEN_INT (bits & 31));
+ emit_insn_before (pat, insn);
+ rtx tmp3 = gen_reg_rtx (V4SImode);
+ pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
+ GEN_INT (bits > 32 ? 34 : 136));
+ emit_insn_before (pat, insn);
+ result = gen_lowpart (V2DImode, tmp3);
+ }
+ }
+ else if (bits == 16)
+ {
+ rtx tmp1 = gen_reg_rtx (V8HImode);
+ pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
+ emit_insn_before (pat, insn);
+ result = gen_lowpart (V4SImode, tmp1);
+ }
+ else
+ {
+ if (code == ROTATE)
+ bits = 32 - bits;
+
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
+ rtx tmp2 = gen_reg_rtx (V2DImode);
+ pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
+ GEN_INT (bits));
+ emit_insn_before (pat, insn);
+ result = gen_lowpart (V4SImode, tmp2);
+ }
+
+ return result;
+}
+
/* Convert INSN to vector mode. */
void
@@ -1209,6 +1317,12 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
PUT_MODE (src, vmode);
break;
+ case ROTATE:
+ case ROTATERT:
+ src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
+ insn);
+ break;
+
case NEG:
src = XEXP (src, 0);
@@ -1982,6 +2096,8 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
case ASHIFT:
case LSHIFTRT:
+ case ROTATE:
+ case ROTATERT:
if (!CONST_INT_P (XEXP (src, 1))
|| !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
return false;
@@ -189,6 +189,7 @@ class general_scalar_chain : public scalar_chain
void convert_insn (rtx_insn *insn) final override;
void convert_op (rtx *op, rtx_insn *insn) final override;
int vector_const_cost (rtx exp);
+ rtx convert_rotate (enum rtx_code, rtx op0, rtx op1, rtx_insn *insn);
};
class timode_scalar_chain : public scalar_chain
new file mode 100644
@@ -0,0 +1,195 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+/* scalar 64-bit DImode rotations. */
+unsigned long long rot1(unsigned long long x) { return (x>>1) | (x<<63); }
+unsigned long long rot2(unsigned long long x) { return (x>>2) | (x<<62); }
+unsigned long long rot3(unsigned long long x) { return (x>>3) | (x<<61); }
+unsigned long long rot4(unsigned long long x) { return (x>>4) | (x<<60); }
+unsigned long long rot5(unsigned long long x) { return (x>>5) | (x<<59); }
+unsigned long long rot6(unsigned long long x) { return (x>>6) | (x<<58); }
+unsigned long long rot7(unsigned long long x) { return (x>>7) | (x<<57); }
+unsigned long long rot8(unsigned long long x) { return (x>>8) | (x<<56); }
+unsigned long long rot9(unsigned long long x) { return (x>>9) | (x<<55); }
+unsigned long long rot10(unsigned long long x) { return (x>>10) | (x<<54); }
+unsigned long long rot15(unsigned long long x) { return (x>>15) | (x<<49); }
+unsigned long long rot16(unsigned long long x) { return (x>>16) | (x<<48); }
+unsigned long long rot17(unsigned long long x) { return (x>>17) | (x<<47); }
+unsigned long long rot20(unsigned long long x) { return (x>>20) | (x<<44); }
+unsigned long long rot24(unsigned long long x) { return (x>>24) | (x<<40); }
+unsigned long long rot30(unsigned long long x) { return (x>>30) | (x<<34); }
+unsigned long long rot31(unsigned long long x) { return (x>>31) | (x<<33); }
+unsigned long long rot32(unsigned long long x) { return (x>>32) | (x<<32); }
+unsigned long long rot33(unsigned long long x) { return (x>>33) | (x<<31); }
+unsigned long long rot34(unsigned long long x) { return (x>>34) | (x<<30); }
+unsigned long long rot40(unsigned long long x) { return (x>>40) | (x<<24); }
+unsigned long long rot42(unsigned long long x) { return (x>>42) | (x<<22); }
+unsigned long long rot48(unsigned long long x) { return (x>>48) | (x<<16); }
+unsigned long long rot50(unsigned long long x) { return (x>>50) | (x<<14); }
+unsigned long long rot56(unsigned long long x) { return (x>>56) | (x<<8); }
+unsigned long long rot58(unsigned long long x) { return (x>>58) | (x<<6); }
+unsigned long long rot60(unsigned long long x) { return (x>>60) | (x<<4); }
+unsigned long long rot61(unsigned long long x) { return (x>>61) | (x<<3); }
+unsigned long long rot62(unsigned long long x) { return (x>>62) | (x<<2); }
+unsigned long long rot63(unsigned long long x) { return (x>>63) | (x<<1); }
+
+/* DImode mem-to-mem rotations. These STV with -m32. */
+void mem1(unsigned long long *p) { *p = rot1(*p); }
+void mem2(unsigned long long *p) { *p = rot2(*p); }
+void mem3(unsigned long long *p) { *p = rot3(*p); }
+void mem4(unsigned long long *p) { *p = rot4(*p); }
+void mem5(unsigned long long *p) { *p = rot5(*p); }
+void mem6(unsigned long long *p) { *p = rot6(*p); }
+void mem7(unsigned long long *p) { *p = rot7(*p); }
+void mem8(unsigned long long *p) { *p = rot8(*p); }
+void mem9(unsigned long long *p) { *p = rot9(*p); }
+void mem10(unsigned long long *p) { *p = rot10(*p); }
+void mem15(unsigned long long *p) { *p = rot15(*p); }
+void mem16(unsigned long long *p) { *p = rot16(*p); }
+void mem17(unsigned long long *p) { *p = rot17(*p); }
+void mem20(unsigned long long *p) { *p = rot20(*p); }
+void mem24(unsigned long long *p) { *p = rot24(*p); }
+void mem30(unsigned long long *p) { *p = rot30(*p); }
+void mem31(unsigned long long *p) { *p = rot31(*p); }
+void mem32(unsigned long long *p) { *p = rot32(*p); }
+void mem33(unsigned long long *p) { *p = rot33(*p); }
+void mem34(unsigned long long *p) { *p = rot34(*p); }
+void mem40(unsigned long long *p) { *p = rot40(*p); }
+void mem42(unsigned long long *p) { *p = rot42(*p); }
+void mem48(unsigned long long *p) { *p = rot48(*p); }
+void mem50(unsigned long long *p) { *p = rot50(*p); }
+void mem56(unsigned long long *p) { *p = rot56(*p); }
+void mem58(unsigned long long *p) { *p = rot58(*p); }
+void mem60(unsigned long long *p) { *p = rot60(*p); }
+void mem61(unsigned long long *p) { *p = rot61(*p); }
+void mem62(unsigned long long *p) { *p = rot62(*p); }
+void mem63(unsigned long long *p) { *p = rot63(*p); }
+
+/* Check that rotN and memN give the same result. */
+typedef unsigned long long (*rotN)(unsigned long long);
+typedef void (*memN)(unsigned long long*);
+
+void eval(rotN s, memN v, unsigned long long x)
+{
+ unsigned long long r = s(x);
+ unsigned long long t = x;
+ v(&t);
+
+ if (t != r)
+ __builtin_abort ();
+}
+
+void test(rotN s, memN v)
+{
+ eval(s,v,0x0000000000000000ll);
+ eval(s,v,0x0000000000000001ll);
+ eval(s,v,0x0000000000000002ll);
+ eval(s,v,0x0000000000000004ll);
+ eval(s,v,0x0000000000000008ll);
+ eval(s,v,0x0000000000000010ll);
+ eval(s,v,0x0000000000000020ll);
+ eval(s,v,0x0000000000000040ll);
+ eval(s,v,0x0000000000000080ll);
+ eval(s,v,0x0000000000000100ll);
+ eval(s,v,0x0000000000000200ll);
+ eval(s,v,0x0000000000000400ll);
+ eval(s,v,0x0000000000000800ll);
+ eval(s,v,0x0000000000001000ll);
+ eval(s,v,0x0000000000002000ll);
+ eval(s,v,0x0000000000004000ll);
+ eval(s,v,0x0000000000008000ll);
+ eval(s,v,0x0000000000010000ll);
+ eval(s,v,0x0000000000020000ll);
+ eval(s,v,0x0000000000040000ll);
+ eval(s,v,0x0000000000080000ll);
+ eval(s,v,0x0000000000100000ll);
+ eval(s,v,0x0000000000200000ll);
+ eval(s,v,0x0000000000400000ll);
+ eval(s,v,0x0000000000800000ll);
+ eval(s,v,0x0000000001000000ll);
+ eval(s,v,0x0000000002000000ll);
+ eval(s,v,0x0000000004000000ll);
+ eval(s,v,0x0000000008000000ll);
+ eval(s,v,0x0000000010000000ll);
+ eval(s,v,0x0000000020000000ll);
+ eval(s,v,0x0000000040000000ll);
+ eval(s,v,0x0000000080000000ll);
+ eval(s,v,0x0000000100000000ll);
+ eval(s,v,0x0000000200000000ll);
+ eval(s,v,0x0000000400000000ll);
+ eval(s,v,0x0000000800000000ll);
+ eval(s,v,0x0000001000000000ll);
+ eval(s,v,0x0000002000000000ll);
+ eval(s,v,0x0000004000000000ll);
+ eval(s,v,0x0000008000000000ll);
+ eval(s,v,0x0000010000000000ll);
+ eval(s,v,0x0000020000000000ll);
+ eval(s,v,0x0000040000000000ll);
+ eval(s,v,0x0000080000000000ll);
+ eval(s,v,0x0000100000000000ll);
+ eval(s,v,0x0000200000000000ll);
+ eval(s,v,0x0000400000000000ll);
+ eval(s,v,0x0000800000000000ll);
+ eval(s,v,0x0001000000000000ll);
+ eval(s,v,0x0002000000000000ll);
+ eval(s,v,0x0004000000000000ll);
+ eval(s,v,0x0008000000000000ll);
+ eval(s,v,0x0010000000000000ll);
+ eval(s,v,0x0020000000000000ll);
+ eval(s,v,0x0040000000000000ll);
+ eval(s,v,0x0080000000000000ll);
+ eval(s,v,0x0100000000000000ll);
+ eval(s,v,0x0200000000000000ll);
+ eval(s,v,0x0400000000000000ll);
+ eval(s,v,0x0800000000000000ll);
+ eval(s,v,0x1000000000000000ll);
+ eval(s,v,0x2000000000000000ll);
+ eval(s,v,0x4000000000000000ll);
+ eval(s,v,0x8000000000000000ll);
+ eval(s,v,0x0123456789abcdefll);
+ eval(s,v,0x1111111111111111ll);
+ eval(s,v,0x5555555555555555ll);
+ eval(s,v,0x8888888888888888ll);
+ eval(s,v,0xaaaaaaaaaaaaaaaall);
+ eval(s,v,0xcafebabecafebabell);
+ eval(s,v,0xdeadbeefdeadbeefll);
+ eval(s,v,0xfedcba9876543210ll);
+ eval(s,v,0xffffffffffffffffll);
+}
+
+int main()
+{
+ test(rot1,mem1);
+ test(rot2,mem2);
+ test(rot3,mem3);
+ test(rot4,mem4);
+ test(rot5,mem5);
+ test(rot6,mem6);
+ test(rot7,mem7);
+ test(rot8,mem8);
+ test(rot9,mem9);
+ test(rot10,mem10);
+ test(rot15,mem15);
+ test(rot16,mem16);
+ test(rot17,mem17);
+ test(rot20,mem20);
+ test(rot24,mem24);
+ test(rot30,mem30);
+ test(rot31,mem31);
+ test(rot32,mem32);
+ test(rot33,mem33);
+ test(rot34,mem34);
+ test(rot40,mem40);
+ test(rot42,mem42);
+ test(rot48,mem48);
+ test(rot50,mem50);
+ test(rot56,mem56);
+ test(rot58,mem58);
+ test(rot60,mem60);
+ test(rot61,mem61);
+ test(rot62,mem62);
+ test(rot63,mem63);
+ return 0;
+}
+
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2" } */
+
+unsigned long long a,b,c,d;
+
+static unsigned long rot(unsigned long long x, int y)
+{
+ /* Only called with y in 1..63. */
+ return (x<<y) | (x>>(64-y));
+}
+
+void foo()
+{
+ d = rot(d ^ a,32);
+ c = c + d;
+ b = rot(b ^ c,24);
+ a = a + b;
+ d = rot(d ^ a,16);
+ c = c + d;
+ b = rot(b ^ c,63);
+}
+
+/* { dg-final { scan-assembler-not "shldl" } } */
+/* { dg-final { scan-assembler-not "%\[er\]sp" } } */