Hello
This patch adds support for some additional floating-point operations,
in scalar and vector modes, which are natively supported by the AMD GCN
instruction set, but haven't been implemented in GCC yet. With the
exception of frexp, these implement standard RTL pattern names, so GCC
should use them automatically.
The instructions for the transcendental functions are documented to have
limited numerical precision, so for now they are only used when
-funsafe-math-optimizations is enabled.
For some reason the sin and cos instructions take their input in units
of whole revolutions rather than radians (i.e. an input of 1.0 means
2*PI radians, or 360 degrees), so their inputs need to be scaled by
1/(2*PI) first. I've implemented this as an expander to two
instructions - one to do the pre-scaling, one to do the sin/cos.
1/(2*PI) is a builtin constant for GCN, but the syntax GCC emitted for
it ("1/pi") was wrong for the LLVM assembler - now fixed.
I have also added some extra GCN-specific builtins to access the vector
versions of some of these operations (to implement vectorized versions
of library math routines) and to access the frexp operations.
Okay for trunk?
Thanks
Kwok
From 5592c4512212ba74a7a690821650ddcba05df848 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung <kcy@codesourcery.com>
Date: Thu, 8 Sep 2022 17:37:26 +0000
Subject: [PATCH] amdgcn: Add support for additional natively supported
floating-point operations
This adds support for the following natively supported floating-point
operations, in scalar and vectorized modes:
floor, ceil, exp2*, log2*, sin*, cos*, ldexp, frexp
* These operations are single-precision float only and are only active
if -funsafe-math-optimizations is enabled (due to potential numerical
precision issues).
2022-09-08 Kwok Cheung Yeung <kcy@codesourcery.com>
gcc/
* config/gcn/gcn-builtins.def (FABSVF, LDEXPVF, LDEXPV, FREXPVF_EXP,
FREXPVF_MANT, FREXPV_EXP, FREXPV_MANT): Add new builtins.
* config/gcn/gcn-protos.h (gcn_dconst1over2pi): New prototype.
* config/gcn/gcn-valu.md (MATH_UNOP_1OR2REG, MATH_UNOP_1REG,
MATH_UNOP_TRIG): New iterators.
(math_unop): New attributes.
(<math_unop><mode>2, <math_unop><mode>2<exec>,
<math_unop><mode>2, <math_unop><mode>2<exec>,
*<math_unop><mode>2_insn, *<math_unop><mode>2<exec>_insn,
ldexp<mode>3, ldexp<mode>3<exec>,
frexp<mode>_exp2, frexp<mode>_mant2,
frexp<mode>_exp2<exec>, frexp<mode>_mant2<exec>): New instructions.
(<math_unop><mode>2, <math_unop><mode>2<exec>): New expanders.
* config/gcn/gcn.cc (init_ext_gcn_constants): Update definition of
dconst1over2pi.
(gcn_dconst1over2pi): New.
(gcn_builtin_type_index): Add entry for v64df type.
(v64df_type_node): New.
(gcn_init_builtin_types): Initialize v64df_type_node.
(gcn_expand_builtin_1): Expand new builtins to instructions.
(print_operand): Fix assembler output for 1/(2*PI) constant.
* config/gcn/gcn.md (unspec): Add new entries.
---
gcc/config/gcn/gcn-builtins.def | 35 ++++++
gcc/config/gcn/gcn-protos.h | 1 +
gcc/config/gcn/gcn-valu.md | 181 ++++++++++++++++++++++++++++++++
gcc/config/gcn/gcn.cc | 114 +++++++++++++++++++-
gcc/config/gcn/gcn.md | 4 +-
5 files changed, 332 insertions(+), 3 deletions(-)
@@ -59,6 +59,41 @@ DEF_BUILTIN (SQRTF, 3 /*CODE_FOR_sqrtf */,
_A2 (GCN_BTI_SF, GCN_BTI_SF),
gcn_expand_builtin_1)
+/* Builtins exposing the 64-lane vector forms of fabs, ldexp and frexp.
+ The "VF" names operate on V64SF (float) vectors, the plain "V" names on
+ V64DF (double) vectors. frexp is split into separate exponent
+ ("_EXP", V64SI result) and mantissa ("_MANT") builtins. All of them
+ are expanded by gcn_expand_builtin_1. */
+DEF_BUILTIN (FABSVF, 3 /*CODE_FOR_fabsvf */,
+ "fabsvf", B_INSN,
+ _A2 (GCN_BTI_V64SF, GCN_BTI_V64SF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (LDEXPVF, 3 /*CODE_FOR_ldexpvf */,
+ "ldexpvf", B_INSN,
+ _A3 (GCN_BTI_V64SF, GCN_BTI_V64SF, GCN_BTI_V64SI),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (LDEXPV, 3 /*CODE_FOR_ldexpv */,
+ "ldexpv", B_INSN,
+ _A3 (GCN_BTI_V64DF, GCN_BTI_V64DF, GCN_BTI_V64SI),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FREXPVF_EXP, 3 /*CODE_FOR_frexpvf_exp */,
+ "frexpvf_exp", B_INSN,
+ _A2 (GCN_BTI_V64SI, GCN_BTI_V64SF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FREXPVF_MANT, 3 /*CODE_FOR_frexpvf_mant */,
+ "frexpvf_mant", B_INSN,
+ _A2 (GCN_BTI_V64SF, GCN_BTI_V64SF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FREXPV_EXP, 3 /*CODE_FOR_frexpv_exp */,
+ "frexpv_exp", B_INSN,
+ _A2 (GCN_BTI_V64SI, GCN_BTI_V64DF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FREXPV_MANT, 3 /*CODE_FOR_frexpv_mant */,
+ "frexpv_mant", B_INSN,
+ _A2 (GCN_BTI_V64DF, GCN_BTI_V64DF),
+ gcn_expand_builtin_1)
+
DEF_BUILTIN (CMP_SWAP, -1,
"cmp_swap", B_INSN,
_A4 (GCN_BTI_UINT, GCN_BTI_VOIDPTR, GCN_BTI_UINT, GCN_BTI_UINT),
@@ -54,6 +54,7 @@ extern int gcn_hard_regno_nregs (int regno, machine_mode mode);
extern void gcn_hsa_declare_function_name (FILE *file, const char *name,
tree decl);
extern HOST_WIDE_INT gcn_initial_elimination_offset (int, int);
+extern REAL_VALUE_TYPE gcn_dconst1over2pi (void);
extern bool gcn_inline_constant64_p (rtx, bool);
extern bool gcn_inline_constant_p (rtx);
extern int gcn_inline_fp_constant_p (rtx, bool);
@@ -2290,6 +2290,187 @@
[(set_attr "type" "vop1")
(set_attr "length" "8")])
+; These FP unops have f64, f32 and f16 versions.
+(define_int_iterator MATH_UNOP_1OR2REG
+ [UNSPEC_FLOOR UNSPEC_CEIL])
+
+; These FP unops only have f16/f32 versions.
+(define_int_iterator MATH_UNOP_1REG
+ [UNSPEC_EXP2 UNSPEC_LOG2])
+
+; The trigonometric unops. The patterns using this iterator are
+; restricted to the FP_1REG modes and take care of scaling the input
+; by 1/(2*PI).
+(define_int_iterator MATH_UNOP_TRIG
+ [UNSPEC_SIN UNSPEC_COS])
+
+; Maps each unspec above to the operation name used both in the pattern
+; names ("<math_unop><mode>2") and in the "v_<math_unop>" mnemonics.
+(define_int_attr math_unop
+ [(UNSPEC_FLOOR "floor")
+ (UNSPEC_CEIL "ceil")
+ (UNSPEC_EXP2 "exp2")
+ (UNSPEC_LOG2 "log2")
+ (UNSPEC_SIN "sin")
+ (UNSPEC_COS "cos")])
+
+; floor and ceil, in scalar (FP) and vector (V_FP, <exec>) forms.
+; These patterns have an empty condition, so unlike the exp2/log2/sin/cos
+; patterns they do not depend on flag_unsafe_math_optimizations.
+(define_insn "<math_unop><mode>2"
+ [(set (match_operand:FP 0 "register_operand" "= v")
+ (unspec:FP
+ [(match_operand:FP 1 "gcn_alu_operand" "vSvB")]
+ MATH_UNOP_1OR2REG))]
+ ""
+ "v_<math_unop>%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "<math_unop><mode>2<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v")
+ (unspec:V_FP
+ [(match_operand:V_FP 1 "gcn_alu_operand" "vSvB")]
+ MATH_UNOP_1OR2REG))]
+ ""
+ "v_<math_unop>%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+; exp2 and log2, in scalar (FP_1REG) and vector (V_FP_1REG, <exec>)
+; forms. They are only enabled with flag_unsafe_math_optimizations
+; because the hardware instructions are documented to have limited
+; numerical precision.
+(define_insn "<math_unop><mode>2"
+ [(set (match_operand:FP_1REG 0 "register_operand" "= v")
+ (unspec:FP_1REG
+ [(match_operand:FP_1REG 1 "gcn_alu_operand" "vSvB")]
+ MATH_UNOP_1REG))]
+ "flag_unsafe_math_optimizations"
+ "v_<math_unop>%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "<math_unop><mode>2<exec>"
+ [(set (match_operand:V_FP_1REG 0 "register_operand" "= v")
+ (unspec:V_FP_1REG
+ [(match_operand:V_FP_1REG 1 "gcn_alu_operand" "vSvB")]
+ MATH_UNOP_1REG))]
+ "flag_unsafe_math_optimizations"
+ "v_<math_unop>%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+; The hardware sin and cos instructions themselves. Their names start
+; with '*', leaving the "<math_unop><mode>2" names to the expanders
+; that follow; operand 1 here is expected to have been scaled by
+; 1/(2*PI) already.
+(define_insn "*<math_unop><mode>2_insn"
+ [(set (match_operand:FP_1REG 0 "register_operand" "= v")
+ (unspec:FP_1REG
+ [(match_operand:FP_1REG 1 "gcn_alu_operand" "vSvB")]
+ MATH_UNOP_TRIG))]
+ "flag_unsafe_math_optimizations"
+ "v_<math_unop>%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "*<math_unop><mode>2<exec>_insn"
+ [(set (match_operand:V_FP_1REG 0 "register_operand" "= v")
+ (unspec:V_FP_1REG
+ [(match_operand:V_FP_1REG 1 "gcn_alu_operand" "vSvB")]
+ MATH_UNOP_TRIG))]
+ "flag_unsafe_math_optimizations"
+ "v_<math_unop>%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+; Trigonometric functions need their input scaled by 1/(2*PI) first.
+; Each expander therefore emits two sets: a multiply of operand 1 by
+; the 1/(2*PI) constant (operand 3) into a new pseudo (operand 2), then
+; the sin/cos unspec applied to that pseudo.
+
+(define_expand "<math_unop><mode>2"
+ [(set (match_dup 2)
+ (mult:FP_1REG
+ (match_dup 3)
+ (match_operand:FP_1REG 1 "gcn_alu_operand")))
+ (set (match_operand:FP_1REG 0 "register_operand")
+ (unspec:FP_1REG
+ [(match_dup 2)]
+ MATH_UNOP_TRIG))]
+ "flag_unsafe_math_optimizations"
+ {
+ /* Temporary that receives the scaled input. */
+ operands[2] = gen_reg_rtx (<MODE>mode);
+ /* The 1/(2*PI) constant in the operand's mode. */
+ operands[3] = const_double_from_real_value (gcn_dconst1over2pi (),
+ <MODE>mode);
+ })
+
+(define_expand "<math_unop><mode>2<exec>"
+ [(set (match_dup 2)
+ (mult:V_FP_1REG
+ (match_dup 3)
+ (match_operand:V_FP_1REG 1 "gcn_alu_operand")))
+ (set (match_operand:V_FP_1REG 0 "register_operand")
+ (unspec:V_FP_1REG
+ [(match_dup 2)]
+ MATH_UNOP_TRIG))]
+ "flag_unsafe_math_optimizations"
+ {
+ /* Temporary that receives the scaled input. */
+ operands[2] = gen_reg_rtx (<MODE>mode);
+ /* Vector constant built from the scalar 1/(2*PI) value; presumably
+ gcn_vec_constant replicates it across all lanes -- TODO confirm. */
+ operands[3] =
+ gcn_vec_constant (<MODE>mode,
+ const_double_from_real_value (gcn_dconst1over2pi (),
+ <SCALAR_MODE>mode));
+ })
+
+; Implement ldexp pattern
+; Operand 1 is the FP value and operand 2 the integer exponent: SImode
+; for the scalar pattern, V64SImode for the vector (<exec>) pattern.
+; Both patterns have an empty condition, i.e. they are always enabled.
+
+(define_insn "ldexp<mode>3"
+ [(set (match_operand:FP 0 "register_operand" "=v")
+ (unspec:FP
+ [(match_operand:FP 1 "gcn_alu_operand" "vB")
+ (match_operand:SI 2 "gcn_alu_operand" "vSvA")]
+ UNSPEC_LDEXP))]
+ ""
+ "v_ldexp%i0\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "ldexp<mode>3<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "=v")
+ (unspec:V_FP
+ [(match_operand:V_FP 1 "gcn_alu_operand" "vB")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSvA")]
+ UNSPEC_LDEXP))]
+ ""
+ "v_ldexp%i0\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+; Implement frexp patterns
+; frexp is provided as two separate operations: the "_exp" patterns
+; produce the integer exponent (SImode, or V64SImode for the vector
+; forms) and the "_mant" patterns produce the mantissa in the same FP
+; mode as the input.
+; NOTE(review): the mnemonic suffix is written with %i1, presumably so
+; that it is derived from the FP input operand rather than from the
+; integer result -- confirm against print_operand.
+
+(define_insn "frexp<mode>_exp2"
+ [(set (match_operand:SI 0 "register_operand" "=v")
+ (unspec:SI
+ [(match_operand:FP 1 "gcn_alu_operand" "vB")]
+ UNSPEC_FREXP_EXP))]
+ ""
+ "v_frexp_exp_i32%i1\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "frexp<mode>_mant2"
+ [(set (match_operand:FP 0 "register_operand" "=v")
+ (unspec:FP
+ [(match_operand:FP 1 "gcn_alu_operand" "vB")]
+ UNSPEC_FREXP_MANT))]
+ ""
+ "v_frexp_mant%i1\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "frexp<mode>_exp2<exec>"
+ [(set (match_operand:V64SI 0 "register_operand" "=v")
+ (unspec:V64SI
+ [(match_operand:V_FP 1 "gcn_alu_operand" "vB")]
+ UNSPEC_FREXP_EXP))]
+ ""
+ "v_frexp_exp_i32%i1\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "frexp<mode>_mant2<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "=v")
+ (unspec:V_FP
+ [(match_operand:V_FP 1 "gcn_alu_operand" "vB")]
+ UNSPEC_FREXP_MANT))]
+ ""
+ "v_frexp_mant%i1\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
;; }}}
;; {{{ FP fused multiply and add
@@ -779,12 +779,20 @@ init_ext_gcn_constants (void)
/* FIXME: this constant probably does not match what hardware really loads.
Reality check it eventually. */
real_from_string (&dconst1over2pi,
- "0.1591549430918953357663423455968866839");
+ "0.15915494309189532");
real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
ext_gcn_constants_init = 1;
}
+/* Return the 1/(2*PI) constant, first making sure that the extended GCN
+ constants have been set up. init_ext_gcn_constants converts the value
+ to SFmode. */
+
+REAL_VALUE_TYPE
+gcn_dconst1over2pi (void)
+{
+ if (ext_gcn_constants_init == 0)
+ init_ext_gcn_constants ();
+
+ return dconst1over2pi;
+}
+
/* Return non-zero if X is a constant that can appear as an inline operand.
This is 0, 0.5, -0.5, 1, -1, 2, -2, 4,-4, 1/(2*pi)
Or a vector of those.
@@ -3605,6 +3613,7 @@ enum gcn_builtin_type_index
GCN_BTI_SF,
GCN_BTI_V64SI,
GCN_BTI_V64SF,
+ GCN_BTI_V64DF,
GCN_BTI_V64PTR,
GCN_BTI_SIPTR,
GCN_BTI_SFPTR,
@@ -3621,6 +3630,7 @@ static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
+#define v64df_type_node (gcn_builtin_types[GCN_BTI_V64DF])
#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
@@ -3710,6 +3720,7 @@ gcn_init_builtin_types (void)
sf_type_node = float32_type_node;
v64si_type_node = build_vector_type (intSI_type_node, 64);
v64sf_type_node = build_vector_type (float_type_node, 64);
+ v64df_type_node = build_vector_type (double_type_node, 64);
v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
/*build_pointer_type
(integer_type_node) */
@@ -3977,6 +3988,105 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
emit_insn (gen_sqrtsf2 (target, arg));
return target;
}
+ case GCN_BUILTIN_FABSVF:
+ {
+ /* V64SF fabs, emitted as absv64sf2_exec. Nothing is emitted when
+ IGNORE is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg = force_reg (V64SFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64SFmode,
+ EXPAND_NORMAL));
+ /* NOTE(review): the undef operand is presumably the value for lanes
+ not enabled in EXEC -- confirm against the _exec pattern. */
+ emit_insn (gen_absv64sf2_exec
+ (target, arg, gcn_gen_undef (V64SFmode), exec));
+ return target;
+ }
+ case GCN_BUILTIN_LDEXPVF:
+ {
+ /* V64SF ldexp: call argument 0 is the V64SF value, argument 1 the
+ V64SI exponents. Nothing is emitted when IGNORE is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg1 = force_reg (V64SFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64SFmode,
+ EXPAND_NORMAL));
+ rtx arg2 = force_reg (V64SImode,
+ expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
+ V64SImode,
+ EXPAND_NORMAL));
+ emit_insn (gen_ldexpv64sf3_exec
+ (target, arg1, arg2, gcn_gen_undef (V64SFmode), exec));
+ return target;
+ }
+ case GCN_BUILTIN_LDEXPV:
+ {
+ /* V64DF ldexp: call argument 0 is the V64DF value, argument 1 the
+ V64SI exponents. Nothing is emitted when IGNORE is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ /* Use V64DFmode as the expand_expr mode hint so that it agrees with
+ the V64DF argument type and with the mode given to force_reg. */
+ rtx arg1 = force_reg (V64DFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64DFmode,
+ EXPAND_NORMAL));
+ rtx arg2 = force_reg (V64SImode,
+ expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
+ V64SImode,
+ EXPAND_NORMAL));
+ emit_insn (gen_ldexpv64df3_exec
+ (target, arg1, arg2, gcn_gen_undef (V64DFmode), exec));
+ return target;
+ }
+ case GCN_BUILTIN_FREXPVF_EXP:
+ {
+ /* Exponent half of frexp for a V64SF input. The result is V64SI,
+ hence the V64SImode undef operand. Nothing is emitted when IGNORE
+ is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg = force_reg (V64SFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64SFmode,
+ EXPAND_NORMAL));
+ emit_insn (gen_frexpv64sf_exp2_exec
+ (target, arg, gcn_gen_undef (V64SImode), exec));
+ return target;
+ }
+ case GCN_BUILTIN_FREXPVF_MANT:
+ {
+ /* Mantissa half of frexp for a V64SF input; the result is V64SF as
+ well. Nothing is emitted when IGNORE is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg = force_reg (V64SFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64SFmode,
+ EXPAND_NORMAL));
+ emit_insn (gen_frexpv64sf_mant2_exec
+ (target, arg, gcn_gen_undef (V64SFmode), exec));
+ return target;
+ }
+ case GCN_BUILTIN_FREXPV_EXP:
+ {
+ /* Exponent half of frexp for a V64DF input. The result is V64SI,
+ hence the V64SImode undef operand. Nothing is emitted when IGNORE
+ is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg = force_reg (V64DFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64DFmode,
+ EXPAND_NORMAL));
+ emit_insn (gen_frexpv64df_exp2_exec
+ (target, arg, gcn_gen_undef (V64SImode), exec));
+ return target;
+ }
+ case GCN_BUILTIN_FREXPV_MANT:
+ {
+ /* Mantissa half of frexp for a V64DF input; the result is V64DF as
+ well. Nothing is emitted when IGNORE is set. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg = force_reg (V64DFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64DFmode,
+ EXPAND_NORMAL));
+ emit_insn (gen_frexpv64df_mant2_exec
+ (target, arg, gcn_gen_undef (V64DFmode), exec));
+ return target;
+ }
case GCN_BUILTIN_OMP_DIM_SIZE:
{
if (ignore)
@@ -6476,7 +6586,7 @@ print_operand (FILE *file, rtx x, int code)
str = "-4.0";
break;
case 248:
- str = "1/pi";
+ str = "0.15915494";
break;
default:
rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
@@ -82,7 +82,9 @@
UNSPEC_GATHER
UNSPEC_SCATTER
UNSPEC_RCP
- UNSPEC_FLBIT_INT])
+ UNSPEC_FLBIT_INT
+ UNSPEC_FLOOR UNSPEC_CEIL UNSPEC_SIN UNSPEC_COS UNSPEC_EXP2 UNSPEC_LOG2
+ UNSPEC_LDEXP UNSPEC_FREXP_EXP UNSPEC_FREXP_MANT])
;; }}}
;; {{{ Attributes