Support Intel AMX-FP16 ISA
Checks
Commit Message
From: Hongyu Wang <hongyu.wang@intel.com>
Hi all,
This patch aimed to add Intel AMX-FP16 ISA according to newly
released Intel Architecture Instruction Set Extensions and Future Features.
The document comes following:
https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html
Regtested on x86_64-pc-linux-gnu. Ok for trunk?
BRs,
Haochen
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features): Detect
amx-fp16.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_FP16_SET,
(OPTION_MASK_ISA2_AMX_FP16_UNSET): New macros.
(ix86_handle_option): Handle -mamx-fp16.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AMX_FP16.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
amx-fp16.
* config.gcc: Add amxfp16intrin.h.
* config/i386/cpuid.h (bit_AMX_FP16): New.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AMX_FP16__.
* config/i386/i386-options.cc (isa2_opts): Add -mamx-fp16.
(ix86_valid_target_attribute_inner_p): Add new ATTR.
(ix86_option_override_internal): Handle AMX-FP16.
* config/i386/i386-isas.def: Add DEF_PTA for AMX_FP16.
* config/i386/i386.opt: Add -mamx-fp16.
* config/i386/immintrin.h: Include amxfp16intrin.h.
* doc/extend.texi: Document -mamx-fp16.
* doc/invoke.texi: Document amx-fp16.
* doc/sourcebuild.texi: Document amx_fp16.
* config/i386/amxfp16intrin.h: New file.
gcc/testsuite/ChangeLog:
* g++.dg/other/i386-2.C: Add -mamx-fp16.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/sse-12.c: Ditto.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp: (check_effective_target_amx_fp16):
New proc.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/amx-helper.h: New file to support amx-fp16.
* gcc.target/i386/amxfp16-asmatt-1.c: New test.
* gcc.target/i386/amxfp16-asmintel-1.c: Ditto.
* gcc.target/i386/amxfp16-dpfp16ps-2.c: Ditto.
Co-authored-by: Haochen Jiang <haochen.jiang@intel.com>
---
gcc/common/config/i386/cpuinfo.h | 5 ++
gcc/common/config/i386/i386-common.cc | 15 +++++
gcc/common/config/i386/i386-cpuinfo.h | 1 +
gcc/common/config/i386/i386-isas.h | 1 +
gcc/config.gcc | 2 +-
gcc/config/i386/amxfp16intrin.h | 46 ++++++++++++++
gcc/config/i386/cpuid.h | 1 +
gcc/config/i386/i386-c.cc | 2 +
gcc/config/i386/i386-isa.def | 1 +
gcc/config/i386/i386-options.cc | 4 +-
gcc/config/i386/i386.opt | 4 ++
gcc/config/i386/immintrin.h | 2 +
gcc/doc/extend.texi | 5 ++
gcc/doc/invoke.texi | 11 ++--
gcc/doc/sourcebuild.texi | 3 +
gcc/testsuite/g++.dg/other/i386-2.C | 2 +-
gcc/testsuite/g++.dg/other/i386-3.C | 2 +-
gcc/testsuite/gcc.target/i386/amx-check.h | 3 +
gcc/testsuite/gcc.target/i386/amx-helper.h | 61 +++++++++++++++++++
.../gcc.target/i386/amxfp16-asmatt-1.c | 13 ++++
.../gcc.target/i386/amxfp16-asmintel-1.c | 10 +++
.../gcc.target/i386/amxfp16-dpfp16ps-2.c | 57 +++++++++++++++++
gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 +
gcc/testsuite/gcc.target/i386/sse-12.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-13.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-14.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-22.c | 4 +-
gcc/testsuite/gcc.target/i386/sse-23.c | 2 +-
gcc/testsuite/lib/target-supports.exp | 11 ++++
29 files changed, 262 insertions(+), 14 deletions(-)
create mode 100644 gcc/config/i386/amxfp16intrin.h
create mode 100644 gcc/testsuite/gcc.target/i386/amx-helper.h
create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
Comments
On Fri, Oct 14, 2022 at 4:00 PM Haochen Jiang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> From: Hongyu Wang <hongyu.wang@intel.com>
>
> Hi all,
>
> This patch aimed to add Intel AMX-FP16 ISA according to newly
> released Intel Architecture Instruction Set Extensions and Future Features.
>
> The document comes following:
> https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
Ok.
>
> BRs,
> Haochen
>
> gcc/ChangeLog:
>
> * common/config/i386/cpuinfo.h (get_available_features): Detect
> amx-fp16.
> * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_FP16_SET,
> (OPTION_MASK_ISA2_AMX_FP16_UNSET): New macros.
> (ix86_handle_option): Handle -mamx-fp16.
> * common/config/i386/i386-cpuinfo.h (enum processor_features):
> Add FEATURE_AMX_FP16.
> * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
> amx-fp16.
> * config.gcc: Add amxfp16intrin.h.
> * config/i386/cpuid.h (bit_AMX_FP16): New.
> * config/i386/i386-c.cc (ix86_target_macros_internal): Define
> __AMX_FP16__.
> * config/i386/i386-options.cc (isa2_opts): Add -mamx-fp16.
> (ix86_valid_target_attribute_inner_p): Add new ATTR.
> (ix86_option_override_internal): Handle AMX-FP16.
> * config/i386/i386-isas.def: Add DEF_PTA for AMX_FP16.
> * config/i386/i386.opt: Add -mamx-fp16.
> * config/i386/immintrin.h: Include amxfp16intrin.h.
> * doc/extend.texi: Document -mamx-fp16.
> * doc/invoke.texi: Document amx-fp16.
> * doc/sourcebuild.texi: Document amx_fp16.
> * config/i386/amxfp16intrin.h: New file.
>
> gcc/testsuite/ChangeLog:
>
> * g++.dg/other/i386-2.C: Add -mamx-fp16.
> * g++.dg/other/i386-3.C: Ditto.
> * gcc.target/i386/sse-12.c: Ditto.
> * gcc.target/i386/sse-13.c: Ditto.
> * gcc.target/i386/sse-14.c: Ditto.
> * gcc.target/i386/sse-22.c: Ditto.
> * gcc.target/i386/sse-23.c: Ditto.
> * lib/target-supports.exp: (check_effective_target_amx_fp16):
> New proc.
> * gcc.target/i386/funcspec-56.inc: Add new target attribute.
> * gcc.target/i386/amx-helper.h: New file to support amx-fp16.
> * gcc.target/i386/amxfp16-asmatt-1.c: New test.
> * gcc.target/i386/amxfp16-asmintel-1.c: Ditto.
> * gcc.target/i386/amxfp16-dpfp16ps-2.c: Ditto.
>
> Co-authored-by: Haochen Jiang <haochen.jiang@intel.com>
> ---
> gcc/common/config/i386/cpuinfo.h | 5 ++
> gcc/common/config/i386/i386-common.cc | 15 +++++
> gcc/common/config/i386/i386-cpuinfo.h | 1 +
> gcc/common/config/i386/i386-isas.h | 1 +
> gcc/config.gcc | 2 +-
> gcc/config/i386/amxfp16intrin.h | 46 ++++++++++++++
> gcc/config/i386/cpuid.h | 1 +
> gcc/config/i386/i386-c.cc | 2 +
> gcc/config/i386/i386-isa.def | 1 +
> gcc/config/i386/i386-options.cc | 4 +-
> gcc/config/i386/i386.opt | 4 ++
> gcc/config/i386/immintrin.h | 2 +
> gcc/doc/extend.texi | 5 ++
> gcc/doc/invoke.texi | 11 ++--
> gcc/doc/sourcebuild.texi | 3 +
> gcc/testsuite/g++.dg/other/i386-2.C | 2 +-
> gcc/testsuite/g++.dg/other/i386-3.C | 2 +-
> gcc/testsuite/gcc.target/i386/amx-check.h | 3 +
> gcc/testsuite/gcc.target/i386/amx-helper.h | 61 +++++++++++++++++++
> .../gcc.target/i386/amxfp16-asmatt-1.c | 13 ++++
> .../gcc.target/i386/amxfp16-asmintel-1.c | 10 +++
> .../gcc.target/i386/amxfp16-dpfp16ps-2.c | 57 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 +
> gcc/testsuite/gcc.target/i386/sse-12.c | 2 +-
> gcc/testsuite/gcc.target/i386/sse-13.c | 2 +-
> gcc/testsuite/gcc.target/i386/sse-14.c | 2 +-
> gcc/testsuite/gcc.target/i386/sse-22.c | 4 +-
> gcc/testsuite/gcc.target/i386/sse-23.c | 2 +-
> gcc/testsuite/lib/target-supports.exp | 11 ++++
> 29 files changed, 262 insertions(+), 14 deletions(-)
> create mode 100644 gcc/config/i386/amxfp16intrin.h
> create mode 100644 gcc/testsuite/gcc.target/i386/amx-helper.h
> create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
>
> diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
> index cc499c46ed0..118f3a42abd 100644
> --- a/gcc/common/config/i386/cpuinfo.h
> +++ b/gcc/common/config/i386/cpuinfo.h
> @@ -813,6 +813,11 @@ get_available_features (struct __processor_model *cpu_model,
> if (eax & bit_AVX512BF16)
> set_feature (FEATURE_AVX512BF16);
> }
> + if (amx_usable)
> + {
> + if (eax & bit_AMX_FP16)
> + set_feature (FEATURE_AMX_FP16);
> + }
> }
>
> /* Get Advanced Features at level 0xd (eax = 0xd, ecx = 1). */
> diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
> index 6ccc4d2f03c..f3d00ce4bc9 100644
> --- a/gcc/common/config/i386/i386-common.cc
> +++ b/gcc/common/config/i386/i386-common.cc
> @@ -111,6 +111,7 @@ along with GCC; see the file COPYING3. If not see
> #define OPTION_MASK_ISA2_AVXVNNIINT8_SET OPTION_MASK_ISA2_AVXVNNIINT8
> #define OPTION_MASK_ISA2_AVXNECONVERT_SET OPTION_MASK_ISA2_AVXNECONVERT
> #define OPTION_MASK_ISA2_CMPCCXADD_SET OPTION_MASK_ISA2_CMPCCXADD
> +#define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16
>
> /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
> as -msse4.2. */
> @@ -285,6 +286,7 @@ along with GCC; see the file COPYING3. If not see
> #define OPTION_MASK_ISA2_AVXVNNIINT8_UNSET OPTION_MASK_ISA2_AVXVNNIINT8
> #define OPTION_MASK_ISA2_AVXNECONVERT_UNSET OPTION_MASK_ISA2_AVXNECONVERT
> #define OPTION_MASK_ISA2_CMPCCXADD_UNSET OPTION_MASK_ISA2_CMPCCXADD
> +#define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16
>
> /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
> as -mno-sse4.1. */
> @@ -1196,6 +1198,19 @@ ix86_handle_option (struct gcc_options *opts,
> }
> return true;
>
> + case OPT_mamx_fp16:
> + if (value)
> + {
> + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_FP16_SET;
> + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_SET;
> + }
> + else
> + {
> + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP16_UNSET;
> + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_UNSET;
> + }
> + return true;
> +
> case OPT_mfma:
> if (value)
> {
> diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
> index a71a10ebbd7..f9d5b7238ea 100644
> --- a/gcc/common/config/i386/i386-cpuinfo.h
> +++ b/gcc/common/config/i386/i386-cpuinfo.h
> @@ -245,6 +245,7 @@ enum processor_features
> FEATURE_AVXVNNIINT8,
> FEATURE_AVXNECONVERT,
> FEATURE_CMPCCXADD,
> + FEATURE_AMX_FP16,
> CPU_FEATURE_MAX
> };
>
> diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h
> index 3035e4a8186..7c4a71413b5 100644
> --- a/gcc/common/config/i386/i386-isas.h
> +++ b/gcc/common/config/i386/i386-isas.h
> @@ -181,4 +181,5 @@ ISA_NAMES_TABLE_START
> ISA_NAMES_TABLE_ENTRY("avxneconvert", FEATURE_AVXNECONVERT,
> P_NONE, "-mavxneconvert")
> ISA_NAMES_TABLE_ENTRY("cmpccxadd", FEATURE_CMPCCXADD, P_NONE, "-mcmpccxadd")
> + ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16")
> ISA_NAMES_TABLE_END
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index c0e10a72bd5..8a8712d1466 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -423,7 +423,7 @@ i[34567]86-*-* | x86_64-*-*)
> hresetintrin.h keylockerintrin.h avxvnniintrin.h
> mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
> avxifmaintrin.h avxvnniint8intrin.h avxneconvertintrin.h
> - cmpccxaddintrin.h"
> + cmpccxaddintrin.h amxfp16intrin.h"
> ;;
> ia64-*-*)
> extra_headers=ia64intrin.h
> diff --git a/gcc/config/i386/amxfp16intrin.h b/gcc/config/i386/amxfp16intrin.h
> new file mode 100644
> index 00000000000..6a114741aa9
> --- /dev/null
> +++ b/gcc/config/i386/amxfp16intrin.h
> @@ -0,0 +1,46 @@
> +/* Copyright (C) 2020 Free Software Foundation, Inc.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published by
> + the Free Software Foundation; either version 3, or (at your option)
> + any later version.
> +
> + GCC is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + GNU General Public License for more details.
> +
> + Under Section 7 of GPL version 3, you are granted additional
> + permissions described in the GCC Runtime Library Exception, version
> + 3.1, as published by the Free Software Foundation.
> +
> + You should have received a copy of the GNU General Public License and
> + a copy of the GCC Runtime Library Exception along with this program;
> + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#if !defined _IMMINTRIN_H_INCLUDED
> +#error "Never use <amxfp16intrin.h> directly; include <immintrin.h> instead."
> +#endif
> +
> +#ifndef _AMXFP16INTRIN_H_INCLUDED
> +#define _AMXFP16INTRIN_H_INCLUDED
> +
> +#if defined(__x86_64__)
> +#define _tile_dpfp16ps_internal(dst,src1,src2) \
> + __asm__ volatile \
> + ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
> +
> +#define _tile_dpfp16ps(dst,src1,src2) \
> + _tile_dpfp16ps_internal (dst,src1,src2)
> +
> +#endif
> +
> +#ifdef __DISABLE_AMX_FP16__
> +#undef __DISABLE_AMX_FP16__
> +#pragma GCC pop_options
> +#endif /* __DISABLE_AMX_FP16__ */
> +
> +#endif /* _AMXFP16INTRIN_H_INCLUDED */
> diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
> index 19c0d033921..229c15c5950 100644
> --- a/gcc/config/i386/cpuid.h
> +++ b/gcc/config/i386/cpuid.h
> @@ -28,6 +28,7 @@
> #define bit_AVXVNNI (1 << 4)
> #define bit_AVX512BF16 (1 << 5)
> #define bit_CMPCCXADD (1 << 7)
> +#define bit_AMX_FP16 (1 << 21)
> #define bit_HRESET (1 << 22)
> #define bit_AVXIFMA (1 << 23)
>
> diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
> index 4494c412995..3020b5f267a 100644
> --- a/gcc/config/i386/i386-c.cc
> +++ b/gcc/config/i386/i386-c.cc
> @@ -648,6 +648,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
> def_or_undef (parse_in, "__AVXNECONVERT__");
> if (isa_flag2 & OPTION_MASK_ISA2_CMPCCXADD)
> def_or_undef (parse_in, "__CMPCCXADD__");
> + if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP16)
> + def_or_undef (parse_in, "__AMX_FP16__");
> if (TARGET_IAMCU)
> {
> def_or_undef (parse_in, "__iamcu");
> diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def
> index 7ffc73ba23e..55b25763957 100644
> --- a/gcc/config/i386/i386-isa.def
> +++ b/gcc/config/i386/i386-isa.def
> @@ -113,3 +113,4 @@ DEF_PTA(AVXIFMA)
> DEF_PTA(AVXVNNIINT8)
> DEF_PTA(AVXNECONVERT)
> DEF_PTA(CMPCCXADD)
> +DEF_PTA(AMX_FP16)
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 4526dc09fc4..bf37c77589e 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -231,7 +231,8 @@ static struct ix86_target_opts isa2_opts[] =
> { "-mavxifma", OPTION_MASK_ISA2_AVXIFMA },
> { "-mavxvnniint8", OPTION_MASK_ISA2_AVXVNNIINT8 },
> { "-mavxneconvert", OPTION_MASK_ISA2_AVXNECONVERT },
> - { "-mcmpccxadd", OPTION_MASK_ISA2_CMPCCXADD }
> + { "-mcmpccxadd", OPTION_MASK_ISA2_CMPCCXADD },
> + { "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 }
> };
> static struct ix86_target_opts isa_opts[] =
> {
> @@ -1082,6 +1083,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
> IX86_ATTR_ISA ("avxvnniint8", OPT_mavxvnniint8),
> IX86_ATTR_ISA ("avxneconvert", OPT_mavxneconvert),
> IX86_ATTR_ISA ("cmpccxadd", OPT_mcmpccxadd),
> + IX86_ATTR_ISA ("amx-fp16", OPT_mamx_fp16),
>
> /* enum options */
> IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index c4a3bdcf960..eaa43946341 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1234,3 +1234,7 @@ mcmpccxadd
> Target Mask(ISA2_CMPCCXADD) Var(ix86_isa_flags2) Save
> Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, and
> CMPCCXADD build-in functions and code generation.
> +
> +mamx-fp16
> +Target Mask(ISA2_AMX_FP16) Var(ix86_isa_flags2) Save
> +Support AMX-FP16 built-in functions and code generation.
> diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
> index d7433f639c8..d8415863f52 100644
> --- a/gcc/config/i386/immintrin.h
> +++ b/gcc/config/i386/immintrin.h
> @@ -138,4 +138,6 @@
>
> #include <keylockerintrin.h>
>
> +#include <amxfp16intrin.h>
> +
> #endif /* _IMMINTRIN_H_INCLUDED */
> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> index adee772f7bc..e51d7835e69 100644
> --- a/gcc/doc/extend.texi
> +++ b/gcc/doc/extend.texi
> @@ -7080,6 +7080,11 @@ Enable/disable the generation of the AVXNECONVERT instructions.
> @cindex @code{target("cmpccxadd")} function attribute, x86
> Enable/disable the generation of the CMPccXADD instructions.
>
> +@item amx-fp16
> +@itemx no-amx-fp16
> +@cindex @code{target("amx-fp16")} function attribute, x86
> +Enable/disable the generation of the AMX-FP16 instructions.
> +
> @item cld
> @itemx no-cld
> @cindex @code{target("cld")} function attribute, x86
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 962c6c177b6..1014e2ded99 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -1436,7 +1436,7 @@ See RS/6000 and PowerPC Options.
> -mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol
> -mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
> -mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni@gol
> --mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd @gol
> +-mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 @gol
> -mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops @gol
> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
> -mkl -mwidekl @gol
> @@ -32913,6 +32913,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
> @need 200
> @itemx -mcmpccxadd
> @opindex mcmpccxadd
> +@need 200
> +@itemx -mamx-fp16
> +@opindex mamx-fp16
> These switches enable the use of instructions in the MMX, SSE,
> SSE2, SSE3, SSSE3, SSE4, SSE4A, SSE4.1, SSE4.2, AVX, AVX2, AVX512F, AVX512PF,
> AVX512ER, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA,
> @@ -32923,9 +32926,9 @@ XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
> GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16,
> ENQCMD, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, SERIALIZE,
> UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512FP16,
> -AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD or CLDEMOTE extended instruction
> -sets. Each has a corresponding @option{-mno-} option to disable use of these
> -instructions.
> +AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AMX-FP16 or CLDEMOTE extended
> +instruction sets. Each has a corresponding @option{-mno-} option to disable
> +use of these instructions.
>
> These extensions are also available as built-in functions: see
> @ref{x86 Built-in Functions}, for details of the functions enabled and
> diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
> index 714595d33bf..5de5e9576d5 100644
> --- a/gcc/doc/sourcebuild.texi
> +++ b/gcc/doc/sourcebuild.texi
> @@ -2508,6 +2508,9 @@ Target supports the execution of @code{amx-int8} instructions.
> @item amx_bf16
> Target supports the execution of @code{amx-bf16} instructions.
>
> +@item amx_fp16
> +Target supports the execution of @code{amx-fp16} instructions.
> +
> @item cell_hw
> Test system can execute AltiVec and Cell PPU instructions.
>
> diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
> index f7dbbbbf619..79b84af0a75 100644
> --- a/gcc/testsuite/g++.dg/other/i386-2.C
> +++ b/gcc/testsuite/g++.dg/other/i386-2.C
> @@ -1,5 +1,5 @@
> /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
> -/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd" } */
> +/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16" } */
>
> /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
> xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
> diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
> index 2ac5d9f2df5..c811a4454bf 100644
> --- a/gcc/testsuite/g++.dg/other/i386-3.C
> +++ b/gcc/testsuite/g++.dg/other/i386-3.C
> @@ -1,5 +1,5 @@
> /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
> -/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd" } */
> +/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16" } */
>
> /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
> xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
> diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
> index 6fff5ff4631..27dd37bf993 100644
> --- a/gcc/testsuite/gcc.target/i386/amx-check.h
> +++ b/gcc/testsuite/gcc.target/i386/amx-check.h
> @@ -213,6 +213,9 @@ main ()
> #ifdef AMX_BF16
> && __builtin_cpu_supports ("amx-bf16")
> #endif
> +#ifdef AMX_FP16
> + && __builtin_cpu_supports ("amx-fp16")
> +#endif
> #ifdef __linux__
> && request_perm_xtile_data ()
> #endif
> diff --git a/gcc/testsuite/gcc.target/i386/amx-helper.h b/gcc/testsuite/gcc.target/i386/amx-helper.h
> new file mode 100644
> index 00000000000..fe24d7067a5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/amx-helper.h
> @@ -0,0 +1,61 @@
> +#ifndef AMX_HELPER_H_INCLUDED
> +#define AMX_HELPER_H_INCLUDED
> +#if defined(AMX_FP16)
> +#include <immintrin.h>
> +#include <xmmintrin.h>
> +#endif
> +#include "amx-check.h"
> +
> +typedef union
> +{
> + _Float16 f16;
> + uint16_t u;
> +} union16f_uw;
> +
> +#if defined(AMX_FP16)
> +/* Transformation functions between fp16/float */
> +static uint16_t make_f32_fp16 (float f)
> +{
> + union16f_uw tmp;
> + __m128 b = _mm_set_ss (f);
> + __m128h a;
> + tmp.f16 = _mm_cvtsh_h (_mm_cvtss_sh (a, b));
> + return tmp.u;
> +}
> +
> +static float make_fp16_f32 (uint16_t fp)
> +{
> + union16f_uw tmp;
> + tmp.u = fp;
> + __m128h b = _mm_set_sh (tmp.f16);
> + __m128 a;
> + return _mm_cvtss_f32 (_mm_cvtsh_ss (a, b));
> +}
> +
> +/* Init tile buffer with fp16 pairs */
> +void init_fp16_max_tile_buffer (uint8_t* buf)
> +{
> + int i, j;
> + uint16_t* ptr = (uint16_t *) buf;
> +
> + for (i = 0; i < 16; i++)
> + for (j = 0; j < 32; j++)
> + {
> + float f = 2.5f * i + 1.25f * j;
> + ptr[i * 32 + j] = make_f32_fp16 (f);
> + }
> +}
> +
> +/* Init tile fp16 pair buffer with zero */
> +void init_fp16_max_tile_zero_buffer (uint8_t* buf)
> +{
> + int i, j;
> + uint16_t* ptr = (uint16_t *) buf;
> +
> + for (i = 0; i < 16; i++)
> + for (j = 0; j < 32; j++)
> + ptr[i * 32 + j] = make_f32_fp16 (0.0f);
> +}
> +#endif
> +
> +#endif
> diff --git a/gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c b/gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
> new file mode 100644
> index 00000000000..09ae6d408f1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mamx-fp16" } */
> +/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
> +#include <immintrin.h>
> +
> +#define TMM1 1
> +#define TMM2 2
> +#define TMM3 3
> +
> +void TEST ()
> +{
> + _tile_dpfp16ps (TMM1, TMM2, TMM3);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c b/gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
> new file mode 100644
> index 00000000000..a8dff945f23
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-require-effective-target masm_intel } */
> +/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */
> +/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
> +#include <immintrin.h>
> +
> +void TEST ()
> +{
> + _tile_dpfp16ps (1, 2, 3);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
> new file mode 100644
> index 00000000000..2d359a689ea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
> @@ -0,0 +1,57 @@
> +/* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
> +/* { dg-require-effective-target amx_fp16 } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-options "-O2 -mamx-tile -mamx-fp16 -mavx512fp16" } */
> +#define AMX_FP16
> +#define DO_TEST test_amx_fp16_dpfp16ps
> +void test_amx_fp16_dpfp16ps ();
> +#include "amx-helper.h"
> +
> +void calc_matrix_dpfp16ps (__tile *dst, __tile *src1, __tile *src2)
> +{
> + uint16_t *src1_buf = (uint16_t *)src1->buf;
> + uint16_t *src2_buf = (uint16_t *)src2->buf;
> + float *dst_buf = (float *)dst->buf;
> +
> + int M = src1->rows;
> + int N = src1->colsb / 4;
> + int K = src2->colsb / 4;
> + int i, j, k, t;
> +
> + for (i = 0; i < M; i++)
> + for (j = 0; j < N; j++)
> + for (k = 0; k < K; k++)
> + for (t = 0; t < 2; t+=2)
> + {
> + dst_buf[i * K + k] +=
> + (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t]) *
> + make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t])) +
> + (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t + 1]) *
> + make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t + 1]));
> + }
> +
> +}
> +
> +void test_amx_fp16_dpfp16ps ()
> +{
> + __tilecfg_u cfg;
> + __tile dst, dst_ref, src1, src2;
> + uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
> +
> + init_fp16_max_tile_buffer (tmp_dst_buf);
> + init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
> +
> + init_tile_config (&cfg);
> + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
> + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
> + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
> +
> + calc_matrix_dpfp16ps (&dst, &src1, &src2);
> +
> + _tile_dpfp16ps (1, 2, 3);
> + _tile_stored (1, dst_ref.buf, _STRIDE);
> +
> + if (!check_float_tile_register (&dst_ref, &dst))
> + abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
> index f7e9c243597..ef9d4c5f5a4 100644
> --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
> +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
> @@ -84,6 +84,7 @@ extern void test_avxifma (void) __attribute__((__target__("avxifma")));
> extern void test_avxvnniint8 (void) __attribute__((__target__("avxvnniint8")));
> extern void test_avxneconvert (void) __attribute__((__target__("avxneconvert")));
> extern void test_cmpccxadd (void) __attribute__((__target__("cmpccxadd")));
> +extern void test_amx_fp16 (void) __attribute__((__target__("amx-fp16")));
>
> extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
> extern void test_no_avx5124fmaps(void) __attribute__((__target__("no-avx5124fmaps")));
> @@ -169,6 +170,7 @@ extern void test_no_avxifma (void) __attribute__((__target__("no-avxifma")));
> extern void test_no_avxvnniint8 (void) __attribute__((__target__("no-avxvnniint8")));
> extern void test_no_avxneconvert (void) __attribute__((__target__("no-avxneconvert")));
> extern void test_no_cmpccxadd (void) __attribute__((__target__("no-cmpccxadd")));
> +extern void test_no_amx_fp16 (void) __attribute__((__target__("no-amx-fp16")));
>
> extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
> extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
> diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c
> index 3eabc49a6ab..df2684abbb6 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-12.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-12.c
> @@ -3,7 +3,7 @@
> popcntintrin.h gfniintrin.h and mm_malloc.h are usable
> with -O -std=c89 -pedantic-errors. */
> /* { dg-do compile } */
> -/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert" } */
> +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16" } */
>
> #include <x86intrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
> index e947b4347f4..ca662f7bd47 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-13.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-13.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd" } */
> +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16" } */
> /* { dg-add-options bind_pic_locally } */
>
> #include <mm_malloc.h>
> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
> index b6ee3806dcc..4a47d4093a2 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-14.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-14.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert" } */
> +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16" } */
> /* { dg-add-options bind_pic_locally } */
>
> #include <mm_malloc.h>
> diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
> index 71ac0f3da19..178a2fce492 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-22.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-22.c
> @@ -103,7 +103,7 @@
>
>
> #ifndef DIFFERENT_PRAGMAS
> -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert")
> +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,amx-fp16")
> #endif
>
> /* Following intrinsics require immediate arguments. They
> @@ -220,7 +220,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
>
> /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
> #ifdef DIFFERENT_PRAGMAS
> -#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert")
> +#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,amx-fp16")
> #endif
> #include <immintrin.h>
> test_1 (_cvtss_sh, unsigned short, float, 1)
> diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
> index 757ba9c9a7d..ba1310f9f89 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-23.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-23.c
> @@ -847,6 +847,6 @@
> #define __builtin_ia32_cmpccxadd(A, B, C, D) __builtin_ia32_cmpccxadd(A, B, C, 1)
> #define __builtin_ia32_cmpccxadd64(A, B, C, D) __builtin_ia32_cmpccxadd64(A, B, C, 1)
>
> -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,cmpccxadd")
> +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16")
>
> #include <x86intrin.h>
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index d3b9aafb8f0..c70b6a21642 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -10103,6 +10103,17 @@ proc check_effective_target_amx_bf16 { } {
> } "-mamx-bf16" ]
> }
>
> +# Return 1 if amx-fp16 instructions can be compiled.
> +proc check_effective_target_amx_fp16 { } {
> + return [check_no_compiler_messages amx_fp16 object {
> + void
> + foo ()
> + {
> + __asm__ volatile ("tdpfp16ps\t%%tmm1, %%tmm2, %%tmm3" ::);
> + }
> + } "-mamx-fp16" ]
> +}
> +
> # Return 1 if vpclmulqdq instructions can be compiled.
> proc check_effective_target_vpclmulqdq { } {
> return [check_no_compiler_messages vpclmulqdq object {
> --
> 2.18.1
>
@@ -813,6 +813,11 @@ get_available_features (struct __processor_model *cpu_model,
if (eax & bit_AVX512BF16)
set_feature (FEATURE_AVX512BF16);
}
+ if (amx_usable)
+ {
+ if (eax & bit_AMX_FP16)
+ set_feature (FEATURE_AMX_FP16);
+ }
}
/* Get Advanced Features at level 0xd (eax = 0xd, ecx = 1). */
@@ -111,6 +111,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_AVXVNNIINT8_SET OPTION_MASK_ISA2_AVXVNNIINT8
#define OPTION_MASK_ISA2_AVXNECONVERT_SET OPTION_MASK_ISA2_AVXNECONVERT
#define OPTION_MASK_ISA2_CMPCCXADD_SET OPTION_MASK_ISA2_CMPCCXADD
+#define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
@@ -285,6 +286,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_AVXVNNIINT8_UNSET OPTION_MASK_ISA2_AVXVNNIINT8
#define OPTION_MASK_ISA2_AVXNECONVERT_UNSET OPTION_MASK_ISA2_AVXNECONVERT
#define OPTION_MASK_ISA2_CMPCCXADD_UNSET OPTION_MASK_ISA2_CMPCCXADD
+#define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
@@ -1196,6 +1198,19 @@ ix86_handle_option (struct gcc_options *opts,
}
return true;
+ case OPT_mamx_fp16:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_FP16_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP16_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
@@ -245,6 +245,7 @@ enum processor_features
FEATURE_AVXVNNIINT8,
FEATURE_AVXNECONVERT,
FEATURE_CMPCCXADD,
+ FEATURE_AMX_FP16,
CPU_FEATURE_MAX
};
@@ -181,4 +181,5 @@ ISA_NAMES_TABLE_START
ISA_NAMES_TABLE_ENTRY("avxneconvert", FEATURE_AVXNECONVERT,
P_NONE, "-mavxneconvert")
ISA_NAMES_TABLE_ENTRY("cmpccxadd", FEATURE_CMPCCXADD, P_NONE, "-mcmpccxadd")
+ ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16")
ISA_NAMES_TABLE_END
@@ -423,7 +423,7 @@ i[34567]86-*-* | x86_64-*-*)
hresetintrin.h keylockerintrin.h avxvnniintrin.h
mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
avxifmaintrin.h avxvnniint8intrin.h avxneconvertintrin.h
- cmpccxaddintrin.h"
+ cmpccxaddintrin.h amxfp16intrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
new file mode 100644
@@ -0,0 +1,46 @@
+/* Copyright (C) 2020 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxfp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXFP16INTRIN_H_INCLUDED
+#define _AMXFP16INTRIN_H_INCLUDED
+
+#if defined(__x86_64__)
+#define _tile_dpfp16ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpfp16ps(dst,src1,src2) \
+ _tile_dpfp16ps_internal (dst,src1,src2)
+
+#endif
+
+#ifdef __DISABLE_AMX_FP16__
+#undef __DISABLE_AMX_FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_FP16__ */
+
+#endif /* _AMXFP16INTRIN_H_INCLUDED */
@@ -28,6 +28,7 @@
#define bit_AVXVNNI (1 << 4)
#define bit_AVX512BF16 (1 << 5)
#define bit_CMPCCXADD (1 << 7)
+#define bit_AMX_FP16 (1 << 21)
#define bit_HRESET (1 << 22)
#define bit_AVXIFMA (1 << 23)
@@ -648,6 +648,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__AVXNECONVERT__");
if (isa_flag2 & OPTION_MASK_ISA2_CMPCCXADD)
def_or_undef (parse_in, "__CMPCCXADD__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP16)
+ def_or_undef (parse_in, "__AMX_FP16__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
@@ -113,3 +113,4 @@ DEF_PTA(AVXIFMA)
DEF_PTA(AVXVNNIINT8)
DEF_PTA(AVXNECONVERT)
DEF_PTA(CMPCCXADD)
+DEF_PTA(AMX_FP16)
@@ -231,7 +231,8 @@ static struct ix86_target_opts isa2_opts[] =
{ "-mavxifma", OPTION_MASK_ISA2_AVXIFMA },
{ "-mavxvnniint8", OPTION_MASK_ISA2_AVXVNNIINT8 },
{ "-mavxneconvert", OPTION_MASK_ISA2_AVXNECONVERT },
- { "-mcmpccxadd", OPTION_MASK_ISA2_CMPCCXADD }
+ { "-mcmpccxadd", OPTION_MASK_ISA2_CMPCCXADD },
+ { "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 }
};
static struct ix86_target_opts isa_opts[] =
{
@@ -1082,6 +1083,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
IX86_ATTR_ISA ("avxvnniint8", OPT_mavxvnniint8),
IX86_ATTR_ISA ("avxneconvert", OPT_mavxneconvert),
IX86_ATTR_ISA ("cmpccxadd", OPT_mcmpccxadd),
+ IX86_ATTR_ISA ("amx-fp16", OPT_mamx_fp16),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
@@ -1234,3 +1234,7 @@ mcmpccxadd
Target Mask(ISA2_CMPCCXADD) Var(ix86_isa_flags2) Save
Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, and
CMPCCXADD build-in functions and code generation.
+
+mamx-fp16
+Target Mask(ISA2_AMX_FP16) Var(ix86_isa_flags2) Save
+Support AMX-FP16 built-in functions and code generation.
@@ -138,4 +138,6 @@
#include <keylockerintrin.h>
+#include <amxfp16intrin.h>
+
#endif /* _IMMINTRIN_H_INCLUDED */
@@ -7080,6 +7080,11 @@ Enable/disable the generation of the AVXNECONVERT instructions.
@cindex @code{target("cmpccxadd")} function attribute, x86
Enable/disable the generation of the CMPccXADD instructions.
+@item amx-fp16
+@itemx no-amx-fp16
+@cindex @code{target("amx-fp16")} function attribute, x86
+Enable/disable the generation of the AMX-FP16 instructions.
+
@item cld
@itemx no-cld
@cindex @code{target("cld")} function attribute, x86
@@ -1436,7 +1436,7 @@ See RS/6000 and PowerPC Options.
-mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol
-mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
-mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni@gol
--mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd @gol
+-mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 @gol
-mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops @gol
-minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
-mkl -mwidekl @gol
@@ -32913,6 +32913,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
@need 200
@itemx -mcmpccxadd
@opindex mcmpccxadd
+@need 200
+@itemx -mamx-fp16
+@opindex mamx-fp16
These switches enable the use of instructions in the MMX, SSE,
SSE2, SSE3, SSSE3, SSE4, SSE4A, SSE4.1, SSE4.2, AVX, AVX2, AVX512F, AVX512PF,
AVX512ER, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA,
@@ -32923,9 +32926,9 @@ XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16,
ENQCMD, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, SERIALIZE,
UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512FP16,
-AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD or CLDEMOTE extended instruction
-sets. Each has a corresponding @option{-mno-} option to disable use of these
-instructions.
+AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AMX-FP16 or CLDEMOTE extended
+instruction sets. Each has a corresponding @option{-mno-} option to disable
+use of these instructions.
These extensions are also available as built-in functions: see
@ref{x86 Built-in Functions}, for details of the functions enabled and
@@ -2508,6 +2508,9 @@ Target supports the execution of @code{amx-int8} instructions.
@item amx_bf16
Target supports the execution of @code{amx-bf16} instructions.
+@item amx_fp16
+Target supports the execution of @code{amx-fp16} instructions.
+
@item cell_hw
Test system can execute AltiVec and Cell PPU instructions.
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
@@ -213,6 +213,9 @@ main ()
#ifdef AMX_BF16
&& __builtin_cpu_supports ("amx-bf16")
#endif
+#ifdef AMX_FP16
+ && __builtin_cpu_supports ("amx-fp16")
+#endif
#ifdef __linux__
&& request_perm_xtile_data ()
#endif
new file mode 100644
@@ -0,0 +1,61 @@
+#ifndef AMX_HELPER_H_INCLUDED
+#define AMX_HELPER_H_INCLUDED
+#if defined(AMX_FP16)
+#include <immintrin.h>
+#include <xmmintrin.h>
+#endif
+#include "amx-check.h"
+
+typedef union
+{
+ _Float16 f16;
+ uint16_t u;
+} union16f_uw;
+
+#if defined(AMX_FP16)
+/* Transformation functions between fp16/float */
+static uint16_t make_f32_fp16 (float f)
+{
+ union16f_uw tmp;
+ __m128 b = _mm_set_ss (f);
+ __m128h a;
+ tmp.f16 = _mm_cvtsh_h (_mm_cvtss_sh (a, b));
+ return tmp.u;
+}
+
+static float make_fp16_f32 (uint16_t fp)
+{
+ union16f_uw tmp;
+ tmp.u = fp;
+ __m128h b = _mm_set_sh (tmp.f16);
+ __m128 a;
+ return _mm_cvtss_f32 (_mm_cvtsh_ss (a, b));
+}
+
+/* Init tile buffer with fp16 pairs */
+void init_fp16_max_tile_buffer (uint8_t* buf)
+{
+ int i, j;
+ uint16_t* ptr = (uint16_t *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 32; j++)
+ {
+ float f = 2.5f * i + 1.25f * j;
+ ptr[i * 32 + j] = make_f32_fp16 (f);
+ }
+}
+
+/* Init tile fp16 pair buffer with zero */
+void init_fp16_max_tile_zero_buffer (uint8_t* buf)
+{
+ int i, j;
+ uint16_t* ptr = (uint16_t *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 32; j++)
+ ptr[i * 32 + j] = make_f32_fp16 (0.0f);
+}
+#endif
+
+#endif
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mamx-fp16" } */
+/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+#include <immintrin.h>
+
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+
+void TEST ()
+{
+ _tile_dpfp16ps (TMM1, TMM2, TMM3);
+}
new file mode 100644
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */
+/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+#include <immintrin.h>
+
+void TEST ()
+{
+ _tile_dpfp16ps (1, 2, 3);
+}
new file mode 100644
@@ -0,0 +1,57 @@
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_tile } */
+/* { dg-require-effective-target amx_fp16 } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-tile -mamx-fp16 -mavx512fp16" } */
+#define AMX_FP16
+#define DO_TEST test_amx_fp16_dpfp16ps
+void test_amx_fp16_dpfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_dpfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *)src1->buf;
+ uint16_t *src2_buf = (uint16_t *)src2->buf;
+ float *dst_buf = (float *)dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+ int i, j, k, t;
+
+ for (i = 0; i < M; i++)
+ for (j = 0; j < N; j++)
+ for (k = 0; k < K; k++)
+ for (t = 0; t < 2; t+=2)
+ {
+ dst_buf[i * K + k] +=
+ (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t]) *
+ make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t])) +
+ (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t + 1]) *
+ make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t + 1]));
+ }
+
+}
+
+void test_amx_fp16_dpfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_dpfp16ps (&dst, &src1, &src2);
+
+ _tile_dpfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
@@ -84,6 +84,7 @@ extern void test_avxifma (void) __attribute__((__target__("avxifma")));
extern void test_avxvnniint8 (void) __attribute__((__target__("avxvnniint8")));
extern void test_avxneconvert (void) __attribute__((__target__("avxneconvert")));
extern void test_cmpccxadd (void) __attribute__((__target__("cmpccxadd")));
+extern void test_amx_fp16 (void) __attribute__((__target__("amx-fp16")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx5124fmaps(void) __attribute__((__target__("no-avx5124fmaps")));
@@ -169,6 +170,7 @@ extern void test_no_avxifma (void) __attribute__((__target__("no-avxifma")));
extern void test_no_avxvnniint8 (void) __attribute__((__target__("no-avxvnniint8")));
extern void test_no_avxneconvert (void) __attribute__((__target__("no-avxneconvert")));
extern void test_no_cmpccxadd (void) __attribute__((__target__("no-cmpccxadd")));
+extern void test_no_amx_fp16 (void) __attribute__((__target__("no-amx-fp16")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
@@ -3,7 +3,7 @@
popcntintrin.h gfniintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16" } */
#include <x86intrin.h>
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
@@ -103,7 +103,7 @@
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,amx-fp16")
#endif
/* Following intrinsics require immediate arguments. They
@@ -220,7 +220,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
/* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
#ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,amx-fp16")
#endif
#include <immintrin.h>
test_1 (_cvtss_sh, unsigned short, float, 1)
@@ -847,6 +847,6 @@
#define __builtin_ia32_cmpccxadd(A, B, C, D) __builtin_ia32_cmpccxadd(A, B, C, 1)
#define __builtin_ia32_cmpccxadd64(A, B, C, D) __builtin_ia32_cmpccxadd64(A, B, C, 1)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,cmpccxadd")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16")
#include <x86intrin.h>
@@ -10103,6 +10103,17 @@ proc check_effective_target_amx_bf16 { } {
} "-mamx-bf16" ]
}
+# Return 1 if amx-fp16 instructions can be compiled.
+proc check_effective_target_amx_fp16 { } {
+ return [check_no_compiler_messages amx_fp16 object {
+ void
+ foo ()
+ {
+ __asm__ volatile ("tdpfp16ps\t%%tmm1, %%tmm2, %%tmm3" ::);
+ }
+ } "-mamx-fp16" ]
+}
+
# Return 1 if vpclmulqdq instructions can be compiled.
proc check_effective_target_vpclmulqdq { } {
return [check_no_compiler_messages vpclmulqdq object {