From patchwork Wed Feb 15 20:49:46 2023
X-Patchwork-Submitter: Matthias Kretz
X-Patchwork-Id: 57723
To: gcc-patches@gcc.gnu.org
Subject: [PATCH 2/7] libstdc++: Annotate most lambdas with always_inline
Date: Wed, 15 Feb 2023 21:49:46 +0100
Message-ID: <9083131.CDJkKcVGEf@minbar>
Organization: GSI Helmholtz Centre for Heavy Ion Research
In-Reply-To: <3238840.44csPzL39Z@minbar>
References: <3238840.44csPzL39Z@minbar>
List-Id: Gcc-patches mailing list
X-Patchwork-Original-From: Matthias Kretz via Gcc-patches From: Matthias Kretz Reply-To: Matthias Kretz Errors-To: gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org Sender: "Gcc-patches" X-getmail-retrieved-from-mailbox: =?utf-8?q?INBOX?= X-GMAIL-THRID: =?utf-8?q?1757931895215494813?= X-GMAIL-MSGID: =?utf-8?q?1757931895215494813?= All of the annotated lambdas are simply a necessary means for implementing these functions and should never result in an actual function call. Many of these lambdas would go away if C++ had better language support for packs. Signed-off-by: Matthias Kretz libstdc++-v3/ChangeLog: PR libstdc++/108030 * include/experimental/bits/simd_detail.h: Define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA. * include/experimental/bits/simd.h: Annotate lambdas with _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA. * include/experimental/bits/simd_builtin.h: Ditto. * include/experimental/bits/simd_converter.h: Ditto. * include/experimental/bits/simd_fixed_size.h: Ditto. * include/experimental/bits/simd_math.h: Ditto. * include/experimental/bits/simd_neon.h: Ditto. * include/experimental/bits/simd_x86.h: Ditto. --- libstdc++-v3/include/experimental/bits/simd.h | 239 ++++++------ .../include/experimental/bits/simd_builtin.h | 351 ++++++++++-------- .../experimental/bits/simd_converter.h | 22 +- .../include/experimental/bits/simd_detail.h | 3 + .../experimental/bits/simd_fixed_size.h | 265 ++++++------- .../include/experimental/bits/simd_math.h | 52 +-- .../include/experimental/bits/simd_neon.h | 14 +- .../include/experimental/bits/simd_x86.h | 122 +++--- 8 files changed, 575 insertions(+), 493 deletions(-) -- ────────────────────────────────────────────────────────────────────────── Dr. Matthias Kretz https://mattkretz.github.io GSI Helmholtz Centre for Heavy Ion Research https://gsi.de stdₓ::simd ────────────────────────────────────────────────────────────────────────── diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h index 3de966bbf22..ffe72fa6ccf 100644 --- a/libstdc++-v3/include/experimental/bits/simd.h +++ b/libstdc++-v3/include/experimental/bits/simd.h @@ -609,28 +609,34 @@ template operator&(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( - [&](auto __i) { return __rhs._M_data[__i] & _M_data[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __rhs._M_data[__i] & _M_data[__i]; + }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator|(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( - [&](auto __i) { return __rhs._M_data[__i] | _M_data[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __rhs._M_data[__i] | _M_data[__i]; + }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator^(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( - [&](auto __i) { return __rhs._M_data[__i] ^ _M_data[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __rhs._M_data[__i] ^ _M_data[__i]; + }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator~() const { return __generate_from_n_evaluations<_Np, _Ip>( - [&](auto __i) { return ~_M_data[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return ~_M_data[__i]; }); } }; return _Ip{}; @@ -1391,7 +1397,7 @@ template operator^=(const _BitMask& __b) & noexcept { __execute_n_times<_S_array_size>( - [&](auto __i) { _M_bits[__i] ^= __b._M_bits[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] ^= __b._M_bits[__i]; }); return *this; } @@ -1399,7 +1405,7 @@ template operator|=(const 
_BitMask& __b) & noexcept { __execute_n_times<_S_array_size>( - [&](auto __i) { _M_bits[__i] |= __b._M_bits[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] |= __b._M_bits[__i]; }); return *this; } @@ -1407,7 +1413,7 @@ template operator&=(const _BitMask& __b) & noexcept { __execute_n_times<_S_array_size>( - [&](auto __i) { _M_bits[__i] &= __b._M_bits[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] &= __b._M_bits[__i]; }); return *this; } @@ -1797,8 +1803,9 @@ template __vector_broadcast(_Tp __x) { return __call_with_n_evaluations<_Np>( - [](auto... __xx) { return __vector_type_t<_Tp, _Np>{__xx...}; }, - [&__x](int) { return __x; }); + [](auto... __xx) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __vector_type_t<_Tp, _Np>{__xx...}; + }, [&__x](int) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x; }); } // }}} @@ -2205,7 +2212,7 @@ template ( - __x, [](auto... __entries) { + __x, [](auto... __entries) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return reinterpret_cast<_R>(_Up{__entries...}); }); } @@ -2607,7 +2614,7 @@ template _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(initializer_list<_Tp> __init) : _Base(__generate_from_n_evaluations<_Width, _BuiltinType>( - [&](auto __i) { return __init.begin()[__i.value]; })) {} + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __init.begin()[__i.value]; })) {} _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper() = default; _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(const _SimdWrapper&) @@ -2632,10 +2639,9 @@ template _GLIBCXX_SIMD_INTRINSIC constexpr operator _SimdTuple<_Tp, _As...>() const { - const auto& dd = _M_data; // workaround for GCC7 ICE - return __generate_from_n_evaluations>([&]( - auto __i) constexpr { return dd[int(__i)]; }); + return __generate_from_n_evaluations>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return _M_data[int(__i)]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr operator const _BuiltinType&() const @@ -3192,21 +3198,19 @@ template { return __x; } template - _GLIBCXX_SIMD_INTRINSIC auto + _GLIBCXX_SIMD_INTRINSIC fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>> to_fixed_size(const simd<_Tp, _Ap>& __x) { - return simd<_Tp, simd_abi::fixed_size>>([&__x]( - auto __i) constexpr { return __x[__i]; }); + using _Rp = fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>; + return _Rp([&__x](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }); } template - _GLIBCXX_SIMD_INTRINSIC auto + _GLIBCXX_SIMD_INTRINSIC fixed_size_simd_mask<_Tp, simd_size_v<_Tp, _Ap>> to_fixed_size(const simd_mask<_Tp, _Ap>& __x) { - constexpr int _Np = simd_mask<_Tp, _Ap>::size(); - fixed_size_simd_mask<_Tp, _Np> __r; - __execute_n_times<_Np>([&](auto __i) constexpr { __r[__i] = __x[__i]; }); - return __r; + return {__private_init, + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }}; } // to_native {{{2 @@ -3225,7 +3229,9 @@ template enable_if_t<(_Np == native_simd_mask<_Tp>::size()), native_simd_mask<_Tp>> to_native(const fixed_size_simd_mask<_Tp, _Np>& __x) { - return native_simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; }); + return native_simd_mask<_Tp>( + __private_init, + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }); } // to_compatible {{{2 @@ -3242,7 +3248,10 @@ template _GLIBCXX_SIMD_INTRINSIC enable_if_t<(_Np == simd_mask<_Tp>::size()), simd_mask<_Tp>> to_compatible(const simd_mask<_Tp, simd_abi::fixed_size<_Np>>& __x) - { return simd_mask<_Tp>([&](auto __i) constexpr { return 
__x[__i]; }); } + { + return simd_mask<_Tp>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }); + } // masked assignment [simd_mask.where] {{{1 @@ -3400,9 +3409,9 @@ template _Impl::template _S_masked_cassign( \ __data(_M_k), __data(_M_value), \ __to_value_type_or_member_type<_Tp>(static_cast<_Up&&>(__x)), \ - [](auto __impl, auto __lhs, auto __rhs) constexpr { \ - return __impl.__name(__lhs, __rhs); \ - }); \ + [](auto __impl, auto __lhs, auto __rhs) \ + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \ + { return __impl.__name(__lhs, __rhs); }); \ } \ static_assert(true) _GLIBCXX_SIMD_OP_(+, _S_plus); @@ -3899,12 +3908,11 @@ template >([&]( - auto __i) constexpr { - return _V([&](auto __j) constexpr { - return __x[__i * _V::size() + __j]; - }); - }); + return __generate_from_n_evaluations>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return __x[__i * _V::size() + __j]; }); + }); } else if constexpr ( __is_fixed_size_abi_v<_Ap> @@ -3917,41 +3925,40 @@ template * const __element_ptr = reinterpret_cast*>(&__data(__x)); - return __generate_from_n_evaluations>([&]( - auto __i) constexpr { - return _V(__element_ptr + __i * _V::size(), vector_aligned); - }); + return __generate_from_n_evaluations>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return _V(__element_ptr + __i * _V::size(), vector_aligned); }); #else const auto& __xx = __data(__x); - return __generate_from_n_evaluations>([&]( - auto __i) constexpr { - [[maybe_unused]] constexpr size_t __offset - = decltype(__i)::value * _V::size(); - return _V([&](auto __j) constexpr { - constexpr _SizeConstant<__j + __offset> __k; - return __xx[__k]; - }); - }); + return __generate_from_n_evaluations>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + [[maybe_unused]] constexpr size_t __offset + = decltype(__i)::value * _V::size(); + return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + constexpr _SizeConstant<__j + __offset> __k; + return __xx[__k]; + }); + }); #endif } else if constexpr (is_same_v) { // normally memcpy should work here as well - return __generate_from_n_evaluations>([&]( - auto __i) constexpr { return __x[__i]; }); + return __generate_from_n_evaluations>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }); } else { - return __generate_from_n_evaluations>([&]( - auto __i) constexpr { - if constexpr (__is_fixed_size_abi_v) - return _V([&](auto __j) constexpr { - return __x[__i * _V::size() + __j]; - }); - else - return _V(__private_init, - __extract_part(__data(__x))); - }); + return __generate_from_n_evaluations>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (__is_fixed_size_abi_v) + return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __x[__i * _V::size() + __j]; + }); + else + return _V(__private_init, + __extract_part(__data(__x))); + }); } } @@ -3975,22 +3982,22 @@ template >([&]( - auto __i) constexpr { - constexpr size_t __offset = __i * _V::size(); - return _V(__bitset_init, (__bits >> __offset).to_ullong()); - }); + return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + constexpr size_t __offset = __i * _V::size(); + return _V(__bitset_init, (__bits >> __offset).to_ullong()); + }); } else { - return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&]( - auto __i) constexpr { 
- constexpr size_t __offset = __i * _V::size(); - return _V( - __private_init, [&](auto __j) constexpr { - return __x[__j + __offset]; - }); - }); + return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + constexpr size_t __offset = __i * _V::size(); + return _V(__private_init, + [&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __x[__j + __offset]; + }); + }); } } @@ -4008,12 +4015,14 @@ template using _V = __deduced_simd<_Tp, _N0>; if (__x._M_is_constprop()) - return __generate_from_n_evaluations([&]( - auto __i) constexpr { - using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; - constexpr size_t __offset = _SL::_S_before(__i); - return _Vi([&](auto __j) constexpr { return __x[__offset + __j]; }); - }); + return __generate_from_n_evaluations( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; + constexpr size_t __offset = _SL::_S_before(__i); + return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __x[__offset + __j]; + }); + }); else if constexpr (_Np == _N0) { static_assert(sizeof...(_Sizes) == 1); @@ -4080,28 +4089,28 @@ template #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS const __may_alias<_Tp>* const __element_ptr = reinterpret_cast*>(&__x); - return __generate_from_n_evaluations([&]( - auto __i) constexpr { - using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; - constexpr size_t __offset = _SL::_S_before(__i); - constexpr size_t __base_align = alignof(simd<_Tp, _Ap>); - constexpr size_t __a - = __base_align - ((__offset * sizeof(_Tp)) % __base_align); - constexpr size_t __b = ((__a - 1) & __a) ^ __a; - constexpr size_t __alignment = __b == 0 ? __a : __b; - return _Vi(__element_ptr + __offset, overaligned<__alignment>); - }); + return __generate_from_n_evaluations( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; + constexpr size_t __offset = _SL::_S_before(__i); + constexpr size_t __base_align = alignof(simd<_Tp, _Ap>); + constexpr size_t __a + = __base_align - ((__offset * sizeof(_Tp)) % __base_align); + constexpr size_t __b = ((__a - 1) & __a) ^ __a; + constexpr size_t __alignment = __b == 0 ? __a : __b; + return _Vi(__element_ptr + __offset, overaligned<__alignment>); + }); #else - return __generate_from_n_evaluations([&]( - auto __i) constexpr { - using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; - const auto& __xx = __data(__x); - using _Offset = decltype(_SL::_S_before(__i)); - return _Vi([&](auto __j) constexpr { - constexpr _SizeConstant<_Offset::value + __j> __k; - return __xx[__k]; - }); - }); + return __generate_from_n_evaluations( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; + const auto& __xx = __data(__x); + using _Offset = decltype(_SL::_S_before(__i)); + return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + constexpr _SizeConstant<_Offset::value + __j> __k; + return __xx[__k]; + }); + }); #endif } @@ -4143,8 +4152,9 @@ template return simd_cast<_Rp>(__xs...); else if ((... 
&& __xs._M_is_constprop())) return simd<_Tp, - simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>([&]( - auto __i) constexpr { return __subscript_in_pack<__i>(__xs...); }); + simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return __subscript_in_pack<__i>(__xs...); }); else { _Rp __r{}; @@ -4160,9 +4170,10 @@ template _GLIBCXX_SIMD_CONSTEXPR __deduced_simd<_Tp, simd_size_v<_Tp, _Abi> * _Np> concat(const array, _Np>& __x) { - return __call_with_subscripts<_Np>(__x, [](const auto&... __xs) { - return concat(__xs...); - }); + return __call_with_subscripts<_Np>( + __x, [](const auto&... __xs) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return concat(__xs...); + }); } // }}} @@ -4695,7 +4706,7 @@ template simd_mask(_PrivateInit, _Fp&& __gen) : _M_data() { - __execute_n_times([&](auto __i) constexpr { + __execute_n_times([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _Impl::_S_set(_M_data, __i, __gen(__i)); }); } @@ -4881,7 +4892,9 @@ template if (__builtin_is_constant_evaluated() || __k._M_is_constprop()) { const int __r = __call_with_subscripts>( - __k, [](auto... __elements) { return ((__elements != 0) + ...); }); + __k, [](auto... __elements) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return ((__elements != 0) + ...); + }); if (__builtin_is_constant_evaluated() || __builtin_constant_p(__r)) return __r; } @@ -4896,8 +4909,11 @@ template { constexpr size_t _Np = simd_size_v<_Tp, _Abi>; const size_t _Idx = __call_with_n_evaluations<_Np>( - [](auto... __indexes) { return std::min({__indexes...}); }, - [&](auto __i) { return __k[__i] ? +__i : _Np; }); + [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::min({__indexes...}); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __k[__i] ? +__i : _Np; + }); if (_Idx >= _Np) __invoke_ub("find_first_set(empty mask) is UB"); if (__builtin_constant_p(_Idx)) @@ -4914,8 +4930,11 @@ template { constexpr size_t _Np = simd_size_v<_Tp, _Abi>; const int _Idx = __call_with_n_evaluations<_Np>( - [](auto... __indexes) { return std::max({__indexes...}); }, - [&](auto __i) { return __k[__i] ? int(__i) : -1; }); + [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::max({__indexes...}); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __k[__i] ? int(__i) : -1; + }); if (_Idx < 0) __invoke_ub("find_first_set(empty mask) is UB"); if (__builtin_constant_p(_Idx)) diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h index 8851da69800..792439a81bf 100644 --- a/libstdc++-v3/include/experimental/bits/simd_builtin.h +++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h @@ -194,8 +194,11 @@ template > using _Up = decltype(__w); return __intrin_bitcast<_Tp>( __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>( - [](auto... __chunks) { return _Up{__chunks...}; }, - [&](auto __i) { return __w[__shift / __chunksize + __i]; })); + [](auto... 
__chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _Up{__chunks...}; + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __w[__shift / __chunksize + __i]; + })); } } @@ -225,7 +228,9 @@ template // by _Total"); if (__x._M_is_constprop()) return __generate_from_n_evaluations<__return_size, _R>( - [&](auto __i) { return __x[__values_to_skip + __i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __x[__values_to_skip + __i]; + }); if constexpr (_Index == 0 && _Total == 1) return __x; else if constexpr (_Index == 0) @@ -570,7 +575,9 @@ template >( - [&](auto __i) { return static_cast<_To>(__v[__i + _Offset]); }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return static_cast<_To>(__v[__i + _Offset]); + }); } else { @@ -611,13 +618,14 @@ template (__vv); }; [[maybe_unused]] const auto __vi = __to_intrin(__v); - auto&& __make_array = [](auto __x0, [[maybe_unused]] auto __x1) { - if constexpr (_Np == 1) - return _R{__intrin_bitcast<_To>(__x0)}; - else - return _R{__intrin_bitcast<_To>(__x0), - __intrin_bitcast<_To>(__x1)}; - }; + auto&& __make_array + = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (_Np == 1) + return _R{__intrin_bitcast<_To>(__x0)}; + else + return _R{__intrin_bitcast<_To>(__x0), + __intrin_bitcast<_To>(__x1)}; + }; if constexpr (_Np == 0) return _R{}; @@ -642,7 +650,7 @@ template , _Np>( __adjust(_SizeConstant<_Np * 4>(), __v)); return __generate_from_n_evaluations<_Np, _R>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __vector_convert<_To>(__as_wrapper(__ints[__i])); }); } @@ -687,36 +695,40 @@ template (_mm_unpacklo_epi16(__vv[1], __vv[1])), __vector_bitcast(_mm_unpackhi_epi16(__vv[1], __vv[1]))}; if constexpr (sizeof(_ToT) == 4) - return __generate_from_n_evaluations<_Np, _R>([&](auto __i) { - return __vector_convert<_To>( - _SimdWrapper(__vvvv[__i] >> 24)); - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __vector_convert<_To>( + _SimdWrapper(__vvvv[__i] >> 24)); + }); else if constexpr (is_integral_v<_ToT>) - return __generate_from_n_evaluations<_Np, _R>([&](auto __i) { - const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); - const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); - return __vector_bitcast<_ToT>( - __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) - : _mm_unpackhi_epi32(__sx32, __signbits)); - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); + const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); + return __vector_bitcast<_ToT>( + __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) + : _mm_unpackhi_epi32(__sx32, __signbits)); + }); else - return __generate_from_n_evaluations<_Np, _R>([&](auto __i) { - const _SimdWrapper __int4 = __vvvv[__i / 2] >> 24; - return __vector_convert<_To>( - __i % 2 == 0 ? __int4 - : _SimdWrapper( - _mm_unpackhi_epi64(__to_intrin(__int4), - __to_intrin(__int4)))); - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + const _SimdWrapper __int4 = __vvvv[__i / 2] >> 24; + return __vector_convert<_To>( + __i % 2 == 0 ? 
__int4 + : _SimdWrapper( + _mm_unpackhi_epi64(__to_intrin(__int4), + __to_intrin(__int4)))); + }); } else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4) { const auto __shorts = __convert_all<__vector_type16_t< conditional_t, short, unsigned short>>>( __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v)); - return __generate_from_n_evaluations<_Np, _R>([&](auto __i) { - return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; + }); } else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8 && is_signed_v<_FromT> && is_integral_v<_ToT>) @@ -736,9 +748,10 @@ template ( _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16), _mm_srai_epi32(__vv[1], 31)))}; - return __generate_from_n_evaluations<_Np, _R>([&](auto __i) { - return __vector_bitcast<_ToT>(__vvvv[__i]); - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __vector_bitcast<_ToT>(__vvvv[__i]); + }); } else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8) { @@ -747,9 +760,10 @@ template || is_floating_point_v<_ToT>, int, unsigned int>>>( __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v)); - return __generate_from_n_evaluations<_Np, _R>([&](auto __i) { - return __convert_all<_To>(__ints[__i / 2])[__i % 2]; - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __convert_all<_To>(__ints[__i / 2])[__i % 2]; + }); } else __assert_unreachable<_To>(); @@ -779,14 +793,14 @@ template (__v))}; else - return __generate_from_n_evaluations<_Np, _R>([&]( - auto __i) constexpr { - auto __part - = __extract_part<__i * _ToVT::_S_full_size + _Offset, - _FromVT::_S_partial_width, - _ToVT::_S_full_size>(__v); - return __vector_convert<_To>(__part); - }); + return __generate_from_n_evaluations<_Np, _R>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + auto __part + = __extract_part<__i * _ToVT::_S_full_size + _Offset, + _FromVT::_S_partial_width, + _ToVT::_S_full_size>(__v); + return __vector_convert<_To>(__part); + }); } else if constexpr (_Offset == 0) return array<_To, 1>{__vector_convert<_To>(__v)}; @@ -1017,8 +1031,9 @@ template else { constexpr auto __size = _S_size<_Tp>; - _GLIBCXX_SIMD_USE_CONSTEXPR auto __r = __generate_vector<_UV>( - [](auto __i) constexpr { return __i < __size ? -1 : 0; }); + _GLIBCXX_SIMD_USE_CONSTEXPR auto __r + = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return __i < __size ? 
-1 : 0; }); return __r; } } @@ -1208,7 +1223,7 @@ template if constexpr (is_integral_v) return __x | __generate_vector<_Tp, _S_full_size<_Tp>>( - [](auto __i) -> _Tp { + [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp { if (__i < _Np) return 0; else @@ -1348,26 +1363,27 @@ struct _CommonImplBuiltin } else { - __execute_n_times<__div_roundup(_Np, 4)>([&](auto __i) { - constexpr int __offset = __i * 4; - constexpr int __remaining = _Np - __offset; - if constexpr (__remaining > 4 && __remaining <= 7) - { - const _ULLong __bool7 - = (__x.template _M_extract<__offset>()._M_to_bits() - * 0x40810204081ULL) - & 0x0101010101010101ULL; - _S_store<__remaining>(__bool7, __mem + __offset); - } - else if constexpr (__remaining >= 4) - { - int __bits = __x.template _M_extract<__offset>()._M_to_bits(); - if constexpr (__remaining > 7) - __bits &= 0xf; - const int __bool4 = (__bits * 0x204081) & 0x01010101; - _S_store<4>(__bool4, __mem + __offset); - } - }); + __execute_n_times<__div_roundup(_Np, 4)>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + constexpr int __offset = __i * 4; + constexpr int __remaining = _Np - __offset; + if constexpr (__remaining > 4 && __remaining <= 7) + { + const _ULLong __bool7 + = (__x.template _M_extract<__offset>()._M_to_bits() + * 0x40810204081ULL) + & 0x0101010101010101ULL; + _S_store<__remaining>(__bool7, __mem + __offset); + } + else if constexpr (__remaining >= 4) + { + int __bits = __x.template _M_extract<__offset>()._M_to_bits(); + if constexpr (__remaining > 7) + __bits &= 0xf; + const int __bool4 = (__bits * 0x204081) & 0x01010101; + _S_store<4>(__bool4, __mem + __offset); + } + }); } } @@ -1434,13 +1450,13 @@ template inline static constexpr _SimdMember<_Tp> _S_generator(_Fp&& __gen, _TypeTag<_Tp>) { - return __generate_vector<_Tp, _S_full_size<_Tp>>([&]( - auto __i) constexpr { - if constexpr (__i < _S_size<_Tp>) - return __gen(__i); - else - return 0; - }); + return __generate_vector<_Tp, _S_full_size<_Tp>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (__i < _S_size<_Tp>) + return __gen(__i); + else + return 0; + }); } // _S_load {{{2 @@ -1455,10 +1471,10 @@ template : 16; constexpr size_t __bytes_to_load = sizeof(_Up) * _Np; if constexpr (sizeof(_Up) > 8) - return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&]( - auto __i) constexpr { - return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); - }); + return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); + }); else if constexpr (is_same_v<_Up, _Tp>) return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, _Np * sizeof(_Tp)>(__mem); @@ -1470,13 +1486,12 @@ template constexpr size_t __n_loads = __bytes_to_load / __max_load_size; constexpr size_t __elements_per_load = _Np / __n_loads; return __call_with_n_evaluations<__n_loads>( - [](auto... __uncvted) { - return __convert<_SimdMember<_Tp>>(__uncvted...); - }, - [&](auto __i) { - return _CommonImpl::template _S_load<_Up, __elements_per_load>( - __mem + __i * __elements_per_load); - }); + [](auto... 
__uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __convert<_SimdMember<_Tp>>(__uncvted...); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _CommonImpl::template _S_load<_Up, __elements_per_load>( + __mem + __i * __elements_per_load); + }); } else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 && __max_load_size > 16) @@ -1485,20 +1500,19 @@ template = __bytes_to_load / (__max_load_size / 2); constexpr size_t __elements_per_load = _Np / __n_loads; return __call_with_n_evaluations<__n_loads>( - [](auto... __uncvted) { - return __convert<_SimdMember<_Tp>>(__uncvted...); - }, - [&](auto __i) { - return _CommonImpl::template _S_load<_Up, __elements_per_load>( - __mem + __i * __elements_per_load); - }); + [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __convert<_SimdMember<_Tp>>(__uncvted...); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _CommonImpl::template _S_load<_Up, __elements_per_load>( + __mem + __i * __elements_per_load); + }); } else // e.g. int[] -> return __call_with_subscripts( - __mem, make_index_sequence<_Np>(), [](auto... __args) { - return __vector_type_t<_Tp, _S_full_size<_Tp>>{ - static_cast<_Tp>(__args)...}; - }); + __mem, make_index_sequence<_Np>(), + [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; + }); } // _S_masked_load {{{2 @@ -1507,9 +1521,10 @@ template _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, const _Up* __mem) noexcept { - _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), [&](auto __i) { - __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); - }); + _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); + }); return __merge; } @@ -1523,7 +1538,7 @@ template constexpr size_t __max_store_size = _SuperImpl::template _S_max_store_size<_Up>; if constexpr (sizeof(_Up) > 8) - __execute_n_times<_Np>([&](auto __i) constexpr { + __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; }); else if constexpr (is_same_v<_Up, _Tp>) @@ -1540,9 +1555,10 @@ template using _V = __vector_type_t<_Up, __vsize>; const array<_V, __stores> __converted = __convert_all<_V, __stores>(__v); - __execute_n_times<__full_stores>([&](auto __i) constexpr { - _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); - }); + __execute_n_times<__full_stores>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); + }); if constexpr (__full_stores < __stores) _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) * sizeof(_Up)>( @@ -1557,7 +1573,8 @@ template _MaskMember<_Tp> __k) { _BitOps::_S_bit_iteration( - _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr { + _MaskImpl::_S_to_bits(__k), + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; }); } @@ -1579,7 +1596,7 @@ template _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up))) { // bitwise or no conversion, reinterpret: - const _MaskMember<_Up> __kk = [&]() { + const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (__is_bitmask_v) return _MaskMember<_Up>(__k._M_data); else @@ -1618,7 +1635,7 @@ template constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; const array<_UV, _NAllStores> __converted = __convert_all<_UV, 
_NAllStores>(__v); - __execute_n_times<_NFullStores>([&](auto __i) { + __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _SuperImpl::_S_masked_store_nocvt( _UW(__converted[__i]), __mem + __i * _UW_size, _UAbi::_MaskImpl::template _S_convert< @@ -1637,10 +1654,10 @@ template } } else - _BitOps::_S_bit_iteration( - _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr { - __mem[__i] = static_cast<_Up>(__v[__i]); - }); + _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __mem[__i] = static_cast<_Up>(__v[__i]); + }); } // _S_complement {{{2 @@ -1932,7 +1949,9 @@ template static _Tp _S_##__name(const _Tp& __x, const _More&... __more) \ { \ return __generate_vector<_Tp>( \ - [&](auto __i) { return __name(__x[__i], __more[__i]...); }); \ + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __name(__x[__i], __more[__i]...); \ + }); \ } #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \ @@ -1941,23 +1960,25 @@ template const _More&... __more) \ { \ return __generate_vector<_Tp>( \ - [&](auto __i) { return __name(__x[__i], __more[__i]...); }); \ - } - -#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ - template \ - static auto _S_##__name(const _Tp& __x, const _More&... __more) \ - { \ - return __fixed_size_storage_t<_RetTp, \ - _VectorTraits<_Tp>::_S_partial_width>:: \ - _S_generate([&](auto __meta) constexpr { \ - return __meta._S_generator( \ - [&](auto __i) { \ - return __name(__x[__meta._S_offset + __i], \ - __more[__meta._S_offset + __i]...); \ - }, \ - static_cast<_RetTp*>(nullptr)); \ - }); \ + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __name(__x[__i], __more[__i]...); \ + }); \ + } + +#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ + template \ + static auto _S_##__name(const _Tp& __x, const _More&... __more) \ + { \ + return __fixed_size_storage_t<_RetTp, \ + _VectorTraits<_Tp>::_S_partial_width>:: \ + _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __meta._S_generator( \ + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __name(__x[__meta._S_offset + __i], \ + __more[__meta._S_offset + __i]...); \ + }, \ + static_cast<_RetTp*>(nullptr)); \ + }); \ } _GLIBCXX_SIMD_MATH_FALLBACK(acos) @@ -2010,7 +2031,7 @@ template _S_remquo(const _Tp __x, const _Tp __y, __fixed_size_storage_t* __z) { - return __generate_vector<_Tp>([&](auto __i) { + return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { int __tmp; auto __r = remquo(__x[__i], __y[__i], &__tmp); __z->_M_set(__i, __tmp); @@ -2423,7 +2444,7 @@ template #endif // _GLIBCXX_SIMD_X86INTRIN else if constexpr (__fixed_size_storage_t::_S_tuple_size == 1) return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp), - [](auto... __l) { + [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __make_wrapper(__l...); })}; else @@ -2554,13 +2575,13 @@ struct _MaskImplBuiltinMixin _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) { static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); - return __generate_vector<__vector_type_t<_Up, _ToN>>([&]( - auto __i) constexpr { - if constexpr (__i < _Np) - return __x[__i] ? ~_Up() : _Up(); - else - return _Up(); - }); + return __generate_vector<__vector_type_t<_Up, _ToN>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (__i < _Np) + return __x[__i] ? 
~_Up() : _Up(); + else + return _Up(); + }); } template (__y); else */ { - return __generate_vector<__vector_type_t<_Up, _ToN>>([&]( - auto __i) constexpr { - if constexpr (__i < _Np) - return _Up(__x[__i.value]); - else - return _Up(); - }); + return __generate_vector<__vector_type_t<_Up, _ToN>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (__i < _Np) + return _Up(__x[__i.value]); + else + return _Up(); + }); } } } @@ -2625,7 +2646,9 @@ struct _MaskImplBuiltinMixin = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); _ULLong __r = 0; __execute_n_times<_Np>( - [&](auto __i) { __r |= _ULLong(__bools[__i.value]) << __i; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __r |= _ULLong(__bools[__i.value]) << __i; + }); return __r; } @@ -2677,9 +2700,10 @@ template return __bools > 0; } else - return __generate_vector<_I, _S_size<_Tp>>([&](auto __i) constexpr { - return __mem[__i] ? ~_I() : _I(); - }); + return __generate_vector<_I, _S_size<_Tp>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __mem[__i] ? ~_I() : _I(); + }); } // }}} @@ -2752,7 +2776,7 @@ template // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __tmp._M_set(__i, -__mem[__i]); }); __merge = __wrapper_bitcast<_Tp>(__tmp); @@ -2764,7 +2788,7 @@ template _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept { - __execute_n_times<_Np>([&](auto __i) constexpr { + __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; }); } @@ -2775,10 +2799,10 @@ template _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, const _SimdWrapper<_Tp, _Np> __k) noexcept { - _BitOps::_S_bit_iteration( - _SuperImpl::_S_to_bits(__k), [&](auto __i) constexpr { - __mem[__i] = __v[__i]; - }); + _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k), + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __mem[__i] = __v[__i]; + }); } // _S_from_bitmask{{{2 @@ -2845,7 +2869,7 @@ template { __k = __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( - [&](auto __j) { + [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__i == __j) return _Tp(-__x); else @@ -2890,7 +2914,8 @@ template { return __call_with_subscripts( __data(__k), make_index_sequence<_S_size<_Tp>>(), - [](const auto... __ent) constexpr { return (... && !(__ent == 0)); }); + [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return (... && !(__ent == 0)); }); } // }}} @@ -2901,7 +2926,8 @@ template { return __call_with_subscripts( __data(__k), make_index_sequence<_S_size<_Tp>>(), - [](const auto... __ent) constexpr { return (... || !(__ent == 0)); }); + [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return (... || !(__ent == 0)); }); } // }}} @@ -2912,7 +2938,8 @@ template { return __call_with_subscripts( __data(__k), make_index_sequence<_S_size<_Tp>>(), - [](const auto... __ent) constexpr { return (... && (__ent == 0)); }); + [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return (... 
&& (__ent == 0)); }); } // }}} diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h index 00b91c099ee..3160e251632 100644 --- a/libstdc++-v3/include/experimental/bits/simd_converter.h +++ b/libstdc++-v3/include/experimental/bits/simd_converter.h @@ -121,7 +121,7 @@ template { return __call_with_subscripts( __x, make_index_sequence<_Np>(), - [](auto... __values) constexpr->_Ret { + [](auto... __values) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Ret { return __make_simd_tuple<_To, decltype((void) __values, simd_abi::scalar())...>( static_cast<_To>(__values)...); @@ -233,7 +233,9 @@ template static_assert(_Ret::_FirstAbi::template _S_is_partial<_To>); return _Ret{__generate_from_n_evaluations< _Np, typename _VectorTraits::type>( - [&](auto __i) { return static_cast<_To>(__x[__i]); })}; + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return static_cast<_To>(__x[__i]); + })}; } else { @@ -241,7 +243,7 @@ template constexpr auto __n = __div_roundup(_Ret::_S_first_size, _Arg::_S_first_size); return __call_with_n_evaluations<__n>( - [&__x](auto... __uncvted) { + [&__x](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { // assuming _Arg Abi tags for all __i are _Arg::_FirstAbi _SimdConverter<_From, typename _Arg::_FirstAbi, _To, typename _Ret::_FirstAbi> @@ -255,8 +257,9 @@ template _From, simd_abi::fixed_size<_Np - _Ret::_S_first_size>, _To, simd_abi::fixed_size<_Np - _Ret::_S_first_size>>()( __simd_tuple_pop_front<_Ret::_S_first_size>(__x))}; - }, - [&__x](auto __i) { return __get_tuple_at<__i>(__x); }); + }, [&__x](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __get_tuple_at<__i>(__x); + }); } } }; @@ -322,13 +325,14 @@ template return __vector_convert<__vector_type_t<_To, _Np>>(__x.first); else if constexpr (_Arg::_S_is_homogeneous) return __call_with_n_evaluations<_Arg::_S_tuple_size>( - [](auto... __members) { + [](auto... 
__members) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr ((is_convertible_v && ...)) return __vector_type_t<_To, _Np>{static_cast<_To>(__members)...}; else return __vector_convert<__vector_type_t<_To, _Np>>(__members...); - }, - [&](auto __i) { return __get_tuple_at<__i>(__x); }); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __get_tuple_at<__i>(__x); + }); else if constexpr (__fixed_size_storage_t<_To, _Np>::_S_tuple_size == 1) { _SimdConverter<_From, simd_abi::fixed_size<_Np>, _To, @@ -340,7 +344,7 @@ template { const _SimdWrapper<_From, _Np> __xv = __generate_from_n_evaluations<_Np, __vector_type_t<_From, _Np>>( - [&](auto __i) { return __x[__i]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }); return __vector_convert<__vector_type_t<_To, _Np>>(__xv); } } diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h b/libstdc++-v3/include/experimental/bits/simd_detail.h index 8cabc504863..a0ad10efe0f 100644 --- a/libstdc++-v3/include/experimental/bits/simd_detail.h +++ b/libstdc++-v3/include/experimental/bits/simd_detail.h @@ -262,6 +262,7 @@ #define _GLIBCXX_SIMD_INTRINSIC \ [[__gnu__::__always_inline__, __gnu__::__artificial__]] inline #define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline +#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__)) #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0) #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1) @@ -294,6 +295,8 @@ #ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE #undef _GLIBCXX_SIMD_ALWAYS_INLINE #define _GLIBCXX_SIMD_ALWAYS_INLINE inline +#undef _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA +#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA #undef _GLIBCXX_SIMD_INTRINSIC #define _GLIBCXX_SIMD_INTRINSIC inline #endif diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h index 9ecc8e521ca..3ac6eaa3f6b 100644 --- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h +++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h @@ -434,14 +434,15 @@ template if constexpr (is_same_v<_SimdTuple, __remove_cvref_t<_Tup>>) return __tup.first; else if (__builtin_is_constant_evaluated()) - return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate([&]( - auto __meta) constexpr { - return __meta._S_generator( - [&](auto __i) constexpr { return __tup[__i]; }, - static_cast<_TupT*>(nullptr)); + return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate( + [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __meta._S_generator( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __tup[__i]; + }, static_cast<_TupT*>(nullptr)); }); else - return [&]() { + return [&]() { // not always_inline; allow the compiler to decide __fixed_size_storage_t<_TupT, _S_first_size> __r; __builtin_memcpy(__r._M_as_charptr(), __tup._M_as_charptr(), sizeof(__r)); @@ -515,12 +516,11 @@ template negation>>>) ) { // need to write back at least one of __more after calling __fun - auto&& __first = [&](auto... __args) constexpr - { + auto&& __first = [&](auto... 
__args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { auto __r = __fun(__tuple_element_meta<_Tp, _Abi0, 0>(), first, __args...); [[maybe_unused]] auto&& __ignore_me = {( - [](auto&& __dst, const auto& __src) { + [](auto&& __dst, const auto& __src) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (is_assignable_v) { @@ -530,8 +530,7 @@ template }(static_cast<_More&&>(__more), __args), 0)...}; return __r; - } - (_M_extract_argument(__more)...); + }(_M_extract_argument(__more)...); if constexpr (_S_tuple_size == 1) return {__first}; else @@ -776,18 +775,18 @@ template sizeof...(_VX) == 0, "An array of scalars must be the last argument to __to_simd_tuple"); return __call_with_subscripts( - __from, - make_index_sequence<_NV>(), [&](const auto... __args) constexpr { - return __simd_tuple_concat( - _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>()); - }); + __from, make_index_sequence<_NV>(), + [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __simd_tuple_concat( + _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>()); + }); } else return __call_with_subscripts( - __from, - make_index_sequence<_NV>(), [&](const auto... __args) constexpr { - return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...); - }); + __from, make_index_sequence<_NV>(), + [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...); + }); } template @@ -841,7 +840,7 @@ template ) return {__generate_from_n_evaluations<_R::_S_first_size, typename _R::_FirstType>( - [&](auto __i) { return __x[__i]; }), + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }), __optimize_simd_tuple( __simd_tuple_pop_front<_R::_S_first_size>(__x))}; else if constexpr (is_same_v<_A0, _A1> @@ -994,10 +993,11 @@ template (element_ptr, element_aligned)); #else [[maybe_unused]] constexpr size_t __offset = __values_to_skip; - return __as_vector(simd<_Tp, _RetAbi>([&](auto __i) constexpr { - constexpr _SizeConstant<__i + __offset> __k; - return __x[__k]; - })); + return __as_vector(simd<_Tp, _RetAbi>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + constexpr _SizeConstant<__i + __offset> __k; + return __x[__k]; + })); #endif } @@ -1286,9 +1286,10 @@ template template static constexpr inline _SimdMember<_Tp> _S_broadcast(_Tp __x) noexcept { - return _SimdMember<_Tp>::_S_generate([&](auto __meta) constexpr { - return __meta._S_broadcast(__x); - }); + return _SimdMember<_Tp>::_S_generate( + [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __meta._S_broadcast(__x); + }); } // _S_generator {{{2 @@ -1296,14 +1297,15 @@ template static constexpr inline _SimdMember<_Tp> _S_generator(_Fp&& __gen, _TypeTag<_Tp>) { - return _SimdMember<_Tp>::_S_generate([&__gen](auto __meta) constexpr { - return __meta._S_generator( - [&](auto __i) constexpr { - return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>()) - : 0; - }, - _TypeTag<_Tp>()); - }); + return _SimdMember<_Tp>::_S_generate( + [&__gen](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __meta._S_generator( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __i < _Np ? 
__gen(_SizeConstant<__meta._S_offset + __i>()) + : 0; + }, + _TypeTag<_Tp>()); + }); } // _S_load {{{2 @@ -1311,9 +1313,10 @@ template static inline _SimdMember<_Tp> _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept { - return _SimdMember<_Tp>::_S_generate([&](auto __meta) { - return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>()); - }); + return _SimdMember<_Tp>::_S_generate( + [&](auto __meta) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>()); + }); } // _S_masked_load {{{2 @@ -1323,7 +1326,7 @@ template const _MaskMember __bits, const _Up* __mem) noexcept { auto __merge = __old; - __for_each(__merge, [&](auto __meta, auto& __native) { + __for_each(__merge, [&](auto __meta, auto& __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__meta._S_submask(__bits).any()) #pragma GCC diagnostic push // __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts @@ -1344,7 +1347,7 @@ template static inline void _S_store(const _SimdMember<_Tp>& __v, _Up* __mem, _TypeTag<_Tp>) noexcept { - __for_each(__v, [&](auto __meta, auto __native) { + __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __meta._S_store(__native, &__mem[__meta._S_offset], _TypeTag<_Tp>()); }); } @@ -1355,7 +1358,7 @@ template _Up* __mem, const _MaskMember __bits) noexcept { - __for_each(__v, [&](auto __meta, auto __native) { + __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__meta._S_submask(__bits).any()) #pragma GCC diagnostic push // __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts @@ -1376,7 +1379,7 @@ template { _MaskMember __bits = 0; __for_each( - __x, [&__bits](auto __meta, auto __native) constexpr { + __x, [&__bits](auto __meta, auto __native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __bits |= __meta._S_mask_to_shifted_ullong(__meta._S_negate(__native)); }); @@ -1414,7 +1417,7 @@ template { const auto& __x2 = __call_with_n_evaluations< __div_roundup(_Tup::_S_tuple_size, 2)>( - [](auto __first_simd, auto... __remaining) { + [](auto __first_simd, auto... 
__remaining) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (sizeof...(__remaining) == 0) return __first_simd; else @@ -1428,7 +1431,7 @@ template __make_simd_tuple(__first_simd, __remaining...)); } }, - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { auto __left = __tup.template _M_simd_at<2 * __i>(); if constexpr (2 * __i + 1 == _Tup::_S_tuple_size) return __left; @@ -1444,7 +1447,9 @@ template _GLIBCXX_SIMD_USE_CONSTEXPR_API typename _LT::mask_type __k( __private_init, - [](auto __j) constexpr { return __j < _RT::size(); }); + [](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __j < _RT::size(); + }); _LT __ext_right = __left; where(__k, __ext_right) = __proposed::resizing_simd_cast<_LT>(__right); @@ -1464,7 +1469,7 @@ template const _SimdTuple<_Tp, _As...>& __b) { return __a._M_apply_per_chunk( - [](auto __impl, auto __aa, auto __bb) constexpr { + [](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __impl._S_min(__aa, __bb); }, __b); @@ -1476,7 +1481,7 @@ template const _SimdTuple<_Tp, _As...>& __b) { return __a._M_apply_per_chunk( - [](auto __impl, auto __aa, auto __bb) constexpr { + [](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __impl._S_max(__aa, __bb); }, __b); @@ -1487,9 +1492,10 @@ template static inline constexpr _SimdTuple<_Tp, _As...> _S_complement(const _SimdTuple<_Tp, _As...>& __x) noexcept { - return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr { - return __impl._S_complement(__xx); - }); + return __x._M_apply_per_chunk( + [](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __impl._S_complement(__xx); + }); } // _S_unary_minus {{{2 @@ -1497,23 +1503,24 @@ template static inline constexpr _SimdTuple<_Tp, _As...> _S_unary_minus(const _SimdTuple<_Tp, _As...>& __x) noexcept { - return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr { - return __impl._S_unary_minus(__xx); - }); + return __x._M_apply_per_chunk( + [](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __impl._S_unary_minus(__xx); + }); } // arithmetic operators {{{2 -#define _GLIBCXX_SIMD_FIXED_OP(name_, op_) \ - template \ - static inline constexpr _SimdTuple<_Tp, _As...> name_( \ - const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)\ - { \ - return __x._M_apply_per_chunk( \ - [](auto __impl, auto __xx, auto __yy) constexpr { \ - return __impl.name_(__xx, __yy); \ - }, \ - __y); \ +#define _GLIBCXX_SIMD_FIXED_OP(name_, op_) \ + template \ + static inline constexpr _SimdTuple<_Tp, _As...> name_( \ + const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y) \ + { \ + return __x._M_apply_per_chunk( \ + [](auto __impl, auto __xx, auto __yy) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __impl.name_(__xx, __yy); \ + }, \ + __y); \ } _GLIBCXX_SIMD_FIXED_OP(_S_plus, +) @@ -1532,18 +1539,20 @@ template static inline constexpr _SimdTuple<_Tp, _As...> _S_bit_shift_left(const _SimdTuple<_Tp, _As...>& __x, int __y) { - return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr { - return __impl._S_bit_shift_left(__xx, __y); - }); + return __x._M_apply_per_chunk( + [__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __impl._S_bit_shift_left(__xx, __y); + }); } template static inline constexpr _SimdTuple<_Tp, _As...> _S_bit_shift_right(const _SimdTuple<_Tp, _As...>& __x, int __y) { - return __x._M_apply_per_chunk([__y](auto 
__impl, auto __xx) constexpr { - return __impl._S_bit_shift_right(__xx, __y); - }); + return __x._M_apply_per_chunk( + [__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __impl._S_bit_shift_right(__xx, __y); + }); } // math {{{2 @@ -1557,35 +1566,40 @@ template { \ if constexpr (is_same_v<_Tp, _RetTp>) \ return __x._M_apply_per_chunk( \ - [](auto __impl, auto __xx) constexpr { \ - using _V = typename decltype(__impl)::simd_type; \ - return __data(__name(_V(__private_init, __xx))); \ - }); \ + [](auto __impl, auto __xx) \ + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \ + { \ + using _V = typename decltype(__impl)::simd_type; \ + return __data(__name(_V(__private_init, __xx))); \ + }); \ else \ return __optimize_simd_tuple( \ - __x.template _M_apply_r<_RetTp>([](auto __impl, auto __xx) { \ - return __impl._S_##__name(__xx); \ - })); \ + __x.template _M_apply_r<_RetTp>( \ + [](auto __impl, auto __xx) \ + _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \ + { return __impl._S_##__name(__xx); })); \ } \ else if constexpr ( \ is_same_v< \ _Tp, \ _RetTp> && (... && is_same_v<_SimdTuple<_Tp, _As...>, _More>) ) \ return __x._M_apply_per_chunk( \ - [](auto __impl, auto __xx, auto... __pack) constexpr { \ - using _V = typename decltype(__impl)::simd_type; \ - return __data(__name(_V(__private_init, __xx), \ - _V(__private_init, __pack)...)); \ - }, \ - __more...); \ + [](auto __impl, auto __xx, auto... __pack) \ + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \ + { \ + using _V = typename decltype(__impl)::simd_type; \ + return __data(__name(_V(__private_init, __xx), \ + _V(__private_init, __pack)...)); \ + }, __more...); \ else if constexpr (is_same_v<_Tp, _RetTp>) \ return __x._M_apply_per_chunk( \ - [](auto __impl, auto __xx, auto... __pack) constexpr { \ - using _V = typename decltype(__impl)::simd_type; \ - return __data(__name(_V(__private_init, __xx), \ - __autocvt_to_simd(__pack)...)); \ - }, \ - __more...); \ + [](auto __impl, auto __xx, auto... 
__pack) \ + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \ + { \ + using _V = typename decltype(__impl)::simd_type; \ + return __data(__name(_V(__private_init, __xx), \ + __autocvt_to_simd(__pack)...)); \ + }, __more...); \ else \ __assert_unreachable<_Tp>(); \ } @@ -1657,10 +1671,10 @@ template __fixed_size_storage_t::_S_size()>* __z) { return __x._M_apply_per_chunk( - [](auto __impl, const auto __xx, const auto __yy, auto& __zz) { - return __impl._S_remquo(__xx, __yy, &__zz); - }, - __y, *__z); + [](auto __impl, const auto __xx, const auto __yy, auto& __zz) + _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return __impl._S_remquo(__xx, __yy, &__zz); }, + __y, *__z); } template @@ -1669,12 +1683,10 @@ template __fixed_size_storage_t& __exp) noexcept { return __x._M_apply_per_chunk( - [](auto __impl, const auto& __a, auto& __b) { - return __data( - frexp(typename decltype(__impl)::simd_type(__private_init, __a), - __autocvt_to_simd(__b))); - }, - __exp); + [](auto __impl, const auto& __a, auto& __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __data(frexp(typename decltype(__impl)::simd_type(__private_init, __a), + __autocvt_to_simd(__b))); + }, __exp); } #define _GLIBCXX_SIMD_TEST_ON_TUPLE_(name_) \ @@ -1700,7 +1712,7 @@ template _S_increment(_SimdTuple<_Ts...>& __x) { __for_each( - __x, [](auto __meta, auto& native) constexpr { + __x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __meta._S_increment(native); }); } @@ -1710,7 +1722,7 @@ template _S_decrement(_SimdTuple<_Ts...>& __x) { __for_each( - __x, [](auto __meta, auto& native) constexpr { + __x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __meta._S_decrement(native); }); } @@ -1722,11 +1734,10 @@ template __cmp(const _SimdTuple<_Tp, _As...>& __x, \ const _SimdTuple<_Tp, _As...>& __y) \ { \ - return _M_test( \ - [](auto __impl, auto __xx, auto __yy) constexpr { \ - return __impl.__cmp(__xx, __yy); \ - }, \ - __x, __y); \ + return _M_test([](auto __impl, auto __xx, auto __yy) \ + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \ + { return __impl.__cmp(__xx, __yy); }, \ + __x, __y); \ } _GLIBCXX_SIMD_CMP_OPERATIONS(_S_equal_to) @@ -1753,12 +1764,13 @@ template _S_masked_assign(const _MaskMember __bits, _SimdTuple<_Tp, _As...>& __lhs, const __type_identity_t<_SimdTuple<_Tp, _As...>>& __rhs) { - __for_each( - __lhs, __rhs, - [&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr { - __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs, - __native_rhs); - }); + __for_each(__lhs, __rhs, + [&](auto __meta, auto& __native_lhs, auto __native_rhs) + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { + __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs, + __native_rhs); + }); } // Optimization for the case where the RHS is a scalar. 
No need to broadcast @@ -1769,7 +1781,7 @@ template const __type_identity_t<_Tp> __rhs) { __for_each( - __lhs, [&](auto __meta, auto& __native_lhs) constexpr { + __lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs, __rhs); }); @@ -1782,12 +1794,13 @@ template const _SimdTuple<_Tp, _As...>& __rhs, _Op __op) { - __for_each( - __lhs, __rhs, - [&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr { - __meta.template _S_masked_cassign(__meta._S_make_mask(__bits), - __native_lhs, __native_rhs, __op); - }); + __for_each(__lhs, __rhs, + [&](auto __meta, auto& __native_lhs, auto __native_rhs) + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { + __meta.template _S_masked_cassign(__meta._S_make_mask(__bits), + __native_lhs, __native_rhs, __op); + }); } // Optimization for the case where the RHS is a scalar. No need to broadcast @@ -1798,7 +1811,7 @@ template const _Tp& __rhs, _Op __op) { __for_each( - __lhs, [&](auto __meta, auto& __native_lhs) constexpr { + __lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __meta.template _S_masked_cassign(__meta._S_make_mask(__bits), __native_lhs, __rhs, __op); }); @@ -1899,7 +1912,7 @@ template // _Np _UShort, _UInt, _ULLong, float, and double can be more efficient. _ULLong __r = 0; using _Vs = __fixed_size_storage_t<_UChar, _Np>; - __for_each(_Vs{}, [&](auto __meta, auto) { + __for_each(_Vs{}, [&](auto __meta, auto) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r |= __meta._S_mask_to_shifted_ullong( __meta._S_mask_impl._S_load(&__mem[__meta._S_offset], _SizeConstant<__meta._S_size()>())); @@ -1912,9 +1925,10 @@ template _MaskMember __mask, const bool* __mem) noexcept { - _BitOps::_S_bit_iteration(__mask.to_ullong(), [&](auto __i) { - __merge.set(__i, __mem[__i]); - }); + _BitOps::_S_bit_iteration(__mask.to_ullong(), + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __merge.set(__i, __mem[__i]); + }); return __merge; } @@ -1932,7 +1946,8 @@ template static inline void _S_masked_store(const _MaskMember __v, bool* __mem, const _MaskMember __k) noexcept { - _BitOps::_S_bit_iteration(__k, [&](auto __i) { __mem[__i] = __v[__i]; }); + _BitOps::_S_bit_iteration( + __k, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; }); } // logical and bitwise operators {{{2 diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h index 2aff8ff5fa4..c20315e4e30 100644 --- a/libstdc++-v3/include/experimental/bits/simd_math.h +++ b/libstdc++-v3/include/experimental/bits/simd_math.h @@ -788,7 +788,7 @@ template // __exponent(__x) returns the exponent value (bias removed) as // simd<_Up> with integral _Up - auto&& __exponent = [](const _V& __v) { + auto&& __exponent = [](const _V& __v) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { using namespace std::experimental::__proposed; using _IV = rebind_simd_t< conditional_t, _V>; @@ -931,7 +931,7 @@ template { return {__private_init, __data(__arg0)._M_apply_per_chunk( - [&](auto __impl, const auto&... __inner) { + [&](auto __impl, const auto&... 
__inner) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { using _V = typename decltype(__impl)::simd_type; return __data(__apply(_V(__private_init, __inner)...)); }, @@ -1092,8 +1092,9 @@ _GLIBCXX_SIMD_CVTING2(hypot) if constexpr (__is_fixed_size_abi_v<_Abi> && _V::size() > 1) { return __fixed_size_apply>( - [](auto __a, auto __b, auto __c) { return hypot(__a, __b, __c); }, - __x, __y, __z); + [](auto __a, auto __b, auto __c) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return hypot(__a, __b, __c); + }, __x, __y, __z); } else { @@ -1380,9 +1381,9 @@ template const fixed_size_simd>& __m, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>([&](auto __i) { - return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]); - }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]); + }); } template @@ -1391,9 +1392,9 @@ template const fixed_size_simd>& __m, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>([&](auto __i) { - return std::assoc_legendre(__n[__i], __m[__i], __x[__i]); - }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::assoc_legendre(__n[__i], __m[__i], __x[__i]); + }); } _GLIBCXX_SIMD_MATH_CALL2_(beta, _Tp) @@ -1414,8 +1415,9 @@ template hermite(const fixed_size_simd>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::hermite(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::hermite(__n[__i], __x[__i]); + }); } template @@ -1423,8 +1425,9 @@ template laguerre(const fixed_size_simd>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::laguerre(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::laguerre(__n[__i], __x[__i]); + }); } template @@ -1432,8 +1435,9 @@ template legendre(const fixed_size_simd>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::legendre(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::legendre(__n[__i], __x[__i]); + }); } _GLIBCXX_SIMD_MATH_CALL_(riemann_zeta) @@ -1443,8 +1447,9 @@ template sph_bessel(const fixed_size_simd>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::sph_bessel(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::sph_bessel(__n[__i], __x[__i]); + }); } template @@ -1453,9 +1458,9 @@ template const fixed_size_simd>& __m, const simd<_Tp, _Abi>& theta) { - return simd<_Tp, _Abi>([&](auto __i) { - return std::assoc_legendre(__l[__i], __m[__i], theta[__i]); - }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::assoc_legendre(__l[__i], __m[__i], theta[__i]); + }); } template @@ -1463,8 +1468,9 @@ template sph_neumann(const fixed_size_simd>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::sph_neumann(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::sph_neumann(__n[__i], __x[__i]); + }); } // }}} diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h index 8429c252196..7e4cb17b205 100644 --- a/libstdc++-v3/include/experimental/bits/simd_neon.h +++ b/libstdc++-v3/include/experimental/bits/simd_neon.h @@ -61,7 +61,7 
@@ template _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, const _Up* __mem) noexcept { - __execute_n_times<_Np>([&](auto __i) { + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__k[__i] != 0) __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); }); @@ -75,7 +75,7 @@ template _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) { - __execute_n_times<_Np>([&](auto __i) { + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__k[__i] != 0) __mem[__i] = __v[__i]; }); @@ -286,7 +286,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>( __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0); }); @@ -306,7 +306,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 1 << __i : 0); }); __asint &= __bitsel; @@ -322,7 +322,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 1 << __i : 0); }); __asint &= __bitsel; @@ -346,7 +346,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 1 << __i : 0); }); __asint &= __bitsel; @@ -361,7 +361,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 
1 << __i : 0); }); __asint &= __bitsel; diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h index 0f4aa95e1a4..60e80d394ba 100644 --- a/libstdc++-v3/include/experimental/bits/simd_x86.h +++ b/libstdc++-v3/include/experimental/bits/simd_x86.h @@ -537,16 +537,17 @@ struct _CommonImplX86 : _CommonImplBuiltin _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem) { if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL - _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr { - if constexpr (_Np <= 16) - return _mm_movm_epi8(__x._M_to_bits()); - else if constexpr (_Np <= 32) - return _mm256_movm_epi8(__x._M_to_bits()); - else if constexpr (_Np <= 64) - return _mm512_movm_epi8(__x._M_to_bits()); - else - __assert_unreachable<_SizeConstant<_Np>>(); - }()), + _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>( + [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (_Np <= 16) + return _mm_movm_epi8(__x._M_to_bits()); + else if constexpr (_Np <= 32) + return _mm256_movm_epi8(__x._M_to_bits()); + else if constexpr (_Np <= 64) + return _mm512_movm_epi8(__x._M_to_bits()); + else + __assert_unreachable<_SizeConstant<_Np>>(); + }()), __mem); else if constexpr (__have_bmi2) { @@ -554,7 +555,7 @@ struct _CommonImplX86 : _CommonImplBuiltin _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem); else __execute_n_times<__div_roundup(_Np, sizeof(size_t))>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { constexpr size_t __offset = __i * sizeof(size_t); constexpr int __todo = std::min(sizeof(size_t), _Np - __offset); if constexpr (__todo == 1) @@ -575,7 +576,7 @@ struct _CommonImplX86 : _CommonImplBuiltin }); } else if constexpr (__have_sse2 && _Np > 7) - __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) { + __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { constexpr int __offset = __i * 16; constexpr int __todo = std::min(16, int(_Np) - __offset); const int __bits = __x.template _M_extract<__offset>()._M_to_bits(); @@ -765,9 +766,10 @@ struct _CommonImplX86 : _CommonImplBuiltin static_assert(is_same_v<_Tp, _Tp> && __have_avx512f); if (__k._M_is_constprop() && __at0._M_is_constprop() && __at1._M_is_constprop()) - return __generate_from_n_evaluations<_Np, - __vector_type_t<_Tp, _Np>>([&]( - auto __i) constexpr { return __k[__i] ? __at1[__i] : __at0[__i]; }); + return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __k[__i] ? __at1[__i] : __at0[__i]; + }); else if constexpr (sizeof(__at0) == 64 || (__have_avx512vl && sizeof(__at0) >= 16)) return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data); @@ -994,9 +996,8 @@ template } else _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), - [&](auto __i) { - __merge._M_set(__i, static_cast<_Tp>( - __mem[__i])); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); }); } /* Very uncertain, that the following improves anything. Needs @@ -1417,11 +1418,12 @@ template const auto __yf = __convert_all<_FloatV, __n_floatv>( _Abi::__make_padding_nonzero(__as_vector(__y))); return __call_with_n_evaluations<__n_floatv>( - [](auto... __quotients) { + [](auto... 
__quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __vector_convert<_R>(__quotients...); }, - [&__xf, - &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> { + [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + -> _SimdWrapper<_Float, __n_intermediate> + { #if !defined __clang__ && __GCC_IEC_559 == 0 // If -freciprocal-math is active, using the `/` operator is // incorrect because it may be translated to an imprecise @@ -1980,7 +1982,7 @@ template { auto __mask = __vector_bitcast<_UChar>( __vector_bitcast<_UShort>(__iy) << 5); - auto __maskl = [&]() { + auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); }; auto __xh = __vector_bitcast(__ix); @@ -2067,19 +2069,20 @@ template } //}}} else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ { - [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) { - if constexpr (sizeof(__a) == 16) - return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), - 0xaa); - else if constexpr (sizeof(__a) == 32) - return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), - 0xaa); - else if constexpr (sizeof(__a) == 64) - return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), - __to_intrin(__b)); - else - __assert_unreachable(); - }; + [[maybe_unused]] auto __blend_0xaa + = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (sizeof(__a) == 16) + return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), + 0xaa); + else if constexpr (sizeof(__a) == 32) + return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), + 0xaa); + else if constexpr (sizeof(__a) == 64) + return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), + __to_intrin(__b)); + else + __assert_unreachable(); + }; if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) return __intrin_bitcast<_V>(is_signed_v<_Up> ? _mm_srav_epi16(__ix, __iy) @@ -2136,9 +2139,10 @@ template { auto __k = __vector_bitcast<_UShort>(__iy) << 11; auto __x128 = __vector_bitcast<_Up>(__ix); - auto __mask = [](__vector_type16_t<_UShort> __kk) { - return __vector_bitcast(__kk) < 0; - }; + auto __mask + = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __vector_bitcast(__kk) < 0; + }; // do __x128 = 0 where __y[4] is set __x128 = __mask(__k) ? decltype(__x128)() : __x128; // do __x128 =>> 8 where __y[3] is set @@ -2178,7 +2182,7 @@ template } else { - auto __shift = [](auto __a, auto __b) { + auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (is_signed_v<_Up>) return _mm_sra_epi32(__a, __b); else @@ -3492,7 +3496,7 @@ struct _MaskImplX86Mixin return _S_to_maskvector<_Up, _ToN>(__k); else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) return __generate_from_n_evaluations( - [&](auto __i) -> _Up { return -__x[__i.value]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); else if constexpr (sizeof(_Up) == 1) { if constexpr (sizeof(_UI) == 16) @@ -3737,9 +3741,9 @@ struct _MaskImplX86Mixin else if constexpr (__bits_per_element >= _ToN) { constexpr auto __bitmask - = __generate_vector<_V>([](auto __i) constexpr->_UpUInt { - return __i < _ToN ? 1ull << __i : 0; - }); + = __generate_vector<_V>([](auto __i) + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt + { return __i < _ToN ? 
1ull << __i : 0; }); const auto __bits = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; if constexpr (__bits_per_element > _ToN) @@ -3750,11 +3754,11 @@ struct _MaskImplX86Mixin else { const _V __tmp - = __generate_vector<_V>([&](auto __i) constexpr { + = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_UpUInt>( __k >> (__bits_per_element * (__i / __bits_per_element))); }) - & __generate_vector<_V>([](auto __i) constexpr { + & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_UpUInt>(1ull << (__i % __bits_per_element)); }); // mask bit index @@ -3790,7 +3794,7 @@ struct _MaskImplX86Mixin const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); return __generate_from_n_evaluations>( - [&](auto __i) -> _Up { return __y[__i.value]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); } using _To = __vector_type_t<_Up, _ToN>; [[maybe_unused]] constexpr size_t _FromN = _Np; @@ -4125,8 +4129,11 @@ struct _MaskImplX86Mixin { const auto __bools = -__x._M_data; const _ULLong __k = __call_with_n_evaluations<_Np>( - [](auto... __bits) { return (__bits | ...); }, - [&](auto __i) { return _ULLong(__bools[+__i]) << __i; }); + [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return (__bits | ...); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _ULLong(__bools[+__i]) << __i; + }); if (__builtin_is_constant_evaluated() || __builtin_constant_p(__k)) return __k; @@ -4282,13 +4289,14 @@ template static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); if constexpr (__have_avx512bw) { - const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) { - if constexpr (__is_avx512_abi<_Abi>()) - return __bits; - else - return _S_to_maskvector<_Tp>( - _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); - }; + const auto __to_vec_or_bits + = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { + if constexpr (__is_avx512_abi<_Abi>()) + return __bits; + else + return _S_to_maskvector<_Tp>( + _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); + }; if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) { @@ -4475,7 +4483,7 @@ template } else { - _BitOps::_S_bit_iteration(__mask, [&](auto __i) { + _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __merge._M_set(__i, __mem[__i]); }); return __merge; @@ -4554,7 +4562,7 @@ template { if constexpr (__have_avx512bw_vl) _CommonImplX86::_S_store<_Np>( - __vector_bitcast([](auto __data) { + __vector_bitcast([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (_Np <= 16) return _mm_maskz_set1_epi8(__data, 1); else if constexpr (_Np <= 32)