// Implementation of -*- C++ -*- // Copyright The GNU Toolchain Authors. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the // terms of the GNU General Public License as published by the // Free Software Foundation; either version 3, or (at your option) // any later version. // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // Under Section 7 of GPL version 3, you are granted additional // permissions described in the GCC Runtime Library Exception, version // 3.1, as published by the Free Software Foundation. // You should have received a copy of the GNU General Public License and // a copy of the GCC Runtime Library Exception along with this program; // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see // . #ifndef _GLIBCXX_VEC_OPS_H #define _GLIBCXX_VEC_OPS_H 1 #ifdef _GLIBCXX_SYSHDR #pragma GCC system_header #endif #if __cplusplus >= 202400L #include "simd_details.h" #include #include // psabi warnings are bogus because the ABI of the internal types never leaks into user code #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpsabi" namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION namespace simd { template constexpr bool __signed_has_single_bit(_Tp __x) { return __has_single_bit(make_unsigned_t<_Tp>(__x)); } /** * Alias for a vector builtin with given value type and total sizeof. */ template <__vectorizable _Tp, size_t _Bytes> requires (__has_single_bit(_Bytes)) using __vec_builtin_type_bytes [[__gnu__::__vector_size__(_Bytes)]] = _Tp; /** * Alias for a vector builtin with given value type @p _Tp and @p _Width. */ template <__vectorizable _Tp, __simd_size_type _Width> requires (__signed_has_single_bit(_Width)) using __vec_builtin_type = __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>; /** * Constrain to any vector builtin with given value type and optional width. */ template concept __vec_builtin_of = !is_class_v<_Tp> && !is_pointer_v<_Tp> && !is_arithmetic_v<_Tp> && __vectorizable<_ValueType> && _Width >= 1 && sizeof(_Tp) / sizeof(_ValueType) == _Width && same_as<__vec_builtin_type_bytes<_ValueType, sizeof(_Tp)>, _Tp> && requires(_Tp& __v, _ValueType __x) { __v[0] = __x; }; /** * Constrain to any vector builtin. */ template concept __vec_builtin = __vec_builtin_of<_Tp, remove_cvref_t()[0])>>; /** * Alias for the value type of the given __vec_builtin type @p _Tp. */ template <__vec_builtin _Tp> using __vec_value_type = remove_cvref_t()[0])>; /** * The width (number of value_type elements) of the given vector builtin or arithmetic type. */ template inline constexpr __simd_size_type __width_of = 1; template requires __vec_builtin<_Tp> inline constexpr __simd_size_type __width_of<_Tp> = sizeof(_Tp) / sizeof(__vec_value_type<_Tp>); /** * Alias for a vector builtin with equal value type and new width @p _Np. */ template <__simd_size_type _Np, __vec_builtin _TV> using __resize_vec_builtin_t = __vec_builtin_type<__vec_value_type<_TV>, _Np>; template <__vec_builtin _TV> requires (__width_of<_TV> > 1) using __half_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> / 2, _TV>; template <__vec_builtin _TV> using __double_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> * 2, _TV>; template [[__gnu__::__always_inline__]] constexpr __vec_builtin_type_bytes<_Up, sizeof(_TV)> __vec_bit_cast(_TV __v) { return reinterpret_cast<__vec_builtin_type_bytes<_Up, sizeof(_TV)>>(__v); } template requires signed_integral<__vec_value_type<_TV>> static constexpr _TV _S_vec_implicit_mask = [] (integer_sequence) { return _TV{ (_Is < _Np ? -1 : 0)... }; } (make_integer_sequence>()); /** * Helper function to work around Clang not allowing v[i] in constant expressions. */ template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr __vec_value_type<_TV> __vec_get(_TV __v, int __i) { #ifdef _GLIBCXX_CLANG if consteval { return __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v)[__i]; } else #endif { return __v[__i]; } } /** * Helper function to work around Clang and GCC not allowing assignment to v[i] in constant * expressions. */ template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr void __vec_set(_TV& __v, int __i, __vec_value_type<_TV> __x) { if consteval { #ifdef _GLIBCXX_CLANG auto __arr = __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v); __arr[__i] = __x; __v = __builtin_bit_cast(_TV, __arr); #else constexpr auto [...__j] = _IotaArray<__width_of<_TV>>; __v = _TV{(__i == __j ? __x : __v[__j])...}; #endif } else { __v[__i] = __x; } } /** @internal * Return vector builtin with all values from @p __a and @p __b. */ template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr __vec_builtin_type<__vec_value_type<_TV>, __width_of<_TV> * 2> __vec_concat(_TV __a, _TV __b) { constexpr auto [...__is] = _IotaArray<__width_of<_TV> * 2>; return __builtin_shufflevector(__a, __b, __is...); } /** @internal * Concatenate the first @p _N0 elements from @p __a with the first @p _N1 elements from @p __b * with the elements from applying this function recursively to @p __rest. * * @pre _N0 <= __width_of<_TV0> && _N1 <= __width_of<_TV1> && _Ns <= __width_of<_TVs> && ... * * Strategy: Aim for a power-of-2 tree concat. E.g. * - cat(2, 2, 2, 2) -> cat(4, 2, 2) -> cat(4, 4) * - cat(2, 2, 2, 2, 8) -> cat(4, 2, 2, 8) -> cat(4, 4, 8) -> cat(8, 8) */ template [[__gnu__::__always_inline__]] constexpr __vec_builtin_type<__vec_value_type<_TV0>, __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))> __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest); template requires (__has_single_bit(unsigned(_N0))) && (_N0 >= (_N1 + _N2)) [[__gnu__::__always_inline__]] constexpr __vec_builtin_type<__vec_value_type<_TV0>, __bit_ceil(unsigned(_N0 + _N1 + (_N2 + ... + _Ns)))> __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TV2& __c, const _TVs&... __rest) { return __vec_concat_sized<_N0, _N1 + _N2, _Ns...>( __a, __vec_concat_sized<_N1, _N2>(__b, __c), __rest...); } template [[__gnu__::__always_inline__]] constexpr __vec_builtin_type<__vec_value_type<_TV0>, __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))> __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest) { // __is is rounded up because we need to generate a power-of-2 vector: constexpr auto [...__is] = _IotaArray<__bit_ceil(unsigned(_N0 + _N1)), int>; const auto __ab = __builtin_shufflevector(__a, __b, [](int __i) consteval { if (__i < _N0) // copy from __a return __i; else if (__i < _N0 + _N1) // copy from __b return __i - _N0 + __width_of<_TV0>; // _N0 <= __width_of<_TV0> else // can't index into __rest return -1; // don't care }(__is)...); if constexpr (sizeof...(__rest) == 0) return __ab; else return __vec_concat_sized<_N0 + _N1, _Ns...>(__ab, __rest...); } template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr __half_vec_builtin_t<_TV> __vec_split_lo(_TV __v) { constexpr int __n = __width_of<_TV> / 2; constexpr auto [...__is] = _IotaArray<__n>; return __builtin_shufflevector(__v, __v, __is...); } template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr __half_vec_builtin_t<_TV> __vec_split_hi(_TV __v) { constexpr int __n = __width_of<_TV> / 2; constexpr auto [...__is] = _IotaArray<__n>; return __builtin_shufflevector(__v, __v, (__n + __is)...); } /** @internal * Return @p __x zero-padded to @p _Bytes bytes. * * Use this function when you need two objects of the same size (e.g. for __vec_concat). */ template [[__gnu__::__always_inline__]] constexpr auto __vec_zero_pad_to(_TV __x) { if constexpr (sizeof(_TV) == _Bytes) return __x; else if constexpr (sizeof(_TV) <= sizeof(0ull)) { using _Up = _UInt; __vec_builtin_type_bytes<_Up, _Bytes> __tmp = {__builtin_bit_cast(_Up, __x)}; return __builtin_bit_cast(__vec_builtin_type_bytes<__vec_value_type<_TV>, _Bytes>, __tmp); } else if constexpr (sizeof(_TV) < _Bytes) return __vec_zero_pad_to<_Bytes>(__vec_concat(__x, _TV())); else static_assert(false); } /** @internal * Return a type with sizeof 16, add zero-padding to @p __x. The input must be smaller. * * Use this function instead of the above when you need to pad an argument for a SIMD builtin. */ template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr auto __vec_zero_pad_to_16(_TV __x) { static_assert(sizeof(_TV) < 16); return __vec_zero_pad_to<16>(__x); } // work around __builtin_constant_p returning false unless passed a variable // (__builtin_constant_p(x[0]) is false while __is_const_known(x[0]) is true) template [[__gnu__::__always_inline__]] constexpr bool __is_const_known(const _Tp& __x) { return __builtin_constant_p(__x); } [[__gnu__::__always_inline__]] constexpr bool __is_const_known(const auto&... __xs) requires(sizeof...(__xs) >= 2) { if consteval { return true; } else { return (__is_const_known(__xs) && ...); } } [[__gnu__::__always_inline__]] constexpr bool __is_const_known_equal_to(const auto& __x, const auto& __expect) { return __is_const_known(__x == __expect) && __x == __expect; } #if _GLIBCXX_X86 template <__vec_builtin _UV, __vec_builtin _TV> inline _UV __x86_cvt_f16c(_TV __v); #endif /** @internal * Simple wrapper around __builtin_convertvector to provide static_cast-like syntax. * * Works around GCC failing to use the F16C/AVX512F cvtps2ph/cvtph2ps instructions. */ template <__vec_builtin _UV, __vec_builtin _TV, _ArchTraits _Traits = {}> [[__gnu__::__always_inline__]] constexpr _UV __vec_cast(_TV __v) { static_assert(__width_of<_UV> == __width_of<_TV>); #if _GLIBCXX_X86 using _Up = __vec_value_type<_UV>; using _Tp = __vec_value_type<_TV>; constexpr bool __to_f16 = is_same_v<_Up, _Float16>; constexpr bool __from_f16 = is_same_v<_Tp, _Float16>; constexpr bool __needs_f16c = _Traits._M_have_f16c() && !_Traits._M_have_avx512fp16() && (__to_f16 || __from_f16); if (__needs_f16c && !__is_const_known(__v)) { // Work around PR121688 if constexpr (__needs_f16c) return __x86_cvt_f16c<_UV>(__v); } if constexpr (is_floating_point_v<_Tp> && is_integral_v<_Up> && sizeof(_UV) < sizeof(_TV) && sizeof(_Up) < sizeof(int)) { using _Ip = __integer_from; using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>; return __vec_cast<_UV>(__vec_cast<_IV>(__v)); } #endif return __builtin_convertvector(__v, _UV); } /** @internal * Overload of the above cast function that determines the destination vector type from a given * element type @p _Up and the `__width_of` the argument type. * * Calls the above overload. */ template <__vectorizable _Up, __vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr __vec_builtin_type<_Up, __width_of<_TV>> __vec_cast(_TV __v) { return __vec_cast<__vec_builtin_type<_Up, __width_of<_TV>>>(__v); } /** @internal * As above, but with additional precondition on possible values of the argument. * * Precondition: __k[i] is either 0 or -1 for all i. */ template <__vec_builtin _UV, __vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr _UV __vec_mask_cast(_TV __k) { static_assert(signed_integral<__vec_value_type<_UV>>); static_assert(signed_integral<__vec_value_type<_TV>>); // TODO: __builtin_convertvector cannot be optimal because it doesn't consider input and // output can only be 0 or -1. return __builtin_convertvector(__k, _UV); } template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr _TV __vec_xor(_TV __a, _TV __b) { using _Tp = __vec_value_type<_TV>; if constexpr (is_floating_point_v<_Tp>) { using _UV = __vec_builtin_type<__integer_from, __width_of<_TV>>; return __builtin_bit_cast( _TV, __builtin_bit_cast(_UV, __a) ^ __builtin_bit_cast(_UV, __b)); } else return __a ^ __b; } template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr _TV __vec_or(_TV __a, _TV __b) { using _Tp = __vec_value_type<_TV>; if constexpr (is_floating_point_v<_Tp>) { using _UV = __vec_builtin_type<__integer_from, __width_of<_TV>>; return __builtin_bit_cast( _TV, __builtin_bit_cast(_UV, __a) | __builtin_bit_cast(_UV, __b)); } else return __a | __b; } template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr _TV __vec_and(_TV __a, _TV __b) { using _Tp = __vec_value_type<_TV>; if constexpr (is_floating_point_v<_Tp>) { using _UV = __vec_builtin_type<__integer_from, __width_of<_TV>>; return __builtin_bit_cast( _TV, __builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b)); } else return __a & __b; } /** @internal * Returns the bit-wise and of not @p __a and @p __b. * * Use __vec_and(__vec_not(__a), __b) unless an andnot instruction is necessary for optimization. * * @see __vec_andnot in simd_x86.h */ template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr _TV __vec_andnot(_TV __a, _TV __b) { using _Tp = __vec_value_type<_TV>; using _UV = __vec_builtin_type<__integer_from, __width_of<_TV>>; return __builtin_bit_cast( _TV, ~__builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b)); } template <__vec_builtin _TV> [[__gnu__::__always_inline__]] constexpr _TV __vec_not(_TV __a) { using _Tp = __vec_value_type<_TV>; using _UV = __vec_builtin_type_bytes<__integer_from, sizeof(_TV)>; if constexpr (is_floating_point_v<__vec_value_type<_TV>>) return __builtin_bit_cast(_TV, ~__builtin_bit_cast(_UV, __a)); else return ~__a; } /** * An object of given type where only the sign bits are 1. */ template <__vec_builtin _V> requires std::floating_point<__vec_value_type<_V>> constexpr _V _S_signmask = __vec_xor(_V() + 1, _V() - 1); template <__vec_builtin _TV, int _Np = __width_of<_TV>, typename = make_integer_sequence> struct _VecOps; template <__vec_builtin _TV, int _Np, int... _Is> struct _VecOps<_TV, _Np, integer_sequence> { static_assert(_Np <= __width_of<_TV>); using _Tp = __vec_value_type<_TV>; using _HV = __half_vec_builtin_t<__conditional_t<_Np >= 2, _TV, __double_vec_builtin_t<_TV>>>; [[__gnu__::__always_inline__]] static constexpr _TV _S_broadcast_to_even(_Tp __init) { return _TV {((_Is & 1) == 0 ? __init : _Tp())...}; } [[__gnu__::__always_inline__]] static constexpr _TV _S_broadcast_to_odd(_Tp __init) { return _TV {((_Is & 1) == 1 ? __init : _Tp())...}; } [[__gnu__::__always_inline__]] static constexpr bool _S_all_of(_TV __k) noexcept { return (... && (__k[_Is] != 0)); } [[__gnu__::__always_inline__]] static constexpr bool _S_any_of(_TV __k) noexcept { return (... || (__k[_Is] != 0)); } [[__gnu__::__always_inline__]] static constexpr bool _S_none_of(_TV __k) noexcept { return (... && (__k[_Is] == 0)); } template > [[__gnu__::__always_inline__]] static constexpr _TV _S_extract(__vec_builtin auto __x, _Offset = {}) { static_assert(is_same_v<__vec_value_type<_TV>, __vec_value_type>); return __builtin_shufflevector(__x, decltype(__x)(), (_Is + _Offset::value)...); } // swap neighboring elements [[__gnu__::__always_inline__]] static constexpr _TV _S_swap_neighbors(_TV __x) { return __builtin_shufflevector(__x, __x, (_Is ^ 1)...); } // duplicate even indexed elements, dropping the odd ones [[__gnu__::__always_inline__]] static constexpr _TV _S_dup_even(_TV __x) { return __builtin_shufflevector(__x, __x, (_Is & ~1)...); } // duplicate odd indexed elements, dropping the even ones [[__gnu__::__always_inline__]] static constexpr _TV _S_dup_odd(_TV __x) { return __builtin_shufflevector(__x, __x, (_Is | 1)...); } [[__gnu__::__always_inline__]] static constexpr void _S_overwrite_even_elements(_TV& __x, _HV __y) requires (_Np > 1) { constexpr __simd_size_type __n = __width_of<_TV>; __x = __builtin_shufflevector(__x, #ifdef _GLIBCXX_CLANG __vec_concat(__y, __y), #else __y, #endif ((_Is & 1) == 0 ? __n + _Is / 2 : _Is)...); } [[__gnu__::__always_inline__]] static constexpr void _S_overwrite_even_elements(_TV& __xl, _TV& __xh, _TV __y) { constexpr __simd_size_type __nl = __width_of<_TV>; constexpr __simd_size_type __nh = __nl * 3 / 2; __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 0 ? __nl + _Is / 2 : _Is)...); __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 0 ? __nh + _Is / 2 : _Is)...); } [[__gnu__::__always_inline__]] static constexpr void _S_overwrite_odd_elements(_TV& __x, _HV __y) requires (_Np > 1) { constexpr __simd_size_type __n = __width_of<_TV>; __x = __builtin_shufflevector(__x, #ifdef _GLIBCXX_CLANG __vec_concat(__y, __y), #else __y, #endif ((_Is & 1) == 1 ? __n + _Is / 2 : _Is)...); } [[__gnu__::__always_inline__]] static constexpr void _S_overwrite_odd_elements(_TV& __xl, _TV& __xh, _TV __y) { constexpr __simd_size_type __nl = __width_of<_TV>; constexpr __simd_size_type __nh = __nl * 3 / 2; __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 1 ? __nl + _Is / 2 : _Is)...); __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 1 ? __nh + _Is / 2 : _Is)...); } // true if all elements are know to be equal to __ref at compile time [[__gnu__::__always_inline__]] static constexpr bool _S_is_const_known_equal_to(_TV __x, _Tp __ref) { return (__is_const_known_equal_to(__x[_Is], __ref) && ...); } }; } // namespace simd _GLIBCXX_END_NAMESPACE_VERSION } // namespace std #pragma GCC diagnostic pop #endif // C++26 #endif // _GLIBCXX_VEC_OPS_H