Files
gcc/libstdc++-v3/include/std/mdspan

2500 lines
77 KiB
Plaintext
Raw Normal View History

// <mdspan> -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file mdspan
* This is a Standard C++ Library header.
*/
#ifndef _GLIBCXX_MDSPAN
#define _GLIBCXX_MDSPAN 1
#ifdef _GLIBCXX_SYSHDR
#pragma GCC system_header
#endif
#include <span>
#include <array>
#include <type_traits>
#include <utility>
#if __cplusplus > 202302L
#include <bits/align.h>
#endif
#define __glibcxx_want_mdspan
#define __glibcxx_want_aligned_accessor
#define __glibcxx_want_submdspan
#include <bits/version.h>
#ifdef __glibcxx_mdspan
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace __mdspan
{
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
// Compile-time predicate: true iff no element of __extents equals
// dynamic_extent.  An empty span yields true.
consteval bool
__all_static(std::span<const size_t> __extents)
{
  bool __no_dynamic = true;
  // Stop scanning as soon as one dynamic extent has been seen.
  for (size_t __i = 0; __no_dynamic && __i < __extents.size(); ++__i)
    __no_dynamic = (__extents[__i] != dynamic_extent);
  return __no_dynamic;
}
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// Compile-time predicate: true iff every element of __extents equals
// dynamic_extent.  An empty span yields true.
consteval bool
__all_dynamic(std::span<const size_t> __extents)
{
  // Advance past the leading run of dynamic extents; the span is fully
  // dynamic exactly when that run covers all of it.
  size_t __i = 0;
  while (__i < __extents.size() && __extents[__i] == dynamic_extent)
    ++__i;
  return __i == __extents.size();
}
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
// Converts __other to _IndexType, checking the value with __glibcxx_assert.
//
// For integral sources the value is asserted to be non-negative (when the
// source type is signed) and not greater than the maximum of _IndexType
// (when the source type can hold larger values).  For any other source the
// value is converted first and, if _IndexType is signed, the result is
// asserted to be non-negative.
//
// _OIndexType is deduced from a forwarding reference, so for an lvalue
// argument it is a reference type.  The integral test is therefore done on
// the cv/ref-stripped type; otherwise an lvalue integer would take the
// generic branch and be narrowed without the upper-bound check.
template<typename _IndexType, typename _OIndexType>
  constexpr _IndexType
  __index_type_cast(_OIndexType&& __other)
  {
    using _OValueType
      = std::remove_cv_t<std::remove_reference_t<_OIndexType>>;

    if constexpr (std::is_integral_v<_OValueType>)
      {
	constexpr _IndexType __index_type_max
	  = __gnu_cxx::__int_traits<_IndexType>::__max;
	constexpr _OValueType __oindex_type_max
	  = __gnu_cxx::__int_traits<_OValueType>::__max;

	// The upper bound can only be exceeded if the source type has a
	// larger maximum than the destination type.
	if constexpr (__index_type_max < __oindex_type_max)
	  __glibcxx_assert(cmp_less_equal(__other, __index_type_max));

	if constexpr (std::is_signed_v<_OValueType>)
	  __glibcxx_assert(__other >= 0);
	return static_cast<_IndexType>(__other);
      }
    else
      {
	// Non-integral source: convert first, then check what can still be
	// checked on the converted value.
	auto __ret = static_cast<_IndexType>(std::move(__other));
	if constexpr (std::is_signed_v<_IndexType>)
	  __glibcxx_assert(__ret >= 0);
	return __ret;
      }
  }
template<array _Extents>
class _StaticExtents
{
public:
// Number of extents, i.e. the size of the _Extents array.
static constexpr size_t _S_rank = _Extents.size();
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// For __r in [0, _S_rank], _S_dynamic_index(__r) is the number
// of dynamic extents up to (and not including) __r.
//
// If __r is the index of a dynamic extent, then
// _S_dynamic_index(__r) is the index of that extent in
// _M_dyn_exts.
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// Returns the entry for __r from the precomputed table
// _S_dynamic_index_data.
static constexpr size_t
_S_dynamic_index(size_t __r) noexcept
{
  const size_t __preceding = _S_dynamic_index_data[__r];
  return __preceding;
}
static constexpr auto _S_dynamic_index_data = [] consteval
{
array<size_t, _S_rank+1> __ret;
size_t __dyn = 0;
for (size_t __i = 0; __i < _S_rank; ++__i)
{
__ret[__i] = __dyn;
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
__dyn += (_Extents[__i] == dynamic_extent);
}
__ret[_S_rank] = __dyn;
return __ret;
}();
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// Total number of dynamic extents: the dynamic-index entry for __r equal
// to _S_rank counts the dynamic extents among all positions.
static constexpr size_t _S_rank_dynamic = _S_dynamic_index(_S_rank);
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// For __r in [0, _S_rank_dynamic), _S_dynamic_index_inv(__r) is the
// index of the __r-th dynamic extent in _Extents.
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// Returns the entry for __r from the precomputed table
// _S_dynamic_index_inv_data.
static constexpr size_t
_S_dynamic_index_inv(size_t __r) noexcept
{
  const size_t __position = _S_dynamic_index_inv_data[__r];
  return __position;
}
static constexpr auto _S_dynamic_index_inv_data = [] consteval
{
array<size_t, _S_rank_dynamic> __ret;
for (size_t __i = 0, __r = 0; __i < _S_rank; ++__i)
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
if (_Extents[__i] == dynamic_extent)
__ret[__r++] = __i;
return __ret;
}();
// The extent recorded in _Extents at position __r; the stored value is
// dynamic_extent for positions that are dynamic.
static constexpr size_t
_S_static_extent(size_t __r) noexcept
{
  return _Extents[__r];
}
};
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
// Specialization selected when __all_dynamic<_Extents>() holds.  Here the
// number of dynamic extents equals the rank, both index maps are the
// identity, and every static extent reads as dynamic_extent, so no lookup
// tables are needed.
template<array _Extents>
  requires (__all_dynamic<_Extents>())
  class _StaticExtents<_Extents>
  {
  public:
    static constexpr size_t _S_rank = _Extents.size();
    static constexpr size_t _S_rank_dynamic = _S_rank;

    // Identity: position __r in the extents is also position __r among
    // the dynamic extents.
    static constexpr size_t
    _S_dynamic_index(size_t __r) noexcept
    {
      return __r;
    }

    // Identity: the __k-th dynamic extent sits at position __k.
    static constexpr size_t
    _S_dynamic_index_inv(size_t __k) noexcept
    {
      return __k;
    }

    // Always dynamic_extent, regardless of the position asked for.
    static constexpr size_t
    _S_static_extent(size_t) noexcept
    {
      return dynamic_extent;
    }
  };
template<typename _IndexType, array _Extents>
class _ExtentsStorage : public _StaticExtents<_Extents>
{
private:
using _Base = _StaticExtents<_Extents>;
public:
using _Base::_S_rank;
using _Base::_S_rank_dynamic;
using _Base::_S_dynamic_index;
using _Base::_S_dynamic_index_inv;
using _Base::_S_static_extent;
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
static constexpr bool
_S_is_dynamic(size_t __r) noexcept
{
if constexpr (__all_static(_Extents))
return false;
else if constexpr (__all_dynamic(_Extents))
return true;
else
return _Extents[__r] == dynamic_extent;
}
template<typename _OIndexType>
static constexpr _IndexType
_S_int_cast(const _OIndexType& __other) noexcept
{ return _IndexType(__other); }
constexpr _IndexType
_M_extent(size_t __r) const noexcept
{
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
if (_S_is_dynamic(__r))
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
return _M_dyn_exts[_S_dynamic_index(__r)];
else
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
return _S_static_extent(__r);
}
template<size_t _OtherRank, typename _GetOtherExtent>
static constexpr bool
_S_is_compatible_extents(_GetOtherExtent __get_extent) noexcept
{
if constexpr (_OtherRank == _S_rank)
for (size_t __i = 0; __i < _S_rank; ++__i)
libstdc++: Reduce indirection in extents::extent. In both fully static and dynamic extents the comparison static_extent(i) == dynamic_extent is known at compile time. As a result, extents::extent doesn't need to perform the check at runtime. An illustrative example is: using E = std::extents<int, 3, 5, 7, 11, 13, 17>; int required_span_size(const typename Layout::mapping<E>& m) { return m.required_span_size(); } Prior to this commit the generated code (on -O2) is: 2a0: b9 01 00 00 00 mov ecx,0x1 2a5: 31 d2 xor edx,edx 2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2ae: 00 00 00 00 2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 2b9: 00 00 00 00 2bd: 0f 1f 00 nop DWORD PTR [rax] 2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0] 2c7: 00 2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff 2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32> 2d2: 83 e8 01 sub eax,0x1 2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4] 2d9: 48 83 c2 01 add rdx,0x1 2dd: 01 c1 add ecx,eax 2df: 48 83 fa 06 cmp rdx,0x6 2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20> 2e5: 89 c8 mov eax,ecx 2e7: c3 ret which is a scalar loop, and notably includes the check 308: 48 83 f8 ff cmp rax,0xffffffffffffffff to assert that the static extent is indeed not -1. Note, that on -O3 the optimizer eliminates the comparison; and generates a sequence of scalar operations: lea, shl, add and mov. The aim of this commit is to eliminate this comparison also for -O2. 
With the optimization applied we get: 2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi] 2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] 2eb: 00 2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10] 2ef: 66 0f 6f c1 movdqa xmm0,xmm1 2f3: 66 0f 73 d1 20 psrlq xmm1,0x20 2f8: 66 0f f4 c2 pmuludq xmm0,xmm2 2fc: 66 0f 73 d2 20 psrlq xmm2,0x20 301: 8d 14 52 lea edx,[rdx+rdx*2] 304: 66 0f f4 ca pmuludq xmm1,xmm2 308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8 30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8 312: 66 0f 62 c1 punpckldq xmm0,xmm1 316: 66 0f 6f c8 movdqa xmm1,xmm0 31a: 66 0f 73 d9 08 psrldq xmm1,0x8 31f: 66 0f fe c1 paddd xmm0,xmm1 323: 66 0f 6f c8 movdqa xmm1,xmm0 327: 66 0f 73 d9 04 psrldq xmm1,0x4 32c: 66 0f fe c1 paddd xmm0,xmm1 330: 66 0f 7e c0 movd eax,xmm0 334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1] 338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14] 33b: c1 e0 04 shl eax,0x4 33e: 01 d0 add eax,edx 340: c3 ret Which shows eliminating the trivial comparison, unlocks a new set of optimizations, i.e. SIMD-vectorization. In particular, the loop has been vectorized by loading the first four constants from aligned memory; the first four strides from non-aligned memory, then computes the product and reduction. It interleaves the above with computing 1 + 12*S[4] + 16*S[5] (as scalar operations) and then finishes the reduction. A similar effect can be observed for fully dynamic extents. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_static): New function. (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate. (__mdspan::_ExtentsStorage::_S_is_dynamic): New method. (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
if (!_S_is_dynamic(__i)
&& !cmp_equal(_Extents[__i], _S_int_cast(__get_extent(__i))))
return false;
return true;
}
template<size_t _OtherRank, typename _GetOtherExtent>
constexpr void
_M_init_dynamic_extents(_GetOtherExtent __get_extent) noexcept
{
__glibcxx_assert(_S_is_compatible_extents<_OtherRank>(__get_extent));
for (size_t __i = 0; __i < _S_rank_dynamic; ++__i)
{
size_t __di = __i;
if constexpr (_OtherRank != _S_rank_dynamic)
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
__di = _S_dynamic_index_inv(__i);
_M_dyn_exts[__i] = _S_int_cast(__get_extent(__di));
}
}
constexpr
_ExtentsStorage() noexcept = default;
template<typename _OIndexType, array _OExtents>
constexpr
_ExtentsStorage(const _ExtentsStorage<_OIndexType, _OExtents>&
__other) noexcept
{
_M_init_dynamic_extents<_S_rank>([&__other](size_t __i)
{ return __other._M_extent(__i); });
}
template<typename _OIndexType, size_t _Nm>
constexpr
_ExtentsStorage(span<const _OIndexType, _Nm> __exts) noexcept
{
_M_init_dynamic_extents<_Nm>(
[&__exts](size_t __i) -> const _OIndexType&
{ return __exts[__i]; });
}
static constexpr const array<size_t, _S_rank>&
_S_static_extents() noexcept
{ return _Extents; }
constexpr span<const _IndexType>
_M_dynamic_extents(size_t __begin, size_t __end) const noexcept
requires (_Extents.size() > 0)
{
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
return {_M_dyn_exts + _S_dynamic_index(__begin),
_M_dyn_exts + _S_dynamic_index(__end)};
}
private:
using _Storage = __array_traits<_IndexType, _S_rank_dynamic>::_Type;
[[no_unique_address]] _Storage _M_dyn_exts{};
};
// Satisfied when _OIndexType is implicitly convertible to _SIndexType and
// _SIndexType can be constructed from an _OIndexType without throwing.
template<typename _OIndexType, typename _SIndexType>
concept __valid_index_type =
is_convertible_v<_OIndexType, _SIndexType> &&
is_nothrow_constructible_v<_SIndexType, _OIndexType>;
// Satisfied when _Extent is either dynamic_extent or a value no larger
// than the maximum of _IndexType (as given by __int_traits).
template<size_t _Extent, typename _IndexType>
concept
__valid_static_extent = _Extent == dynamic_extent
|| _Extent <= __gnu_cxx::__int_traits<_IndexType>::__max;
// The array of static extents exposed by _Extents' storage type.
template<typename _Extents>
  constexpr const array<size_t, _Extents::rank()>&
  __static_extents() noexcept
  {
    using _Storage = typename _Extents::_Storage;
    return _Storage::_S_static_extents();
  }
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
// View of the static extents of _Extents at positions [__begin, __end).
template<typename _Extents>
  constexpr span<const size_t>
  __static_extents(size_t __begin, size_t __end) noexcept
  {
    const size_t* __first = __static_extents<_Extents>().data() + __begin;
    return span<const size_t>(__first, __end - __begin);
  }
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__rev_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static extents and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
// Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n (exclusive)
template<array _Extents>
constexpr auto __fwd_partial_prods = [] consteval
{
constexpr size_t __rank = _Extents.size();
std::array<size_t, __rank> __ret;
size_t __prod = 1;
for (size_t __r = 0; __r < __rank; ++__r)
{
__ret[__r] = __prod;
if (size_t __ext = _Extents[__r]; __ext != dynamic_extent)
__prod *= __ext;
}
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
return __ret;
}();
// Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i]
template<array _Extents>
constexpr auto __rev_partial_prods = [] consteval
{
constexpr size_t __rank = _Extents.size();
std::array<size_t, __rank> __ret;
size_t __prod = 1;
for (size_t __r = __rank; __r > 0; --__r)
{
__ret[__r - 1] = __prod;
if (size_t __ext = _Extents[__r - 1]; __ext != dynamic_extent)
__prod *= __ext;
}
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
return __ret;
}();
// Returns a view of the dynamic extents stored in __exts for the index
// range [__begin, __end); by default the range covers every dimension.
template<typename _Extents>
  constexpr span<const typename _Extents::index_type>
  __dynamic_extents(const _Extents& __exts, size_t __begin = 0,
                    size_t __end = _Extents::rank()) noexcept
  {
    // Forward to the storage object that holds the extents.
    const auto& __storage = __exts._M_exts;
    return __storage._M_dynamic_extents(__begin, __end);
  }
}
#if __glibcxx_submdspan
// Empty tag type. The default constructor is explicit, so an object cannot
// be created implicitly from an empty braced-init-list.
// NOTE(review): presumably the submdspan slice specifier that selects a
// whole dimension (it is guarded by __glibcxx_submdspan) -- confirm.
struct full_extent_t
{
explicit full_extent_t() = default;
};
// The constant of type full_extent_t intended to be passed around by value.
inline constexpr full_extent_t full_extent{};
// Aggregate bundling an offset, an extent and a stride. Each of the three
// types must be either a standard integer type or an
// integral-constant-like type; this is enforced by the static_asserts.
template<typename _OffsetType, typename _ExtentType, typename _StrideType>
struct strided_slice {
// Reject any member type that is neither a standard integer nor
// integral-constant-like.
static_assert(__is_standard_integer<_OffsetType>::value
|| __detail::__integral_constant_like<_OffsetType>);
static_assert(__is_standard_integer<_ExtentType>::value
|| __detail::__integral_constant_like<_ExtentType>);
static_assert(__is_standard_integer<_StrideType>::value
|| __detail::__integral_constant_like<_StrideType>);
// Public aliases for the three template arguments.
using offset_type = _OffsetType;
using extent_type = _ExtentType;
using stride_type = _StrideType;
// All members are value-initialized by default. [[no_unique_address]]
// allows a member of empty class type to occupy no storage.
[[no_unique_address]] offset_type offset{};
[[no_unique_address]] extent_type extent{};
[[no_unique_address]] stride_type stride{};
};
// Aggregate pairing a layout mapping with an offset.
// NOTE(review): presumably the result type of submdspan_mapping, where
// offset is the position of the sub-view's first element -- confirm.
template<typename _Mapping>
struct submdspan_mapping_result
{
// Defaults to a value-initialized mapping; [[no_unique_address]] allows an
// empty mapping type to occupy no storage.
[[no_unique_address]] _Mapping mapping = _Mapping();
// Defaults to zero.
size_t offset{};
};
#endif
template<typename _IndexType, size_t... _Extents>
class extents
{
static_assert(__is_standard_integer<_IndexType>::value,
"IndexType must be a signed or unsigned integer type");
static_assert(
(__mdspan::__valid_static_extent<_Extents, _IndexType> && ...),
"Extents must either be dynamic or representable as IndexType");
public:
using index_type = _IndexType;
using size_type = make_unsigned_t<index_type>;
using rank_type = size_t;
static constexpr rank_type
rank() noexcept { return _Storage::_S_rank; }
static constexpr rank_type
rank_dynamic() noexcept { return _Storage::_S_rank_dynamic; }
static constexpr size_t
static_extent(rank_type __r) noexcept
{
__glibcxx_assert(__r < rank());
if constexpr (rank() == 0)
__builtin_trap();
else
return _Storage::_S_static_extent(__r);
}
constexpr index_type
extent(rank_type __r) const noexcept
{
__glibcxx_assert(__r < rank());
if constexpr (rank() == 0)
__builtin_trap();
else
return _M_exts._M_extent(__r);
}
constexpr
extents() noexcept = default;
private:
static consteval bool
_S_is_less_dynamic(size_t __ext, size_t __oext)
{ return (__ext != dynamic_extent) && (__oext == dynamic_extent); }
template<typename _OIndexType, size_t... _OExtents>
static consteval bool
_S_ctor_explicit()
{
return (_S_is_less_dynamic(_Extents, _OExtents) || ...)
|| (__gnu_cxx::__int_traits<index_type>::__max
< __gnu_cxx::__int_traits<_OIndexType>::__max);
}
template<size_t... _OExtents>
static consteval bool
_S_is_compatible_extents()
{
if constexpr (sizeof...(_OExtents) != rank())
return false;
else
return ((_OExtents == dynamic_extent || _Extents == dynamic_extent
|| _OExtents == _Extents) && ...);
}
public:
template<typename _OIndexType, size_t... _OExtents>
requires (_S_is_compatible_extents<_OExtents...>())
constexpr explicit(_S_ctor_explicit<_OIndexType, _OExtents...>())
extents(const extents<_OIndexType, _OExtents...>& __other) noexcept
: _M_exts(__other._M_exts)
{ }
template<__mdspan::__valid_index_type<index_type>... _OIndexTypes>
requires (sizeof...(_OIndexTypes) == rank()
|| sizeof...(_OIndexTypes) == rank_dynamic())
constexpr explicit extents(_OIndexTypes... __exts) noexcept
: _M_exts(span<const _IndexType, sizeof...(_OIndexTypes)>(
initializer_list{static_cast<_IndexType>(std::move(__exts))...}))
{ }
template<typename _OIndexType, size_t _Nm>
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
&& (_Nm == rank() || _Nm == rank_dynamic())
constexpr explicit(_Nm != rank_dynamic())
extents(span<_OIndexType, _Nm> __exts) noexcept
: _M_exts(span<const _OIndexType, _Nm>(__exts))
{ }
template<typename _OIndexType, size_t _Nm>
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
&& (_Nm == rank() || _Nm == rank_dynamic())
constexpr explicit(_Nm != rank_dynamic())
extents(const array<_OIndexType, _Nm>& __exts) noexcept
: _M_exts(span<const _OIndexType, _Nm>(__exts))
{ }
template<typename _OIndexType, size_t... _OExtents>
friend constexpr bool
operator==(const extents& __self,
const extents<_OIndexType, _OExtents...>& __other) noexcept
{
if constexpr (!_S_is_compatible_extents<_OExtents...>())
return false;
else
{
libstdc++: Improve extents::operator==. An interesting case to consider is: bool same11(const std::extents<int, dyn, 2, 3>& e1, const std::extents<int, dyn, dyn, 3>& e2) { return e1 == e2; } Which has the following properties: - There's no mismatching static extents, preventing any short-circuiting. - There's a comparison between dynamic and static extents. - There's one trivial comparison: ... && 3 == 3. Let E[i] denote the array of static extents, D[k] denote the array of dynamic extents and k[i] be the index of the i-th extent in D. (Naturally, k[i] is only meaningful if i is a dynamic extent). The previous implementation results in assembly that's more or less a literal translation of: for (i = 0; i < 3; ++i) e1 = E1[i] == -1 ? D1[k1[i]] : E1[i]; e2 = E2[i] == -1 ? D2[k2[i]] : E2[i]; if e1 != e2: return false return true; While the proposed method results in assembly for if(D1[0] == D2[0]) return false; return 2 == D2[1]; i.e. 110: 8b 17 mov edx,DWORD PTR [rdi] 112: 31 c0 xor eax,eax 114: 39 16 cmp DWORD PTR [rsi],edx 116: 74 08 je 120 <same11+0x10> 118: c3 ret 119: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 120: 83 7e 04 02 cmp DWORD PTR [rsi+0x4],0x2 124: 0f 94 c0 sete al 127: c3 ret It has the following nice properties: - It eliminated the indirection D[k[i]], because k[i] is known at compile time. Saving us a comparison E[i] == -1 and conditionally loading k[i]. - It eliminated the trivial condition 3 == 3. The result is code that only loads the required values and performs exactly the number of comparisons needed by the algorithm. It also results in smaller object files. Therefore, this seems like a sensible change. We've check several other examples, including fully statically determined cases and high-rank examples. The example given above illustrates the other cases well. The constexpr condition: if constexpr (!_S_is_compatible_extents<...>) return false; is no longer needed, because the optimizer correctly handles this case. 
However, it's retained for clarity/certainty. libstdc++-v3/ChangeLog: * include/std/mdspan (extents::operator==): Replace loop with pack expansion. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:29 +02:00
auto __impl = [&__self, &__other]<size_t... _Counts>(
index_sequence<_Counts...>)
{ return (cmp_equal(__self.extent(_Counts),
__other.extent(_Counts)) && ...); };
return __impl(make_index_sequence<__self.rank()>());
}
}
private:
friend const array<size_t, rank()>&
__mdspan::__static_extents<extents>();
friend span<const index_type>
__mdspan::__dynamic_extents<extents>(const extents&, size_t, size_t);
using _Storage = __mdspan::_ExtentsStorage<
_IndexType, array<size_t, sizeof...(_Extents)>{_Extents...}>;
[[no_unique_address]] _Storage _M_exts;
template<typename _OIndexType, size_t... _OExtents>
friend class extents;
};
namespace __mdspan
{
// Returns true if any element of __exts compares equal to zero.
template<typename _Tp, size_t _Nm>
  constexpr bool
  __contains_zero(span<_Tp, _Nm> __exts) noexcept
  {
    for (const auto& __ext : __exts)
      {
        if (__ext == 0)
          return true;
      }
    return false;
  }
// Compile-time-only overload for arrays: views the array as a span and
// defers to the span overload.
template<typename _Tp, size_t _Nm>
  consteval bool
  __contains_zero(const array<_Tp, _Nm>& __exts) noexcept
  {
    span<const _Tp> __view(__exts);
    return __contains_zero(__view);
  }
template<typename _Extents>
constexpr bool
__empty(const _Extents& __exts) noexcept
{
if constexpr (__contains_zero(__static_extents<_Extents>()))
return true;
else if constexpr (_Extents::rank_dynamic() > 0)
return __contains_zero(__dynamic_extents(__exts));
else
return false;
}
template<typename _Extents>
constexpr typename _Extents::index_type
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
__extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin,
size_t __end) noexcept
{
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
if (__sta_prod == 0)
return 0;
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
size_t __ret = __sta_prod;
if constexpr (_Extents::rank_dynamic() > 0)
for (auto __factor : __dynamic_extents(__exts, __begin, __end))
__ret *= size_t(__factor);
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
return static_cast<typename _Extents::index_type>(__ret);
}
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to not use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
// Preconditions: _r < _Extents::rank()
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
template<typename _Extents>
constexpr typename _Extents::index_type
__fwd_prod(const _Extents& __exts, size_t __begin, size_t __end) noexcept
{
size_t __sta_prod = [__begin, __end] {
span<const size_t> __sta_exts = __static_extents<_Extents>(__begin, __end);
size_t __ret = 1;
for(auto __ext : __sta_exts)
if (__ext != dynamic_extent)
__ret *= __ext;
return __ret;
}();
return __extents_prod(__exts, __sta_prod, __begin, __end);
}
template<typename _Extents>
constexpr typename _Extents::index_type
__fwd_prod(const _Extents& __exts, size_t __r) noexcept
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__rev_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of the pre-computed static product and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
{
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
constexpr size_t __rank = _Extents::rank();
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
constexpr auto& __sta_exts = __static_extents<_Extents>();
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
if constexpr (__rank == 1)
return 1;
else if constexpr (__rank == 2)
return __r == 0 ? 1 : __exts.extent(0);
else if constexpr (__all_dynamic(std::span(__sta_exts).first(__rank-1)))
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
return __extents_prod(__exts, 1, 0, __r);
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
else
{
size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
return __extents_prod(__exts, __sta_prod, 0, __r);
}
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__rev_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. * include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of the pre-computed static product and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
}
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to not use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
// Preconditions: _r < _Extents::rank()
template<typename _Extents>
constexpr typename _Extents::index_type
__rev_prod(const _Extents& __exts, size_t __r) noexcept
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
{
constexpr size_t __rank = _Extents::rank();
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
constexpr auto& __sta_exts = __static_extents<_Extents>();
if constexpr (__rank == 1)
return 1;
else if constexpr (__rank == 2)
return __r == 0 ? __exts.extent(1) : 1;
else if constexpr (__all_dynamic(std::span(__sta_exts).last(__rank-1)))
libstdc++: Improve fully dynamic extents in mdspan. In mdspan related code, for extents with no static extents, i.e. only dynamic extents, the following simplifications can be made: - The array of dynamic extents has size rank. - The two arrays dynamic-index and dynamic-index-inv become trivial, e.g. k[i] == i. - All elements of the arrays __{fwd,rev}_partial_prods are 1. This commits eliminates the arrays for dynamic-index, dynamic-index-inv and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i from the source code, which isn't as relevant because the optimizer is (often) capable of eliminating the indirection. To check if it's working we look at: using E2 = std::extents<int, dyn, dyn, dyn, dyn>; int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r) { return m.stride(r); } which generates the following 0000000000000190 <stride_left_E2>: 190: 48 c1 e6 02 shl rsi,0x2 194: 74 22 je 1b8 <stride_left_E2+0x28> 196: 48 01 fe add rsi,rdi 199: b8 01 00 00 00 mov eax,0x1 19e: 66 90 xchg ax,ax 1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 1a3: 48 83 c7 04 add rdi,0x4 1a7: 48 0f af c2 imul rax,rdx 1ab: 48 39 fe cmp rsi,rdi 1ae: 75 f0 jne 1a0 <stride_left_E2+0x10> 1b0: c3 ret 1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 1b8: b8 01 00 00 00 mov eax,0x1 1bd: c3 ret We see that: - There's no code to load the partial product of static extents. - There's no indirection D[k[i]], it's just D[i] (as before). On a test file which computes both mapping::stride(r) and mapping::required_span_size, we check for static storage with objdump -h we don't see the NTTP _Extents, anything (anymore) related to _StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also check that the size of the reference object file (described three commits prior) reduced by a few percent from 41.9kB to 39.4kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__all_dynamic): New function. (__mdspan::_StaticExtents::_S_dynamic_index): Convert to method. 
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto. (__mdspan::_StaticExtents): New specialization for fully dynamic extents. (__mdspan::__fwd_prod): New constexpr if branch to avoid instantiating __fwd_partial_prods. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
return __extents_prod(__exts, 1, __r + 1, __rank);
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
else
{
size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
}
libstdc++: Precompute products of static extents. Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents. The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 
<stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. 
It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. 
The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
}
template<typename _Extents>
constexpr typename _Extents::index_type
__size(const _Extents& __exts) noexcept
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
{
constexpr size_t __sta_prod = [] {
span<const size_t> __sta_exts = __static_extents<_Extents>();
size_t __ret = 1;
for(auto __ext : __sta_exts)
if (__ext != dynamic_extent)
__ret *= __ext;
return __ret;
}();
return __extents_prod(__exts, __sta_prod, 0, _Extents::rank());
libstdc++: Improve low-rank layout_{left,right}::stride. The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. 
Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. 
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
}
// Declaration-only helper, meant to be used inside decltype.
// For every element of the pack _Counts the return type lists one
// dynamic_extent, i.e. it names
// extents<_IndexType, dynamic_extent, ..., dynamic_extent>.
template<typename _IndexType, size_t... _Counts>
auto __build_dextents_type(integer_sequence<size_t, _Counts...>)
-> extents<_IndexType, ((void) _Counts, dynamic_extent)...>;
}
// Alias for the extents type with index type _IndexType whose _Rank
// extents are all dynamic_extent; obtained from the return type of
// __mdspan::__build_dextents_type applied to an index sequence of
// length _Rank.
template<typename _IndexType, size_t _Rank>
using dextents = decltype(__mdspan::__build_dextents_type<_IndexType>(
make_index_sequence<_Rank>()));
#if __glibcxx_mdspan >= 202406L
// Same type as dextents, but with the rank as first template argument
// and the index type defaulting to size_t. Only available when
// __glibcxx_mdspan is at least 202406L.
template<size_t _Rank, typename _IndexType = size_t>
using dims = dextents<_IndexType, _Rank>;
#endif
// Deduction guide: constructing extents from arguments that are all
// convertible to size_t deduces size_t as the index type and one
// extent per argument, each computed by __detail::__maybe_static_ext.
template<typename... _Integrals>
requires (is_convertible_v<_Integrals, size_t> && ...)
explicit extents(_Integrals...) ->
extents<size_t, __detail::__maybe_static_ext<_Integrals>...>;
// Layout policy tag. The nested class template mapping is only declared
// here; it is defined out of class as layout_left::mapping.
struct layout_left
{
template<typename _Extents>
class mapping;
};
// Layout policy tag. The nested class template mapping is only declared
// here; it is defined out of class as layout_right::mapping.
struct layout_right
{
template<typename _Extents>
class mapping;
};
// Layout policy tag. The nested class template mapping is only declared
// here; it is defined out of class as layout_stride::mapping.
struct layout_stride
{
template<typename _Extents>
class mapping;
};
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
#ifdef __glibcxx_padded_layouts
// Layout policy tag parametrized by a padding value. The nested class
// template mapping is only declared here.
template<size_t _PaddingValue>
struct layout_left_padded
{
template<typename _Extents>
class mapping;
};
// Layout policy tag parametrized by a padding value. The nested class
// template mapping is only declared here.
template<size_t _PaddingValue>
struct layout_right_padded
{
template<typename _Extents>
class mapping;
};
#endif
namespace __mdspan
{
// Trait that is true exactly for specializations of extents: the
// primary template is false, the partial specialization below matches
// extents<_IndexType, _Extents...> and is true.
template<typename _Tp>
constexpr bool __is_extents = false;
template<typename _IndexType, size_t... _Extents>
constexpr bool __is_extents<extents<_IndexType, _Extents...>> = true;
// Linear index of a multi-dimensional index where the first index has
// stride one: the k-th index is weighted by the product of the extents
// 0, ..., k-1. Returns 0 when no indices are given.
template<typename _Extents, typename... _Indices>
  constexpr typename _Extents::index_type
  __linear_index_left(const _Extents& __exts, _Indices... __indices)
  noexcept
  {
    using _Idx = typename _Extents::index_type;
    _Idx __lin = 0;
    if constexpr (sizeof...(__indices) != 0)
      {
        _Idx __stride = 1;  // product of the extents visited so far
        unsigned __dim = 0; // dimension the next index belongs to
        auto __accumulate = [&](_Idx __i)
          {
            _GLIBCXX_DEBUG_ASSERT(cmp_less(__i, __exts.extent(__dim)));
            __lin += __i * __stride;
            __stride *= __exts.extent(__dim);
            ++__dim;
          };
        // The comma fold visits the indices from left to right.
        (__accumulate(__indices), ...);
      }
    return __lin;
  }
// Divides __nom successively by every entry of __sta_exts that is not
// dynamic_extent and returns the resulting quotient; the scan stops as
// soon as the quotient is zero. By default __nom is the largest value
// of _IndexType, in which case callers treat a zero result as "the
// static extents alone are not representable".
template<typename _IndexType>
  consteval _IndexType
  __static_quotient(std::span<const size_t> __sta_exts,
                    _IndexType __nom
                      = __gnu_cxx::__int_traits<_IndexType>::__max)
  {
    auto __it = __sta_exts.begin();
    const auto __last = __sta_exts.end();
    while (__it != __last)
      {
        const size_t __ext = *__it;
        ++__it;
        if (__ext != dynamic_extent)
          __nom /= static_cast<_IndexType>(__ext);
        // Checked after every entry, also after a dynamic one.
        if (__nom == 0)
          return __nom;
      }
    return __nom;
  }
// Convenience overload: applies the span-based __static_quotient to
// the static extents of _Extents. _IndexType defaults to the index
// type of _Extents and __nom to the largest _IndexType value.
template<typename _Extents,
         typename _IndexType = typename _Extents::index_type>
  requires __is_extents<_Extents>
  consteval _IndexType
  __static_quotient(_IndexType __nom
                      = __gnu_cxx::__int_traits<_IndexType>::__max)
  {
    return __static_quotient<_IndexType>(
      std::span<const size_t>(__static_extents<_Extents>()), __nom);
  }
// Checks whether the size of __exts is representable as index_type.
// Returns true if any static or dynamic extent is zero. Otherwise the
// compile-time quotient left by the static extents is divided by each
// dynamic extent in turn, and the check fails once it reaches zero.
//
// The nesting of `if constexpr` is deliberate: __static_quotient is
// only instantiated when no static extent is zero.
template<typename _Extents>
  constexpr bool
  __is_representable_extents(const _Extents& __exts) noexcept
  {
    using _IndexType = typename _Extents::index_type;

    if constexpr (__contains_zero(__static_extents<_Extents>()))
      return true;
    else
      {
        constexpr auto __sta_quo = __static_quotient<_Extents>();
        if constexpr (_Extents::rank_dynamic() == 0)
          // Fully static: the compile-time quotient decides.
          return __sta_quo != 0;
        else
          {
            auto __dyn_exts = __dynamic_extents(__exts);
            if (__contains_zero(__dyn_exts))
              return true;

            if constexpr (__sta_quo == 0)
              return false;
            else
              {
                _IndexType __room = _IndexType(__sta_quo);
                for (auto __ext : __dyn_exts)
                  if ((__room /= __ext) == 0)
                    return false;
                return true;
              }
          }
      }
  }
// Compile-time part of the "size is representable" check. Satisfied if
// _Extents has at least one dynamic extent (nothing can be decided
// statically), or __contains_zero holds for its static extents, or
// dividing the largest _IndexType value by all static extents (see
// __static_quotient) leaves a non-zero quotient.
// The order matters: __static_quotient is only evaluated when the two
// preceding alternatives are not satisfied.
template<typename _Extents, typename _IndexType>
concept __representable_size = _Extents::rank_dynamic() != 0
|| __contains_zero(__static_extents<_Extents>())
|| (__static_quotient<_Extents, _IndexType>() != 0);
// Satisfied when _Mapping is exactly the type
// _Layout::mapping<_Mapping::extents_type>.
template<typename _Layout, typename _Mapping>
concept __mapping_of =
is_same_v<typename _Layout::template mapping<typename _Mapping::extents_type>,
_Mapping>;
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
// Variant of __mapping_of for layouts that are templates over a
// size_t padding value: _Layout is instantiated with
// _Mapping::padding_value before the comparison.
template<template<size_t> typename _Layout, typename _Mapping>
concept __padded_mapping_of = __mapping_of<
_Layout<_Mapping::padding_value>, _Mapping>;
#ifdef __glibcxx_padded_layouts
// True if _Mapping is a mapping of layout_left_padded, instantiated
// with _Mapping's own padding_value (see __padded_mapping_of).
template<typename _Mapping>
constexpr bool __is_left_padded_mapping = __padded_mapping_of<
layout_left_padded, _Mapping>;
// True if _Mapping is a mapping of layout_right_padded, instantiated
// with _Mapping's own padding_value (see __padded_mapping_of).
template<typename _Mapping>
constexpr bool __is_right_padded_mapping = __padded_mapping_of<
layout_right_padded, _Mapping>;
#endif
// Reads the compile-time stride constant _S_static_stride from the
// _PaddedStorage member type of a padded mapping.
template<typename _PaddedMapping>
  consteval size_t
  __get_static_stride()
  {
    using _Storage = typename _PaddedMapping::_PaddedStorage;
    return _Storage::_S_static_stride;
  }
// Satisfied when _Mapping is a mapping of one of the layouts provided
// by this header: layout_left, layout_right, layout_stride and, when
// __glibcxx_padded_layouts is defined, the left/right padded layouts.
template<typename _Mapping>
  concept __standardized_mapping
    = __mapping_of<layout_left, _Mapping>
      || __mapping_of<layout_right, _Mapping>
      || __mapping_of<layout_stride, _Mapping>
#ifdef __glibcxx_padded_layouts
      || __is_left_padded_mapping<_Mapping>
      || __is_right_padded_mapping<_Mapping>
#endif
      ;
// A tag type to create internal ctors.
// The layout mappings take it as an extra argument of their private
// converting constructor, which the public constructors delegate to.
class __internal_ctor
{ };
}
// Mapping of layout_left. The linear index is computed by
// __mdspan::__linear_index_left, i.e. the first index has stride one,
// and stride(i) is obtained from __mdspan::__fwd_prod.
template<typename _Extents>
  class layout_left::mapping
  {
  public:
    using extents_type = _Extents;
    using index_type = typename extents_type::index_type;
    using size_type = typename extents_type::size_type;
    using rank_type = typename extents_type::rank_type;
    using layout_type = layout_left;

    static_assert(__mdspan::__representable_size<extents_type, index_type>,
      "The size of extents_type must be representable as index_type");

    constexpr
    mapping() noexcept = default;

    constexpr
    mapping(const mapping&) noexcept = default;

    constexpr mapping&
    operator=(const mapping&) noexcept = default;

    // Adopts the given extents; asserts that their size is representable.
    constexpr
    mapping(const extents_type& __extents) noexcept
    : _M_extents(__extents)
    {
      __glibcxx_assert(__mdspan::__is_representable_extents(_M_extents));
    }

    // Conversion from a layout_left mapping over other extents.
    template<typename _OExtents>
      requires is_constructible_v<extents_type, _OExtents>
      constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
      mapping(const mapping<_OExtents>& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      { }

    // Conversion from layout_right, only offered for rank <= 1.
    template<typename _OExtents>
      requires (extents_type::rank() <= 1)
        && is_constructible_v<extents_type, _OExtents>
      constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
      mapping(const layout_right::mapping<_OExtents>& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      { }

    // Conversion from layout_stride; asserts that the resulting mapping
    // compares equal to the source.
    // noexcept for consistency with other layouts.
    template<typename _OExtents>
      requires is_constructible_v<extents_type, _OExtents>
      constexpr explicit(extents_type::rank() > 0)
      mapping(const layout_stride::mapping<_OExtents>& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      {
        __glibcxx_assert(*this == __other);
      }

#if __glibcxx_padded_layouts
    // Conversion from a layout_left_padded mapping. For rank > 1: if
    // both this type's static extent 0 and the source's static stride
    // are known, they must be equal (static_assert); otherwise it is
    // asserted that __other.stride(1) equals __other's extent 0.
    template<typename _LeftpadMapping>
      requires __mdspan::__is_left_padded_mapping<_LeftpadMapping>
        && is_constructible_v<extents_type,
                              typename _LeftpadMapping::extents_type>
      constexpr
      explicit(!is_convertible_v<typename _LeftpadMapping::extents_type,
                                 extents_type>)
      mapping(const _LeftpadMapping& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      {
        constexpr size_t __ostride_sta
          = __mdspan::__get_static_stride<_LeftpadMapping>();

        if constexpr (extents_type::rank() > 1)
          {
            constexpr size_t __ext0_sta = extents_type::static_extent(0);
            constexpr bool __both_static
              = __ext0_sta != dynamic_extent
                && __ostride_sta != dynamic_extent;
            if constexpr (__both_static)
              static_assert(__ext0_sta == __ostride_sta);
            else
              __glibcxx_assert(__other.stride(1)
                               == __other.extents().extent(0));
          }
      }
#endif // __glibcxx_padded_layouts

    constexpr const extents_type&
    extents() const noexcept
    { return _M_extents; }

    constexpr index_type
    required_span_size() const noexcept
    { return __mdspan::__size(_M_extents); }

    // _GLIBCXX_RESOLVE_LIB_DEFECTS
    // 4314. Missing move in mdspan layout mapping::operator()
    template<__mdspan::__valid_index_type<index_type>... _Indices>
      requires (sizeof...(_Indices) == extents_type::rank())
      constexpr index_type
      operator()(_Indices... __indices) const noexcept
      {
        return __mdspan::__linear_index_left(
          _M_extents, static_cast<index_type>(std::move(__indices))...);
      }

    // A layout_left mapping is always unique, exhaustive and strided.
    static constexpr bool
    is_always_unique() noexcept
    { return true; }

    static constexpr bool
    is_always_exhaustive() noexcept
    { return true; }

    static constexpr bool
    is_always_strided() noexcept
    { return true; }

    static constexpr bool
    is_unique() noexcept
    { return true; }

    static constexpr bool
    is_exhaustive() noexcept
    { return true; }

    static constexpr bool
    is_strided() noexcept
    { return true; }

    // Stride of dimension __i; requires rank > 0 and __i < rank.
    constexpr index_type
    stride(rank_type __i) const noexcept
    requires (extents_type::rank() > 0)
    {
      __glibcxx_assert(__i < extents_type::rank());
      return __mdspan::__fwd_prod(_M_extents, __i);
    }

    // Two layout_left mappings of equal rank are equal iff their
    // extents compare equal.
    template<typename _OExtents>
      requires (extents_type::rank() == _OExtents::rank())
      friend constexpr bool
      operator==(const mapping& __self, const mapping<_OExtents>& __other)
      noexcept
      { return __self.extents() == __other.extents(); }

  private:
    // Common target of the converting constructors: checks the source
    // extents statically and the converted extents at run time.
    template<typename _OExtents>
      constexpr explicit
      mapping(const _OExtents& __oexts, __mdspan::__internal_ctor) noexcept
      : _M_extents(__oexts)
      {
        static_assert(__mdspan::__representable_size<_OExtents, index_type>,
          "The size of OtherExtents must be representable as index_type");
        __glibcxx_assert(__mdspan::__is_representable_extents(_M_extents));
      }

    [[no_unique_address]] extents_type _M_extents{};
  };
namespace __mdspan
{
// Linear index of a multi-dimensional index where the last index has
// stride one: the indices are consumed from last to first, each
// weighted by the product of the extents to its right.
// Returns 0 when no indices are given.
template<typename _Extents, typename... _Indices>
  constexpr typename _Extents::index_type
  __linear_index_right(const _Extents& __exts, _Indices... __indices)
  noexcept
  {
    using _Idx = typename _Extents::index_type;
    // Stored in an array so that the indices can be read back to front.
    array<_Idx, sizeof...(__indices)> __idxs{__indices...};
    _Idx __lin = 0;
    if constexpr (sizeof...(__indices) != 0)
      {
        _Idx __stride = 1;          // product of the extents visited so far
        auto __dim = __exts.rank(); // one past the next dimension
        // The argument is ignored; the fold below only serves to run
        // this step once per index.
        auto __step = [&](_Idx)
          {
            --__dim;
            _GLIBCXX_DEBUG_ASSERT(cmp_less(__idxs[__dim],
                                           __exts.extent(__dim)));
            __lin += __idxs[__dim] * __stride;
            __stride *= __exts.extent(__dim);
          };
        (__step(__indices), ...);
      }
    return __lin;
  }
}
// Mapping of layout_right. The linear index is computed by
// __mdspan::__linear_index_right, i.e. the last index has stride one,
// and stride(i) is obtained from __mdspan::__rev_prod.
template<typename _Extents>
  class layout_right::mapping
  {
  public:
    using extents_type = _Extents;
    using index_type = typename extents_type::index_type;
    using size_type = typename extents_type::size_type;
    using rank_type = typename extents_type::rank_type;
    using layout_type = layout_right;

    static_assert(__mdspan::__representable_size<extents_type, index_type>,
      "The size of extents_type must be representable as index_type");

    constexpr
    mapping() noexcept = default;

    constexpr
    mapping(const mapping&) noexcept = default;

    constexpr mapping&
    operator=(const mapping&) noexcept = default;

    // Adopts the given extents; asserts that their size is representable.
    constexpr
    mapping(const extents_type& __extents) noexcept
    : _M_extents(__extents)
    {
      __glibcxx_assert(__mdspan::__is_representable_extents(_M_extents));
    }

    // Conversion from a layout_right mapping over other extents.
    template<typename _OExtents>
      requires is_constructible_v<extents_type, _OExtents>
      constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
      mapping(const mapping<_OExtents>& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      { }

    // Conversion from layout_left, only offered for rank <= 1.
    template<typename _OExtents>
      requires (extents_type::rank() <= 1)
        && is_constructible_v<extents_type, _OExtents>
      constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
      mapping(const layout_left::mapping<_OExtents>& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      { }

    // Conversion from layout_stride; asserts that the resulting mapping
    // compares equal to the source.
    template<typename _OExtents>
      requires is_constructible_v<extents_type, _OExtents>
      constexpr explicit(extents_type::rank() > 0)
      mapping(const layout_stride::mapping<_OExtents>& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      {
        __glibcxx_assert(*this == __other);
      }

#if __glibcxx_padded_layouts
    // Conversion from a layout_right_padded mapping. For rank > 1: if
    // both this type's last static extent and the source's static
    // stride are known, they must be equal (static_assert); otherwise
    // it is asserted that __other.stride(rank - 2) equals __other's
    // last extent.
    template<typename _RightPaddedMapping>
      requires __mdspan::__is_right_padded_mapping<_RightPaddedMapping>
        && is_constructible_v<extents_type,
                              typename _RightPaddedMapping::extents_type>
      constexpr
      explicit(!is_convertible_v<typename _RightPaddedMapping::extents_type,
                                 extents_type>)
      mapping(const _RightPaddedMapping& __other) noexcept
      : mapping(__other.extents(), __mdspan::__internal_ctor{})
      {
        constexpr size_t __rank = extents_type::rank();
        constexpr size_t __ostride_sta
          = __mdspan::__get_static_stride<_RightPaddedMapping>();

        if constexpr (__rank > 1)
          {
            constexpr size_t __last_sta
              = extents_type::static_extent(__rank - 1);
            constexpr bool __both_static
              = __last_sta != dynamic_extent
                && __ostride_sta != dynamic_extent;
            if constexpr (__both_static)
              static_assert(__last_sta == __ostride_sta);
            else
              __glibcxx_assert(__other.stride(__rank - 2)
                               == __other.extents().extent(__rank - 1));
          }
      }
#endif

    constexpr const extents_type&
    extents() const noexcept
    { return _M_extents; }

    constexpr index_type
    required_span_size() const noexcept
    { return __mdspan::__size(_M_extents); }

    // _GLIBCXX_RESOLVE_LIB_DEFECTS
    // 4314. Missing move in mdspan layout mapping::operator()
    template<__mdspan::__valid_index_type<index_type>... _Indices>
      requires (sizeof...(_Indices) == extents_type::rank())
      constexpr index_type
      operator()(_Indices... __indices) const noexcept
      {
        return __mdspan::__linear_index_right(
          _M_extents, static_cast<index_type>(std::move(__indices))...);
      }

    // A layout_right mapping is always unique, exhaustive and strided.
    static constexpr bool
    is_always_unique() noexcept { return true; }

    static constexpr bool
    is_always_exhaustive() noexcept { return true; }

    static constexpr bool
    is_always_strided() noexcept { return true; }

    static constexpr bool
    is_unique() noexcept { return true; }

    static constexpr bool
    is_exhaustive() noexcept { return true; }

    static constexpr bool
    is_strided() noexcept { return true; }

    // Stride of dimension __i; requires rank > 0 and __i < rank.
    constexpr index_type
    stride(rank_type __i) const noexcept
    requires (extents_type::rank() > 0)
    {
      __glibcxx_assert(__i < extents_type::rank());
      return __mdspan::__rev_prod(_M_extents, __i);
    }

    // Two layout_right mappings of equal rank are equal iff their
    // extents compare equal.
    template<typename _OExtents>
      requires (extents_type::rank() == _OExtents::rank())
      friend constexpr bool
      operator==(const mapping& __self, const mapping<_OExtents>& __other)
      noexcept
      { return __self.extents() == __other.extents(); }

  private:
    // Common target of the converting constructors: checks the source
    // extents statically and the converted extents at run time.
    template<typename _OExtents>
      constexpr explicit
      mapping(const _OExtents& __oexts, __mdspan::__internal_ctor) noexcept
      : _M_extents(__oexts)
      {
        static_assert(__mdspan::__representable_size<_OExtents, index_type>,
          "The size of OtherExtents must be representable as index_type");
        __glibcxx_assert(__mdspan::__is_representable_extents(_M_extents));
      }

    [[no_unique_address]] extents_type _M_extents{};
  };
namespace __mdspan
{
// Satisfied by types that look like a layout mapping: extents_type is a
// specialization accepted by __is_extents, and the three is_always_*
// static queries return exactly bool and are usable in constant
// expressions (checked by naming bool_constant<...>::value).
template<typename _Mp>
concept __mapping_alike = requires
{
requires __is_extents<typename _Mp::extents_type>;
{ _Mp::is_always_strided() } -> same_as<bool>;
{ _Mp::is_always_exhaustive() } -> same_as<bool>;
{ _Mp::is_always_unique() } -> same_as<bool>;
bool_constant<_Mp::is_always_strided()>::value;
bool_constant<_Mp::is_always_exhaustive()>::value;
bool_constant<_Mp::is_always_unique()>::value;
};
// Returns the value the mapping assigns to the all-zero multi-index.
// Mappings satisfying __standardized_mapping, and mappings whose extents
// are empty, report 0 without the mapping being invoked.
template<typename _Mapping>
  constexpr typename _Mapping::index_type
  __offset(const _Mapping& __m) noexcept
  {
    if constexpr (__standardized_mapping<_Mapping>)
      return 0;
    else
      {
        // With an empty index space there is no multi-index to evaluate.
        if (__empty(__m.extents()))
          return 0;

        using _Idx = typename _Mapping::index_type;
        constexpr auto __n = _Mapping::extents_type::rank();

        // Expand to __m(0, 0, ..., 0) with one zero per dimension.
        return [&__m]<size_t... _Is>(index_sequence<_Is...>)
          { return __m(((void) _Is, _Idx(0))...); }
          (make_index_sequence<__n>());
      }
  }
// Computes the linear offset as the sum over all dimensions k of
// __indices[k] * __m.stride(k).  With no indices the result is 0.
template<typename _Mapping, typename... _Indices>
constexpr typename _Mapping::index_type
__linear_index_strides(const _Mapping& __m, _Indices... __indices)
noexcept
{
using _IndexType = typename _Mapping::index_type;
_IndexType __res = 0;
if constexpr (sizeof...(__indices) > 0)
{
// __pos counts the dimension being processed; the comma fold below
// invokes __update once per index, left to right.
auto __update = [&, __pos = 0u](_IndexType __idx) mutable
{
// Debug-mode check that the index is within the extent.
_GLIBCXX_DEBUG_ASSERT(cmp_less(__idx,
__m.extents().extent(__pos)));
__res += __idx * __m.stride(__pos++);
};
(__update(__indices), ...);
}
return __res;
}
}
// Layout mapping that stores one explicit stride per dimension next to
// the extents.
template<typename _Extents>
  class layout_stride::mapping
  {
  public:
    using extents_type = _Extents;
    using index_type = typename extents_type::index_type;
    using size_type = typename extents_type::size_type;
    using rank_type = typename extents_type::rank_type;
    using layout_type = layout_stride;

    static_assert(__mdspan::__representable_size<extents_type, index_type>,
      "The size of extents_type must be representable as index_type");

    // Default construction: walking from the last dimension to the
    // first, each stride is the product of the extents that follow it
    // (the last stride is 1).
    constexpr
    mapping() noexcept
    {
      // The precondition is either statically asserted, or automatically
      // satisfied because dynamic extents are zero-initialized.
      size_t __running = 1;
      size_t __r = extents_type::rank();
      while (__r > 0)
        {
          --__r;
          _M_strides[__r] = index_type(__running);
          __running *= size_t(_M_extents.extent(__r));
        }
    }

    constexpr
    mapping(const mapping&) noexcept = default;

    // Construct from extents and a span of strides; every stride is read
    // through a const reference and converted to index_type.
    template<typename _OIndexType>
      requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
      constexpr
      mapping(const extents_type& __exts,
              span<_OIndexType, extents_type::rank()> __strides) noexcept
      : _M_extents(__exts)
      {
        for (size_t __k = 0; __k != extents_type::rank(); ++__k)
          _M_strides[__k] = index_type(as_const(__strides[__k]));
      }

    // Same as above for an array of strides; delegates to the span
    // overload.
    template<typename _OIndexType>
      requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
      constexpr
      mapping(const extents_type& __exts,
              const array<_OIndexType, extents_type::rank()>& __strides)
      noexcept
      : mapping(__exts,
                span<const _OIndexType, extents_type::rank()>(__strides))
      { }

    // Converting constructor from any always-unique, always-strided
    // mapping whose extents can initialise extents_type.  Implicit only
    // when the extents convert implicitly and the source satisfies
    // __standardized_mapping.
    template<__mdspan::__mapping_alike _StridedMapping>
      requires (is_constructible_v<extents_type,
                                   typename _StridedMapping::extents_type>
                && _StridedMapping::is_always_unique()
                && _StridedMapping::is_always_strided())
      constexpr explicit(!(
        is_convertible_v<typename _StridedMapping::extents_type, extents_type>
        && __mdspan::__standardized_mapping<_StridedMapping>))
      mapping(const _StridedMapping& __other) noexcept
      : _M_extents(__other.extents())
      {
        using _OIdx = _StridedMapping::index_type;
        using _OExts = _StridedMapping::extents_type;

        // The source must map the all-zero index to offset zero.
        __glibcxx_assert(__mdspan::__offset(__other) == 0);
        static_assert(__mdspan::__representable_size<_OExts, index_type>,
          "The size of StridedMapping::extents_type must be representable as"
          " index_type");

        // A run-time range check is only needed when the source index
        // type can hold values larger than index_type can.
        if constexpr (cmp_greater(__gnu_cxx::__int_traits<_OIdx>::__max,
                                  __gnu_cxx::__int_traits<index_type>::__max))
          __glibcxx_assert(!cmp_less(
              __gnu_cxx::__int_traits<index_type>::__max,
              __other.required_span_size())
              && "other.required_span_size() must be representable"
                 " as index_type");

        if constexpr (extents_type::rank() > 0)
          for (size_t __k = 0; __k != extents_type::rank(); ++__k)
            _M_strides[__k] = index_type(__other.stride(__k));
      }

    constexpr mapping&
    operator=(const mapping&) noexcept = default;

    // Returns a reference to the stored extents.
    constexpr const extents_type&
    extents() const noexcept { return _M_extents; }

    // Returns a copy of all strides as a std::array.
    constexpr array<index_type, extents_type::rank()>
    strides() const noexcept
    {
      array<index_type, extents_type::rank()> __out;
      for (size_t __k = 0; __k != extents_type::rank(); ++__k)
        __out[__k] = _M_strides[__k];
      return __out;
    }

    // Returns 0 for empty extents, otherwise
    // 1 + sum over k of (extent(k) - 1) * stride(k).
    constexpr index_type
    required_span_size() const noexcept
    {
      if (__mdspan::__empty(_M_extents))
        return 0;

      index_type __span = 1;
      for (size_t __k = 0; __k != extents_type::rank(); ++__k)
        __span += (_M_extents.extent(__k) - 1) * _M_strides[__k];
      return __span;
    }

    // Maps exactly rank() indices to a linear offset via the stored
    // strides.
    // _GLIBCXX_RESOLVE_LIB_DEFECTS
    // 4314. Missing move in mdspan layout mapping::operator()
    template<__mdspan::__valid_index_type<index_type>... _Indices>
      requires (sizeof...(_Indices) == extents_type::rank())
      constexpr index_type
      operator()(_Indices... __indices) const noexcept
      {
        return __mdspan::__linear_index_strides(*this,
            static_cast<index_type>(std::move(__indices))...);
      }

    static constexpr bool
    is_always_unique() noexcept { return true; }

    // True for rank 0 and whenever a static extent is zero.
    // _GLIBCXX_RESOLVE_LIB_DEFECTS
    // 4266. layout_stride::mapping should treat empty mappings as exhaustive
    static constexpr bool
    is_always_exhaustive() noexcept
    {
      constexpr bool __rank_zero = (_Extents::rank() == 0);
      return __rank_zero || __mdspan::__contains_zero(
          __mdspan::__static_extents<extents_type>());
    }

    static constexpr bool
    is_always_strided() noexcept { return true; }

    static constexpr bool
    is_unique() noexcept { return true; }

    // A non-empty mapping is exhaustive when the number of elements
    // equals the required span size; an empty one always is.
    // _GLIBCXX_RESOLVE_LIB_DEFECTS
    // 4266. layout_stride::mapping should treat empty mappings as exhaustive
    constexpr bool
    is_exhaustive() const noexcept
    {
      if constexpr (is_always_exhaustive())
        return true;
      else
        {
          const auto __count = __mdspan::__size(_M_extents);
          return !(__count > 0) || __count == required_span_size();
        }
    }

    static constexpr bool
    is_strided() noexcept { return true; }

    // Returns the stored stride of dimension __r.
    constexpr index_type
    stride(rank_type __r) const noexcept { return _M_strides[__r]; }

    // Equal when the extents match, every stride matches, and the other
    // mapping has offset zero.
    template<__mdspan::__mapping_alike _OMapping>
      requires ((extents_type::rank() == _OMapping::extents_type::rank())
                && _OMapping::is_always_strided())
      friend constexpr bool
      operator==(const mapping& __self, const _OMapping& __other) noexcept
      {
        if (__self.extents() != __other.extents())
          return false;

        if constexpr (extents_type::rank() > 0)
          for (size_t __k = 0; __k != extents_type::rank(); ++__k)
            {
              const bool __same
                = cmp_equal(__self.stride(__k), __other.stride(__k));
              if (!__same)
                return false;
            }

        return __mdspan::__offset(__other) == 0;
      }

  private:
    using _Strides = typename __array_traits<index_type,
                                             extents_type::rank()>::_Type;

    [[no_unique_address]] extents_type _M_extents;
    [[no_unique_address]] _Strides _M_strides;
  };
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritence, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
#ifdef __glibcxx_padded_layouts
namespace __mdspan
{
// Smallest multiple of __x that is not less than __y.  When __x is 0 or
// 1 no rounding is applied and __y is returned unchanged.
constexpr size_t
__least_multiple(size_t __x, size_t __y)
{
  if (__x <= 1)
    return __y;

  // Round up by adding whatever is missing to reach the next multiple.
  const size_t __rem = __y % __x;
  return __rem == 0 ? __y : __y + (__x - __rem);
}
// Returns true when __y, rounded up to a multiple of __x (no rounding
// for __x <= 1), does not exceed the maximum value of _IndexType.  The
// rounded value itself is never formed, so the check cannot overflow.
template<typename _IndexType>
constexpr bool
__is_representable_least_multiple(size_t __x, size_t __y)
{
constexpr auto __y_max = __gnu_cxx::__int_traits<_IndexType>::__max;
// __y itself must already fit.
if(std::cmp_greater(__y, __y_max))
return false;
if(__x <= 1)
return true;
// Headroom left above __y versus the amount rounding up would add.
auto __max_delta = __y_max - static_cast<_IndexType>(__y);
auto __y_mod_x = __y % __x;
auto __delta = (__y_mod_x == 0) ? size_t(0) : (__x - __y_mod_x);
return std::cmp_less_equal(__delta, __max_delta);
}
// Satisfied when the statically known padded stride, if there is one, is
// representable as size_t.  Trivially satisfied for rank <= 1, for a
// dynamic padding value, or when the padded extent (the one selected by
// _LayoutTraits::_S_ext_idx) is dynamic.
template<typename _Extents, size_t _PaddingValue, typename _LayoutTraits,
size_t _Rank = _Extents::rank()>
concept __valid_static_stride = (_Extents::rank() <= 1)
|| (_PaddingValue == dynamic_extent)
|| (_Extents::static_extent(_LayoutTraits::_S_ext_idx) == dynamic_extent)
|| (__is_representable_least_multiple<size_t>(
_PaddingValue, _Extents::static_extent(_LayoutTraits::_S_ext_idx)));
// Compile-time check on the static extents in the range
// [_S_unpad_begin, _S_unpad_end) against the budget
// max(index_type) / _PaddedStride.
// NOTE(review): presumably __static_quotient divides the budget by each
// extent in turn, so a non-zero result means the padded size fits in
// index_type -- confirm against the helper's definition.
template<size_t _PaddedStride, typename _Extents,
typename _LayoutTraits>
consteval bool
__is_representable_padded_size()
{
using _IndexType = typename _Extents::index_type;
auto __sta_exts = __static_extents<_Extents>(
_LayoutTraits::_S_unpad_begin, _LayoutTraits::_S_unpad_end);
size_t __max = __gnu_cxx::__int_traits<_IndexType>::__max;
return __static_quotient(__sta_exts, __max / _PaddedStride) != 0;
}
// Satisfied when the padded size can be validated statically and passes
// __is_representable_padded_size.  Trivially satisfied for rank <= 1, a
// dynamic padded stride, extents that are not all static, or extents
// containing a static zero.
template<typename _Extents, size_t _PaddedStride, typename _LayoutTraits,
size_t _Rank = _Extents::rank()>
concept __valid_padded_size = (_Rank <= 1)
|| (_PaddedStride == dynamic_extent)
|| (!__all_static(__static_extents<_Extents>()))
|| (__contains_zero(__static_extents<_Extents>()))
|| (__is_representable_padded_size<_PaddedStride, _Extents,
_LayoutTraits>());
// Linear index for the left-padded layout.  The first index contributes
// with multiplier 1, the second with the padded stride
// (__stride.extent(0)), and each later index with the previous
// multiplier times the preceding extent.  Returns 0 for rank 0.
template<typename _Extents, typename _Stride, typename... _Indices>
constexpr typename _Extents::index_type
__linear_index_leftpad(const _Extents& __exts, _Stride __stride,
_Indices... __indices)
{
// i0 + stride*(i1 + extents.extent(1)*...)
using _IndexType = typename _Extents::index_type;
_IndexType __res = 0;
if constexpr (sizeof...(__indices) > 0)
{
_IndexType __mult = 1;
// Handles indices 1..n-1; __pos is the dimension being processed.
auto __update_rest = [&, __pos = 1u](_IndexType __idx) mutable
{
__res += __idx * __mult;
__mult *= __exts.extent(__pos);
++__pos;
};
// Handles index 0, then switches the multiplier to the padded stride.
auto __update = [&](_IndexType __idx, auto... __rest)
{
__res += __idx;
__mult = __stride.extent(0);
(__update_rest(__rest), ...);
};
__update(__indices...);
}
return __res;
}
// Linear index for the right-padded layout.  Mirrors the left-padded
// computation but walks the indices from the last dimension to the
// first: the last index has multiplier 1, the one before it the padded
// stride, and so on.  Returns 0 for rank 0.
template<typename _Extents, typename _Stride, typename... _Indices>
constexpr typename _Extents::index_type
__linear_index_rightpad(const _Extents& __exts, _Stride __stride,
_Indices... __indices)
{
// i[n-1] + stride*(i[n-2] + extents.extent(n-2)*...)
using _IndexType = typename _Extents::index_type;
_IndexType __res = 0;
if constexpr (sizeof...(__indices) > 0)
{
_IndexType __mult = 1;
// Indices are copied into an array so they can be read back to front.
array<_IndexType, sizeof...(__indices)> __ind_arr{__indices...};
// The lambda arguments only drive the pack expansion (one call per
// remaining index); the values are taken from __ind_arr.
auto __update_rest = [&, __pos = __exts.rank()-1](_IndexType) mutable
{
--__pos;
__res += __ind_arr[__pos] * __mult;
__mult *= __exts.extent(__pos);
};
auto __update = [&](_IndexType, auto... __rest)
{
__res += __ind_arr[__exts.rank() - 1];
__mult = __stride.extent(0);
(__update_rest(__rest), ...);
};
__update(__indices...);
}
return __res;
}
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritence, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
template<size_t _Rank>
struct _LeftPaddedLayoutTraits
{
using _LayoutSame = layout_left;
using _LayoutOther = layout_right;
constexpr static const size_t _S_ext_idx = 0;
constexpr static const size_t _S_stride_idx = 1;
constexpr static const size_t _S_unpad_begin = 1;
constexpr static const size_t _S_unpad_end = _Rank;
template<typename _IndexType, size_t _StaticStride, size_t..._Extents>
constexpr static auto
_S_make_padded_extent(
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritence, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
extents<_IndexType, _StaticStride> __stride,
const extents<_IndexType, _Extents...>& __exts)
{
auto __impl = [&]<size_t... _Is>(integer_sequence<size_t, _Is...>)
{
return extents<_IndexType, _StaticStride,
(_Extents...[_Is + 1])...>{
__stride.extent(0), __exts.extent(_Is + 1)...};
};
return __impl(make_index_sequence<sizeof...(_Extents) - 1>());
}
};
// Constants and helpers that specialise the shared padded-layout code
// for the right-padded case: the padded extent is the last dimension,
// its stride is reported for dimension _Rank - 2, and the unpadded
// extents are [0, _Rank - 1).
template<size_t _Rank>
struct _RightPaddedLayoutTraits
{
using _LayoutSame = layout_right;
using _LayoutOther = layout_left;
constexpr static size_t _S_ext_idx = _Rank - 1;
constexpr static size_t _S_stride_idx = _Rank - 2;
constexpr static size_t _S_unpad_begin = 0;
constexpr static size_t _S_unpad_end = _Rank - 1;
// Builds an extents object equal to __exts except that the last
// dimension is replaced by the padded stride.
template<typename _IndexType, size_t _StaticStride, size_t..._Extents>
constexpr static auto
_S_make_padded_extent(
extents<_IndexType, _StaticStride> __stride,
const extents<_IndexType, _Extents...>& __exts)
{
auto __impl = [&]<size_t... _Is>(integer_sequence<size_t, _Is...>)
{
return extents<_IndexType, (_Extents...[_Is])..., _StaticStride>{
__exts.extent(_Is)..., __stride.extent(0)};
};
return __impl(make_index_sequence<sizeof...(_Extents) - 1>());
}
};
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritence, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
template<size_t _PaddingValue, typename _Extents, typename _LayoutTraits>
class _PaddedStorage
{
using _LayoutSame = typename _LayoutTraits::_LayoutSame;
public:
using _IndexType = typename _Extents::index_type;
constexpr static size_t _S_rank = _Extents::rank();
// _GLIBCXX_RESOLVE_LIB_DEFECTS
// 4372. Weaken Mandates: for dynamic padding values in padded layouts
static_assert((_PaddingValue == dynamic_extent)
|| (cmp_less_equal(_PaddingValue,
__gnu_cxx::__int_traits<_IndexType>::__max)),
"padding_value must be representable as index_type");
static_assert(__representable_size<_Extents, _IndexType>,
"The size of extents_type must be representable as index_type");
static_assert(__valid_static_stride<_Extents, _PaddingValue,
_LayoutTraits>,
"The padded stride must be representable as size_t");
static constexpr size_t _S_static_stride = [] consteval
{
constexpr size_t __rank = _Extents::rank();
if constexpr (__rank <= 1)
return 0;
else
{
constexpr size_t __ext_idx = _LayoutTraits::_S_ext_idx;
constexpr size_t __sta_ext = _Extents::static_extent(__ext_idx);
if constexpr (__sta_ext == 0)
return size_t(0);
else if constexpr (_PaddingValue == dynamic_extent
|| __sta_ext == dynamic_extent)
return dynamic_extent;
else
return __least_multiple(_PaddingValue, __sta_ext);
}
}();
static_assert(_S_static_stride == dynamic_extent
|| cmp_less_equal(_S_static_stride,
__gnu_cxx::__int_traits<_IndexType>::__max),
"Padded stride must be representable as index_type");
static_assert(__valid_padded_size<_Extents, _S_static_stride,
_LayoutTraits>);
constexpr
_PaddedStorage() noexcept
{
if constexpr (_S_rank > 1)
if constexpr (_S_static_stride == dynamic_extent
&& _S_static_padextent() != dynamic_extent)
_M_stride = _Stride{_S_static_padextent()};
}
constexpr explicit
_PaddedStorage(const _Extents& __exts)
: _M_extents(__exts)
{
if constexpr (!__all_static(__static_extents<_Extents>()))
__glibcxx_assert(__is_representable_extents(_M_extents));
if constexpr (_S_rank > 1)
{
_IndexType __stride;
if constexpr (_PaddingValue == dynamic_extent)
__stride = _M_padextent();
else if constexpr (_S_static_padextent() != dynamic_extent)
return;
else
{
__glibcxx_assert(
__is_representable_least_multiple<_IndexType>(
_PaddingValue, _M_padextent()));
__stride = static_cast<_IndexType>(
__least_multiple(_PaddingValue, _M_padextent()));
__glibcxx_assert(__is_representable_extents(
_LayoutTraits::_S_make_padded_extent(
std::dextents<_IndexType, 1>{__stride},
_M_extents)));
}
_M_stride = _Stride{__stride};
}
}
constexpr explicit
_PaddedStorage(const _Extents& __exts, _IndexType __pad)
: _M_extents(__exts)
{
if constexpr (_PaddingValue != dynamic_extent)
__glibcxx_assert(cmp_equal(_PaddingValue, __pad));
if constexpr (_S_rank > 1 && _S_static_stride == dynamic_extent)
{
__glibcxx_assert(
__is_representable_least_multiple<_IndexType>(
__pad, _M_padextent()));
_M_stride = _Stride{static_cast<_IndexType>(
__least_multiple(__pad, _M_padextent()))};
__glibcxx_assert(__is_representable_extents(
_LayoutTraits::_S_make_padded_extent(
_M_stride, _M_extents)));
}
}
template<typename _OExtents>
constexpr explicit
_PaddedStorage(const typename _LayoutSame::mapping<_OExtents>&
__other)
: _PaddedStorage(_Extents(__other.extents()))
{
constexpr size_t __stride_idx = _LayoutTraits::_S_stride_idx;
constexpr size_t __ext_idx = _LayoutTraits::_S_ext_idx;
if constexpr (_S_rank > 1 && _PaddingValue != dynamic_extent)
{
static_assert(_S_static_stride == dynamic_extent
|| _OExtents::static_extent(__ext_idx) == dynamic_extent
|| _S_static_stride == _OExtents::static_extent(__ext_idx),
"The padded stride must be compatible with other");
if constexpr (_S_static_stride == dynamic_extent
|| _OExtents::static_extent(__stride_idx) == dynamic_extent)
__glibcxx_assert(std::cmp_equal(_M_padstride(),
_M_padextent()));
}
}
template<typename _OExtents>
constexpr explicit
_PaddedStorage(const typename layout_stride::mapping<_OExtents>&
__other)
: _M_extents(__other.extents())
{
__glibcxx_assert(cmp_less_equal(__other.required_span_size(),
__gnu_cxx::__int_traits<_IndexType>
::__max));
constexpr size_t __stride_idx = _LayoutTraits::_S_stride_idx;
if constexpr (_S_rank > 1)
{
if constexpr (_PaddingValue != dynamic_extent)
__glibcxx_assert(cmp_equal(__other.stride(__stride_idx),
_M_calc_padstride())
&& "The padded stride must be compatible with other");
if constexpr (_S_static_stride == dynamic_extent)
_M_stride = _Stride{__other.stride(__stride_idx)};
}
}
template<typename _SamePaddedMapping>
constexpr explicit
_PaddedStorage(_LayoutTraits::_LayoutSame,
const _SamePaddedMapping& __other)
: _M_extents(__other.extents())
{
if constexpr (_S_rank > 1)
{
static_assert(_PaddingValue == dynamic_extent
|| _SamePaddedMapping::padding_value == dynamic_extent
|| _PaddingValue == _SamePaddedMapping::padding_value,
"If neither PaddingValue is dynamic_extent, then they must "
"be equal");
constexpr size_t __stride_idx = _LayoutTraits::_S_stride_idx;
if constexpr (_PaddingValue != dynamic_extent)
__glibcxx_assert(cmp_equal(__other.stride(__stride_idx),
_M_calc_padstride())
&& "The padded stride must be compatible with other");
if constexpr (_S_static_stride == dynamic_extent)
_M_stride = _Stride{__other.stride(__stride_idx)};
}
__glibcxx_assert(cmp_less_equal(__other.required_span_size(),
__gnu_cxx::__int_traits<_IndexType>::__max));
}
template<typename _OtherPaddedMapping>
constexpr explicit
_PaddedStorage(_LayoutTraits::_LayoutOther,
const _OtherPaddedMapping& __other) noexcept
: _M_extents(__other.extents())
{
__glibcxx_assert(cmp_less_equal(__other.required_span_size(),
__gnu_cxx::__int_traits<_IndexType>::__max));
}
static constexpr bool
_M_is_always_exhaustive() noexcept
{
if constexpr (_S_rank <= 1)
return true;
else
return _S_static_padextent() != dynamic_extent
&& _S_static_stride != dynamic_extent
&& _S_static_padextent() == _S_static_stride;
}
constexpr bool
_M_is_exhaustive() const noexcept
{
if constexpr (_M_is_always_exhaustive())
return true;
else
return cmp_equal(_M_padextent(), _M_padstride());
}
constexpr static size_t
_S_static_padextent() noexcept
{ return _Extents::static_extent(_LayoutTraits::_S_ext_idx); }
constexpr _IndexType
_M_padextent() const noexcept
{ return _M_extents.extent(_LayoutTraits::_S_ext_idx); }
constexpr _IndexType
_M_calc_padstride() const noexcept
{
if constexpr (_S_static_stride != dynamic_extent)
return _S_static_stride;
else if constexpr (_PaddingValue != dynamic_extent)
return __least_multiple(_PaddingValue, _M_padextent());
else
return _M_padextent();
}
constexpr _IndexType
_M_padstride() const noexcept
{ return _M_stride.extent(0); }
constexpr _IndexType
_M_required_span_size() const noexcept
{
if constexpr (_S_rank == 0)
return 1;
else if (__mdspan::__empty(_M_extents))
return 0;
else
{
size_t __stride = static_cast<size_t>(_M_padstride());
size_t __prod_rest = __mdspan::__fwd_prod(_M_extents,
_LayoutTraits::_S_unpad_begin, _LayoutTraits::_S_unpad_end);
size_t __delta = _M_padstride() - _M_padextent();
return static_cast<_IndexType>(__stride * __prod_rest - __delta);
}
}
template<typename _SamePaddedMapping>
constexpr bool
_M_equal(const _SamePaddedMapping& __other) const noexcept
{
return _M_extents == __other.extents()
&& (_S_rank < 2
|| cmp_equal(_M_stride.extent(0),
__other.stride(_LayoutTraits::_S_stride_idx)));
}
using _Stride = std::extents<_IndexType, _S_static_stride>;
[[no_unique_address]] _Stride _M_stride;
[[no_unique_address]] _Extents _M_extents;
};
}
template<size_t _PaddingValue>
template<typename _Extents>
class layout_left_padded<_PaddingValue>::mapping
{
public:
static constexpr size_t padding_value = _PaddingValue;
using extents_type = _Extents;
using index_type = typename extents_type::index_type;
using size_type = typename extents_type::size_type;
using rank_type = typename extents_type::rank_type;
using layout_type = layout_left_padded<padding_value>;
private:
static constexpr size_t _S_rank = extents_type::rank();
using _PaddedStorage = __mdspan::_PaddedStorage<_PaddingValue,
_Extents, __mdspan::_LeftPaddedLayoutTraits<_S_rank>>;
[[no_unique_address]] _PaddedStorage _M_storage;
consteval friend size_t
__mdspan::__get_static_stride<mapping>();
// Shorthand for the extent of dimension __r of the stored extents.
constexpr index_type
_M_extent(size_t __r) const noexcept
{ return _M_storage._M_extents.extent(__r); }
// Shorthand for the padded stride held by the storage object.
constexpr index_type
_M_padstride() const noexcept
{ return _M_storage._M_stride.extent(0); }
public:
// Default constructor: _M_storage is default-constructed.
constexpr
mapping() noexcept
{ }
// Copy constructor is compiler-generated.
constexpr
mapping(const mapping&) noexcept = default;
// Construct from extents; the storage object derives the padded stride.
constexpr
mapping(const extents_type& __exts)
: _M_storage(__exts)
{ }
template<__mdspan::__valid_index_type<index_type> _OIndexType>
constexpr
mapping(const extents_type& __exts, _OIndexType __pad)
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritence, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
: _M_storage(__exts,
__mdspan::__index_type_cast<index_type>(std::move(__pad)))
{ }
// Converting constructor from layout_left::mapping.  Participates only
// when extents_type is constructible from _OExtents, and is explicit
// unless _OExtents converts implicitly to extents_type.
template<typename _OExtents>
requires is_constructible_v<extents_type, _OExtents>
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
mapping(const layout_left::mapping<_OExtents>& __other)
: _M_storage(__other)
{ }
// Converting constructor from layout_stride::mapping.  Participates only
// when extents_type is constructible from _OExtents (the direction in
// which the extents are actually converted), and is explicit for any
// non-zero rank.  The result must compare equal to the source mapping.
template<typename _OExtents>
  requires is_constructible_v<extents_type, _OExtents>
  constexpr explicit(_OExtents::rank() > 0)
  mapping(const typename layout_stride::mapping<_OExtents>& __other)
  : _M_storage(__other)
  { __glibcxx_assert(*this == __other); }
// Converting constructor from another left-padded mapping.
// Requires _LeftpadMapping to satisfy __is_left_padded_mapping and
// extents_type to be constructible from its extents_type.
// Explicit when the rank exceeds one and either this mapping has a static
// padding_value or the source has a dynamic one.
// _M_storage receives a layout_left tag together with the source mapping.
template<typename _LeftpadMapping>
requires __mdspan::__is_left_padded_mapping<_LeftpadMapping>
&& is_constructible_v<extents_type,
typename _LeftpadMapping::extents_type>
constexpr explicit(_S_rank > 1 && (padding_value != dynamic_extent
|| _LeftpadMapping::padding_value == dynamic_extent))
mapping(const _LeftpadMapping& __other)
: _M_storage(layout_left{}, __other)
{ }
// Converting constructor from a right-padded mapping or a layout_right
// mapping.  Only available when the rank is at most one and extents_type
// is constructible from the source extents_type.
// Explicit unless the source extents convert implicitly to extents_type.
// _M_storage receives a layout_right tag together with the source mapping.
template<typename _RightPaddedMapping>
requires (__mdspan::__is_right_padded_mapping<_RightPaddedMapping>
|| __mdspan::__mapping_of<layout_right, _RightPaddedMapping>)
&& (_S_rank <= 1)
&& is_constructible_v<extents_type,
typename _RightPaddedMapping::extents_type>
constexpr explicit(!is_convertible_v<
typename _RightPaddedMapping::extents_type, extents_type>)
mapping(const _RightPaddedMapping& __other) noexcept
: _M_storage(layout_right{}, __other)
{ }
// Defaulted, non-throwing copy assignment.
constexpr mapping&
operator=(const mapping&) noexcept = default;
// Returns a reference to the extents held in _M_storage.
constexpr const extents_type&
extents() const noexcept { return _M_storage._M_extents; }
// Returns the stride of every dimension.
// Dimension 0 has stride 1, dimension 1 has the padded stride, and each
// later dimension has the previous stride times the previous extent.
constexpr array<index_type, _S_rank>
strides() const noexcept
{
  array<index_type, _S_rank> __strides;
  if constexpr (_S_rank >= 1)
    __strides[0] = 1;
  if constexpr (_S_rank >= 2)
    {
      __strides[1] = _M_padstride();
      // For rank 2 the loop body never runs.
      for (size_t __k = 1; __k + 1 < _S_rank; ++__k)
        __strides[__k + 1] = __strides[__k] * _M_extent(__k);
    }
  return __strides;
}
// Forwards to _M_storage._M_required_span_size().
constexpr index_type
required_span_size() const noexcept
{ return _M_storage._M_required_span_size(); }
// _GLIBCXX_RESOLVE_LIB_DEFECTS
// 4314. Missing move in mdspan layout mapping::operator()
//
// Maps one index per dimension to a linear offset.
// Accepts exactly _S_rank indices; each is moved and cast to index_type
// before being passed, together with extents() and the stored stride, to
// __mdspan::__linear_index_leftpad.
template<__mdspan::__valid_index_type<index_type>... _Indices>
requires (sizeof...(_Indices) == _S_rank)
constexpr index_type
operator()(_Indices... __indices) const noexcept
{
return __mdspan::__linear_index_leftpad(
extents(), _M_storage._M_stride,
static_cast<index_type>(std::move(__indices))...);
}
// Static query; forwards to _PaddedStorage::_M_is_always_exhaustive().
static constexpr bool
is_always_exhaustive() noexcept
{ return _PaddedStorage::_M_is_always_exhaustive(); }
constexpr bool
is_exhaustive() noexcept
{ return _M_storage._M_is_exhaustive(); }
// Unconditionally reports true.
static constexpr bool
is_always_unique() noexcept { return true; }
// Unconditionally reports true; static, so no mapping state is consulted.
static constexpr bool
is_unique() noexcept { return true; }
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
// Unconditionally reports true.
static constexpr bool
is_always_strided() noexcept { return true; }
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
// Unconditionally reports true; static, so no mapping state is consulted.
static constexpr bool
is_strided() noexcept { return true; }
// Returns the stride of dimension __r, which must be less than the rank.
// Dimension 0 has stride 1.  Any other dimension has the padded stride
// multiplied by __fwd_prod(extents(), 1, __r); the product is formed in
// size_t and converted back to index_type.
constexpr index_type
stride(rank_type __r) const noexcept
{
  __glibcxx_assert(__r < _S_rank);
  if (__r != 0)
    {
      const size_t __pad = static_cast<size_t>(_M_padstride());
      const size_t __inner
        = static_cast<size_t>(__mdspan::__fwd_prod(extents(), 1, __r));
      return static_cast<index_type>(__pad * __inner);
    }
  return 1;
}
// Equality with any left-padded mapping of the same rank.
// The comparison itself is delegated to _M_storage._M_equal.
template<typename _LeftpadMapping>
requires(__mdspan::__is_left_padded_mapping<_LeftpadMapping>
&& _LeftpadMapping::extents_type::rank() == _S_rank)
friend constexpr bool
operator==(const mapping& __self, const _LeftpadMapping& __other)
noexcept
{ return __self._M_storage._M_equal(__other); }
};
// Layout mapping for layout_right_padded<_PaddingValue>.
//
// The extents and the padded stride are held in a _PaddedStorage
// instantiated with _RightPaddedLayoutTraits; most members forward to it.
template<size_t _PaddingValue>
  template<typename _Extents>
    class layout_right_padded<_PaddingValue>::mapping {
    public:
      static constexpr size_t padding_value = _PaddingValue;

      using extents_type = _Extents;
      using index_type = typename extents_type::index_type;
      using size_type = typename extents_type::size_type;
      using rank_type = typename extents_type::rank_type;
      using layout_type = layout_right_padded<_PaddingValue>;

    private:
      static constexpr size_t _S_rank = extents_type::rank();
      using _PaddedStorage = __mdspan::_PaddedStorage<_PaddingValue,
            _Extents, __mdspan::_RightPaddedLayoutTraits<_S_rank>>;
      [[no_unique_address]] _PaddedStorage _M_storage;

      consteval friend size_t
      __mdspan::__get_static_stride<mapping>();

      // Extent of dimension __r, read from the stored extents.
      constexpr index_type
      _M_extent(size_t __r) const noexcept
      { return _M_storage._M_extents.extent(__r); }

      // The padded stride, read from _M_storage._M_stride.
      constexpr index_type
      _M_padstride() const noexcept
      { return _M_storage._M_stride.extent(0); }

    public:
      constexpr
      mapping() noexcept
      { }

      constexpr
      mapping(const mapping&) noexcept = default;

      // Builds the mapping from extents only.
      constexpr
      mapping(const extents_type& __exts)
      : _M_storage(__exts)
      { }

      // Builds the mapping from extents and a padding value; the padding is
      // converted with __index_type_cast before reaching _M_storage.
      template<__mdspan::__valid_index_type<index_type> _OIndexType>
        constexpr
        mapping(const extents_type& __exts, _OIndexType __pad)
        : _M_storage(__exts,
            __mdspan::__index_type_cast<index_type>(std::move(__pad)))
        { }

      // Conversion from a layout_right mapping; explicit unless the source
      // extents convert implicitly to extents_type.
      template<typename _OExtents>
        requires is_constructible_v<extents_type, _OExtents>
        constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
        mapping(const layout_right::mapping<_OExtents>& __other)
        : _M_storage(__other)
        { }

      // Conversion from a layout_stride mapping.
      // The constraint is spelled as "extents_type is constructible from
      // _OExtents", the direction the conversion actually goes and the
      // form used by the layout_right overload of this class.
      template<typename _OExtents>
        requires is_constructible_v<extents_type, _OExtents>
        constexpr explicit(_OExtents::rank() > 0)
        mapping(const typename layout_stride::mapping<_OExtents>& __other)
        : _M_storage(__other)
        {
          // The freshly built mapping must compare equal to its source.
          __glibcxx_assert(*this == __other);
        }

      // Conversion from another right-padded mapping.  Explicit when the
      // rank exceeds one and either this padding is static or the source
      // padding is dynamic.
      template<typename _RightPaddedMapping>
        requires __mdspan::__is_right_padded_mapping<_RightPaddedMapping>
          && is_constructible_v<extents_type,
                                typename _RightPaddedMapping::extents_type>
        constexpr explicit(_S_rank > 1 && (padding_value != dynamic_extent
            || _RightPaddedMapping::padding_value == dynamic_extent))
        mapping(const _RightPaddedMapping& __other)
        : _M_storage(layout_right{}, __other)
        { }

      // Conversion from a left-padded or layout_left mapping; only
      // available when the rank is at most one.
      template<typename _LeftPaddedMapping>
        requires (__mdspan::__is_left_padded_mapping<_LeftPaddedMapping>
            || __mdspan::__mapping_of<layout_left, _LeftPaddedMapping>)
          && (_S_rank <= 1)
          && is_constructible_v<extents_type,
                                typename _LeftPaddedMapping::extents_type>
        constexpr explicit(!is_convertible_v<
            typename _LeftPaddedMapping::extents_type, extents_type>)
        mapping(const _LeftPaddedMapping& __other) noexcept
        : _M_storage(layout_left{}, __other)
        { }

      constexpr mapping& operator=(const mapping&) noexcept = default;

      constexpr const extents_type&
      extents() const noexcept { return _M_storage._M_extents; }

      // Returns the stride of every dimension.  The last dimension has
      // stride 1, the one before it has the padded stride, and each earlier
      // dimension has the next stride times the next extent.
      constexpr array<index_type, _S_rank>
      strides() const noexcept
      {
        array<index_type, _S_rank> __ret;
        if constexpr (_S_rank > 0)
          __ret[_S_rank - 1] = 1;
        if constexpr (_S_rank > 1)
          __ret[_S_rank - 2] = _M_padstride();
        if constexpr (_S_rank > 2)
          for(size_t __i = _S_rank - 2; __i > 0; --__i)
            __ret[__i - 1] = __ret[__i] * _M_extent(__i);
        return __ret;
      }

      constexpr index_type
      required_span_size() const noexcept
      { return _M_storage._M_required_span_size(); }

      // _GLIBCXX_RESOLVE_LIB_DEFECTS
      // 4314. Missing move in mdspan layout mapping::operator()
      //
      // Maps one index per dimension to a linear offset via
      // __mdspan::__linear_index_rightpad.
      template<__mdspan::__valid_index_type<index_type>... _Indices>
        requires (sizeof...(_Indices) == _S_rank)
        constexpr index_type
        operator()(_Indices... __indices) const noexcept
        {
          return __mdspan::__linear_index_rightpad(
            extents(), _M_storage._M_stride,
            static_cast<index_type>(std::move(__indices))...);
        }

      static constexpr bool
      is_always_exhaustive() noexcept
      { return _PaddedStorage::_M_is_always_exhaustive(); }

      // Const-qualified so that the query can be made on a const mapping;
      // mdspan::is_exhaustive() const calls it through a const member.
      constexpr bool
      is_exhaustive() const noexcept
      { return _M_storage._M_is_exhaustive(); }

      static constexpr bool
      is_always_unique() noexcept { return true; }

      static constexpr bool
      is_unique() noexcept { return true; }

      static constexpr bool
      is_always_strided() noexcept { return true; }

      static constexpr bool
      is_strided() noexcept { return true; }

      // Returns the stride of dimension __r, which must be less than the
      // rank.  Dimensions before the last two use the padded stride times
      // __fwd_prod(extents(), __r + 1, _S_rank - 1), formed in size_t.
      constexpr index_type
      stride(rank_type __r) const noexcept
      {
        __glibcxx_assert(__r < _S_rank);
        if constexpr (_S_rank <= 1)
          return 1;
        else if (__r == _S_rank - 1)
          return 1;
        else if (__r == _S_rank - 2)
          return _M_padstride();
        else
          return static_cast<index_type>(
            static_cast<size_t>(_M_padstride()) *
            static_cast<size_t>(__mdspan::__fwd_prod(
              extents(), __r + 1, _S_rank - 1)));
      }

      // Equality with any right-padded mapping of the same rank, delegated
      // to _M_storage._M_equal.
      template<typename _RightPaddedMapping>
        requires(__mdspan::__is_right_padded_mapping<_RightPaddedMapping>
            && _RightPaddedMapping::extents_type::rank() == _S_rank)
        friend constexpr bool
        operator==(const mapping& __self, const _RightPaddedMapping& __other)
        noexcept
        { return __self._M_storage._M_equal(__other); }
    };
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it, it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritance, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
#endif // __glibcxx_padded_layouts
// Accessor policy whose data handle is a plain pointer to _ElementType.
// access() yields a reference to the i-th element past the handle and
// offset() yields the handle advanced by i elements.
template<typename _ElementType>
  struct default_accessor
  {
    static_assert(!is_array_v<_ElementType>,
                  "ElementType must not be an array type");
    static_assert(!is_abstract_v<_ElementType>,
                  "ElementType must not be an abstract class type");

    using element_type = _ElementType;
    using data_handle_type = _ElementType*;
    using reference = _ElementType&;
    using offset_policy = default_accessor;

    constexpr
    default_accessor() noexcept = default;

    // Converts from an accessor over another element type whenever
    // arrays of that type convert to arrays of element_type.
    template<typename _OElementType>
      requires is_convertible_v<_OElementType(*)[], element_type(*)[]>
      constexpr
      default_accessor(default_accessor<_OElementType>) noexcept
      { }

    constexpr reference
    access(data_handle_type __p, size_t __i) const noexcept
    { return *(__p + __i); }

    constexpr data_handle_type
    offset(data_handle_type __p, size_t __i) const noexcept
    { return __p + __i; }
  };
#ifdef __glibcxx_aligned_accessor
// Accessor policy that passes the data handle through
// std::assume_aligned<_ByteAlignment> before using it.
// _ByteAlignment must be a power of two and at least
// alignof(_ElementType).
template<typename _ElementType, size_t _ByteAlignment>
  struct aligned_accessor
  {
    static_assert(has_single_bit(_ByteAlignment),
                  "ByteAlignment must be a power of two");
    static_assert(_ByteAlignment >= alignof(_ElementType));

    using element_type = _ElementType;
    using data_handle_type = _ElementType*;
    using reference = _ElementType&;
    // Offsetting yields a handle for default_accessor.
    using offset_policy = default_accessor<_ElementType>;

    static constexpr size_t byte_alignment = _ByteAlignment;

    constexpr
    aligned_accessor() noexcept = default;

    // Implicit conversion from an aligned accessor whose alignment is at
    // least as large as this one's.
    template<typename _OElementType, size_t _OByteAlignment>
      requires (_OByteAlignment >= byte_alignment)
        && is_convertible_v<_OElementType(*)[], element_type(*)[]>
      constexpr
      aligned_accessor(aligned_accessor<_OElementType, _OByteAlignment>)
      noexcept
      { }

    // Conversion from default_accessor must be requested explicitly.
    template<typename _OElementType>
      requires is_convertible_v<_OElementType(*)[], element_type(*)[]>
      constexpr explicit
      aligned_accessor(default_accessor<_OElementType>) noexcept
      { }

    // Implicit conversion to default_accessor.
    template<typename _OElementType>
      requires is_convertible_v<element_type(*)[], _OElementType(*)[]>
      constexpr
      operator default_accessor<_OElementType>() const noexcept
      { return {}; }

    constexpr reference
    access(data_handle_type __p, size_t __i) const noexcept
    {
      auto* __base = std::assume_aligned<byte_alignment>(__p);
      return __base[__i];
    }

    constexpr typename offset_policy::data_handle_type
    offset(data_handle_type __p, size_t __i) const noexcept
    {
      auto* __base = std::assume_aligned<byte_alignment>(__p);
      return __base + __i;
    }
  };
#endif
namespace __mdspan
{
  // Reports whether every entry of __indices is strictly below the
  // extent of the corresponding dimension of __exts.
  // The span length must equal the rank of __exts.
  template<typename _Extents, typename _IndexType, size_t _Nm>
    constexpr bool
    __is_multi_index(const _Extents& __exts, span<_IndexType, _Nm> __indices)
    {
      static_assert(__exts.rank() == _Nm);
      size_t __k = 0;
      // Stop at the first index that reaches or exceeds its extent.
      while (__k < __exts.rank() && !(__indices[__k] >= __exts.extent(__k)))
        ++__k;
      return __k == __exts.rank();
    }
}
template<typename _ElementType, typename _Extents,
typename _LayoutPolicy = layout_right,
typename _AccessorPolicy = default_accessor<_ElementType>>
class mdspan
{
static_assert(!is_array_v<_ElementType>,
"ElementType must not be an array type");
static_assert(!is_abstract_v<_ElementType>,
"ElementType must not be an abstract class type");
static_assert(__mdspan::__is_extents<_Extents>,
"Extents must be a specialization of std::extents");
static_assert(is_same_v<_ElementType,
typename _AccessorPolicy::element_type>);
public:
using extents_type = _Extents;
using layout_type = _LayoutPolicy;
using accessor_type = _AccessorPolicy;
using mapping_type = typename layout_type::template mapping<extents_type>;
using element_type = _ElementType;
using value_type = remove_cv_t<element_type>;
using index_type = typename extents_type::index_type;
using size_type = typename extents_type::size_type;
using rank_type = typename extents_type::rank_type;
using data_handle_type = typename accessor_type::data_handle_type;
using reference = typename accessor_type::reference;
static constexpr rank_type
rank() noexcept { return extents_type::rank(); }
static constexpr rank_type
rank_dynamic() noexcept { return extents_type::rank_dynamic(); }
static constexpr size_t
static_extent(rank_type __r) noexcept
{ return extents_type::static_extent(__r); }
constexpr index_type
extent(rank_type __r) const noexcept { return extents().extent(__r); }
constexpr
mdspan()
requires (rank_dynamic() > 0)
&& is_default_constructible_v<data_handle_type>
&& is_default_constructible_v<mapping_type>
&& is_default_constructible_v<accessor_type> = default;
constexpr
mdspan(const mdspan& __other) = default;
constexpr
mdspan(mdspan&& __other) = default;
template<__mdspan::__valid_index_type<index_type>... _OIndexTypes>
requires (sizeof...(_OIndexTypes) == rank()
|| sizeof...(_OIndexTypes) == rank_dynamic())
&& is_constructible_v<mapping_type, extents_type>
&& is_default_constructible_v<accessor_type>
constexpr explicit
mdspan(data_handle_type __handle, _OIndexTypes... __exts)
: _M_accessor(),
_M_mapping(_Extents(static_cast<index_type>(std::move(__exts))...)),
_M_handle(std::move(__handle))
{ }
template<typename _OIndexType, size_t _Nm>
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
&& (_Nm == rank() || _Nm == rank_dynamic())
&& is_constructible_v<mapping_type, extents_type>
&& is_default_constructible_v<accessor_type>
constexpr explicit(_Nm != rank_dynamic())
mdspan(data_handle_type __handle, span<_OIndexType, _Nm> __exts)
: _M_accessor(), _M_mapping(extents_type(__exts)),
_M_handle(std::move(__handle))
{ }
template<typename _OIndexType, size_t _Nm>
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
&& (_Nm == rank() || _Nm == rank_dynamic())
&& is_constructible_v<mapping_type, extents_type>
&& is_default_constructible_v<accessor_type>
constexpr explicit(_Nm != rank_dynamic())
mdspan(data_handle_type __handle, const array<_OIndexType, _Nm>& __exts)
: _M_accessor(), _M_mapping(extents_type(__exts)),
_M_handle(std::move(__handle))
{ }
constexpr
mdspan(data_handle_type __handle, const extents_type& __exts)
requires is_constructible_v<mapping_type, const extents_type&>
&& is_default_constructible_v<accessor_type>
: _M_accessor(), _M_mapping(__exts), _M_handle(std::move(__handle))
{ }
constexpr
mdspan(data_handle_type __handle, const mapping_type& __mapping)
requires is_default_constructible_v<accessor_type>
: _M_accessor(), _M_mapping(__mapping), _M_handle(std::move(__handle))
{ }
constexpr
mdspan(data_handle_type __handle, const mapping_type& __mapping,
const accessor_type& __accessor)
: _M_accessor(__accessor), _M_mapping(__mapping),
libstdc++: Implement std::layout_left_padded [PR110352]. This commit adds a new layout layout_left_padded as standardized in N5014. It adds a purely internal feature testing macro padded_layouts and registers layout_left_padded in the std module. This commit implements LWG4372, because without it's not possible to properly test padded layouts with a dynamic padding value. It also implements LWG4314, for consistency with prior layouts. The implementation uses a _PaddedStorage to deduplicate most of the code shared between left- and right-padded layouts. It's implemented through aggregation rather than inheritence, because of a bug related to inheriting conditionally explicit ctors. The tests are written such that the canonical version works for layout_left_padded. A version for layout_right_padded is derived essentially by reversing the order of the extents. PR libstdc++/110352 libstdc++-v3/ChangeLog: * include/bits/version.def (padded_layouts): Add new internal feature testing macro. * include/bits/version.h: Regenerate. * include/std/mdspan (__fwd_prod): New overload. (layout_left_padded): Add declaration and implementation. (layout_right_padded): Add declaration only. (layout_left::mapping::mapping): New overload for left padded mappings. (__index_type_cast): New function that performs a checked cast to index_type. (__is_left_padded_mapping): New concept. (__is_right_padded_mapping): Ditto. (__standardized_mapping): Recognize left and right padded mappings. (_LeftPaddedIndices): Traits for left padded details. (_PaddedStorage): New class for implementing padded layouts. * src/c++23/std.cc.in (layout_left_padded): Add. * testsuite/23_containers/mdspan/layouts/class_mandate_neg.cc: Refactor and add tests for layout_left_padded. * testsuite/23_containers/mdspan/layouts/ctors.cc: Ditto. * testsuite/23_containers/mdspan/layouts/empty.cc: Ditto. * testsuite/23_containers/mdspan/layouts/mapping.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded.cc: Ditto. 
* testsuite/23_containers/mdspan/layouts/padded_neg.cc: Ditto. * testsuite/23_containers/mdspan/layouts/padded_traits.h: New traits. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-09-29 08:00:18 +02:00
_M_handle(std::move(__handle))
{ }
template<typename _OElementType, typename _OExtents, typename _OLayout,
typename _OAccessor>
requires is_constructible_v<mapping_type,
const typename _OLayout::template mapping<_OExtents>&>
&& is_constructible_v<accessor_type, const _OAccessor&>
constexpr explicit(!is_convertible_v<
const typename _OLayout::template mapping<_OExtents>&, mapping_type>
|| !is_convertible_v<const _OAccessor&, accessor_type>)
mdspan(const mdspan<_OElementType, _OExtents, _OLayout, _OAccessor>&
__other)
: _M_accessor(__other.accessor()), _M_mapping(__other.mapping()),
_M_handle(__other.data_handle())
{
static_assert(is_constructible_v<data_handle_type,
const typename _OAccessor::data_handle_type&>);
static_assert(is_constructible_v<extents_type, _OExtents>);
}
constexpr mdspan&
operator=(const mdspan& __other) = default;
constexpr mdspan&
operator=(mdspan&& __other) = default;
template<__mdspan::__valid_index_type<index_type>... _OIndexTypes>
requires (sizeof...(_OIndexTypes) == rank())
constexpr reference
operator[](_OIndexTypes... __indices) const
{
auto __checked_call = [this](auto... __idxs) -> index_type
{
if constexpr (sizeof...(__idxs) > 0)
__glibcxx_assert(__mdspan::__is_multi_index(extents(),
span<const index_type, sizeof...(__idxs)>({__idxs...})));
return _M_mapping(__idxs...);
};
auto __index = __checked_call(
static_cast<index_type>(std::move(__indices))...);
return _M_accessor.access(_M_handle, __index);
}
template<typename _OIndexType>
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
constexpr reference
operator[](span<_OIndexType, rank()> __indices) const
{
auto __call = [&]<size_t... _Counts>(index_sequence<_Counts...>)
-> reference
{ return (*this)[index_type(as_const(__indices[_Counts]))...]; };
return __call(make_index_sequence<rank()>());
}
template<typename _OIndexType>
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
constexpr reference
operator[](const array<_OIndexType, rank()>& __indices) const
{ return (*this)[span<const _OIndexType, rank()>(__indices)]; }
constexpr size_type
size() const noexcept
{
__glibcxx_assert(cmp_less_equal(_M_mapping.required_span_size(),
__gnu_cxx::__int_traits<size_t>
::__max));
return size_type(__mdspan::__size(extents()));
}
[[nodiscard]]
constexpr bool
empty() const noexcept
{ return __mdspan::__empty(extents()); }
friend constexpr void
swap(mdspan& __x, mdspan& __y) noexcept
{
using std::swap;
swap(__x._M_mapping, __y._M_mapping);
swap(__x._M_accessor, __y._M_accessor);
swap(__x._M_handle, __y._M_handle);
}
constexpr const extents_type&
extents() const noexcept { return _M_mapping.extents(); }
constexpr const data_handle_type&
data_handle() const noexcept { return _M_handle; }
constexpr const mapping_type&
mapping() const noexcept { return _M_mapping; }
constexpr const accessor_type&
accessor() const noexcept { return _M_accessor; }
// Strengthened noexcept for all `is_*` methods.
static constexpr bool
is_always_unique() noexcept(noexcept(mapping_type::is_always_unique()))
{ return mapping_type::is_always_unique(); }
static constexpr bool
is_always_exhaustive()
noexcept(noexcept(mapping_type::is_always_exhaustive()))
{ return mapping_type::is_always_exhaustive(); }
static constexpr bool
is_always_strided()
noexcept(noexcept(mapping_type::is_always_strided()))
{ return mapping_type::is_always_strided(); }
constexpr bool
is_unique() const noexcept(noexcept(_M_mapping.is_unique()))
{ return _M_mapping.is_unique(); }
constexpr bool
is_exhaustive() const noexcept(noexcept(_M_mapping.is_exhaustive()))
{ return _M_mapping.is_exhaustive(); }
constexpr bool
is_strided() const noexcept(noexcept(_M_mapping.is_strided()))
{ return _M_mapping.is_strided(); }
constexpr index_type
stride(rank_type __r) const { return _M_mapping.stride(__r); }
private:
[[no_unique_address]] accessor_type _M_accessor = accessor_type();
[[no_unique_address]] mapping_type _M_mapping = mapping_type();
[[no_unique_address]] data_handle_type _M_handle = data_handle_type();
};
// Deduction guides for mdspan.

// From a one-dimensional C array: the element type is the array's element
// type and the single static extent is the array bound.
template<typename _CArray>
requires is_array_v<_CArray> && (rank_v<_CArray> == 1)
mdspan(_CArray&)
-> mdspan<remove_all_extents_t<_CArray>,
extents<size_t, extent_v<_CArray, 0>>>;
// From a lone pointer: rank-zero extents.
template<typename _Pointer>
requires is_pointer_v<remove_reference_t<_Pointer>>
mdspan(_Pointer&&)
-> mdspan<remove_pointer_t<remove_reference_t<_Pointer>>, extents<size_t>>;
// From a pointer and one or more values convertible to size_t; each
// extent is chosen by __detail::__maybe_static_ext.
template<typename _ElementType, typename... _Integrals>
requires (is_convertible_v<_Integrals, size_t> && ...)
&& (sizeof...(_Integrals) > 0)
explicit mdspan(_ElementType*, _Integrals...)
-> mdspan<_ElementType,
extents<size_t, __detail::__maybe_static_ext<_Integrals>...>>;
// From a pointer and a span of extents: _Nm dynamic extents.
template<typename _ElementType, typename _OIndexType, size_t _Nm>
mdspan(_ElementType*, span<_OIndexType, _Nm>)
-> mdspan<_ElementType, dextents<size_t, _Nm>>;
// From a pointer and an array of extents: _Nm dynamic extents.
template<typename _ElementType, typename _OIndexType, size_t _Nm>
mdspan(_ElementType*, const array<_OIndexType, _Nm>&)
-> mdspan<_ElementType, dextents<size_t, _Nm>>;
// From a pointer and an extents object: the extents type is kept as is.
template<typename _ElementType, typename _IndexType, size_t... _ExtentsPack>
mdspan(_ElementType*, const extents<_IndexType, _ExtentsPack...>&)
-> mdspan<_ElementType, extents<_IndexType, _ExtentsPack...>>;
// From a pointer and a mapping: extents and layout come from the mapping.
template<typename _ElementType, typename _MappingType>
mdspan(_ElementType*, const _MappingType&)
-> mdspan<_ElementType, typename _MappingType::extents_type,
typename _MappingType::layout_type>;
// From a data handle, a mapping and an accessor: the element type comes
// from the accessor, extents and layout from the mapping.
template<typename _MappingType, typename _AccessorType>
mdspan(const typename _AccessorType::data_handle_type&, const _MappingType&,
const _AccessorType&)
-> mdspan<typename _AccessorType::element_type,
typename _MappingType::extents_type,
typename _MappingType::layout_type, _AccessorType>;
_GLIBCXX_END_NAMESPACE_VERSION
}
#endif
#endif