2025-04-29 14:46:08 +02:00
|
|
|
// <mdspan> -*- C++ -*-
|
|
|
|
|
|
|
|
|
|
// Copyright The GNU Toolchain Authors.
|
|
|
|
|
//
|
|
|
|
|
// This file is part of the GNU ISO C++ Library. This library is free
|
|
|
|
|
// software; you can redistribute it and/or modify it under the
|
|
|
|
|
// terms of the GNU General Public License as published by the
|
|
|
|
|
// Free Software Foundation; either version 3, or (at your option)
|
|
|
|
|
// any later version.
|
|
|
|
|
|
|
|
|
|
// This library is distributed in the hope that it will be useful,
|
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
// GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
// Under Section 7 of GPL version 3, you are granted additional
|
|
|
|
|
// permissions described in the GCC Runtime Library Exception, version
|
|
|
|
|
// 3.1, as published by the Free Software Foundation.
|
|
|
|
|
|
|
|
|
|
// You should have received a copy of the GNU General Public License and
|
|
|
|
|
// a copy of the GCC Runtime Library Exception along with this program;
|
|
|
|
|
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
|
|
|
// <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
/** @file mdspan
|
|
|
|
|
* This is a Standard C++ Library header.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#ifndef _GLIBCXX_MDSPAN
|
|
|
|
|
#define _GLIBCXX_MDSPAN 1
|
|
|
|
|
|
|
|
|
|
#ifdef _GLIBCXX_SYSHDR
|
|
|
|
|
#pragma GCC system_header
|
|
|
|
|
#endif
|
|
|
|
|
|
2025-04-29 14:46:09 +02:00
|
|
|
#include <span>
|
|
|
|
|
#include <array>
|
|
|
|
|
#include <type_traits>
|
|
|
|
|
#include <utility>
|
|
|
|
|
|
2025-08-04 12:59:27 +02:00
|
|
|
#if __cplusplus > 202302L
|
|
|
|
|
#include <bits/align.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
2025-04-29 14:46:08 +02:00
|
|
|
#define __glibcxx_want_mdspan
|
2025-08-04 12:59:27 +02:00
|
|
|
#define __glibcxx_want_aligned_accessor
|
2025-10-02 11:27:14 +02:00
|
|
|
#define __glibcxx_want_submdspan
|
2025-04-29 14:46:08 +02:00
|
|
|
#include <bits/version.h>
|
|
|
|
|
|
|
|
|
|
#ifdef __glibcxx_mdspan
|
|
|
|
|
|
|
|
|
|
namespace std _GLIBCXX_VISIBILITY(default)
|
|
|
|
|
{
|
|
|
|
|
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
2025-04-29 14:46:09 +02:00
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note that on -O3 the
optimizer eliminates the comparison and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison also for -O2. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
This shows that eliminating the trivial comparison unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
// Compile-time predicate: true when no entry of __extents equals
// dynamic_extent.  An empty span yields true.
consteval bool
__all_static(std::span<const size_t> __extents)
{
  for (size_t __i = 0; __i < __extents.size(); ++__i)
    {
      if (__extents[__i] == dynamic_extent)
        return false;
    }
  return true;
}
|
|
|
|
|
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
// Compile-time predicate: true when every entry of __extents equals
// dynamic_extent.  An empty span yields true.
consteval bool
__all_dynamic(std::span<const size_t> __extents)
{
  for (size_t __i = 0; __i < __extents.size(); ++__i)
    {
      if (__extents[__i] != dynamic_extent)
        return false;
    }
  return true;
}
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
template<typename _IndexType, typename _OIndexType>
|
|
|
|
|
constexpr _IndexType
|
|
|
|
|
__index_type_cast(_OIndexType&& __other)
|
|
|
|
|
{
|
|
|
|
|
if constexpr (std::is_integral_v<_OIndexType>)
|
|
|
|
|
{
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr _IndexType __index_type_max
|
|
|
|
|
= __gnu_cxx::__int_traits<_IndexType>::__max;
|
|
|
|
|
constexpr _OIndexType __oindex_type_max
|
|
|
|
|
= __gnu_cxx::__int_traits<_OIndexType>::__max;
|
|
|
|
|
|
|
|
|
|
if constexpr (__index_type_max < __oindex_type_max)
|
|
|
|
|
__glibcxx_assert(cmp_less_equal(__other, __index_type_max));
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
if constexpr (std::is_signed_v<_OIndexType>)
|
|
|
|
|
__glibcxx_assert(__other >= 0);
|
2025-09-30 12:55:18 +02:00
|
|
|
return static_cast<_IndexType>(__other);
|
2025-09-29 08:00:18 +02:00
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
auto __ret = static_cast<_IndexType>(std::move(__other));
|
|
|
|
|
if constexpr (std::is_signed_v<_IndexType>)
|
|
|
|
|
__glibcxx_assert(__ret >= 0);
|
|
|
|
|
return __ret;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-03 22:57:23 +02:00
|
|
|
template<array _Extents>
|
|
|
|
|
class _StaticExtents
|
2025-04-29 14:46:09 +02:00
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
static constexpr size_t _S_rank = _Extents.size();
|
|
|
|
|
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commits eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
// For __r in [0, _S_rank], _S_dynamic_index(__r) is the number
|
2025-04-29 14:46:09 +02:00
|
|
|
// of dynamic extents up to (and not including) __r.
|
|
|
|
|
//
|
|
|
|
|
// If __r is the index of a dynamic extent, then
|
|
|
|
|
// _S_dynamic_index[__r] is the index of that extent in
|
2025-06-04 16:58:46 +02:00
|
|
|
// _M_dyn_exts.
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commits eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
static constexpr size_t
|
|
|
|
|
_S_dynamic_index(size_t __r) noexcept
|
|
|
|
|
{ return _S_dynamic_index_data[__r]; }
|
|
|
|
|
|
|
|
|
|
static constexpr auto _S_dynamic_index_data = [] consteval
|
2025-04-29 14:46:09 +02:00
|
|
|
{
|
|
|
|
|
array<size_t, _S_rank+1> __ret;
|
|
|
|
|
size_t __dyn = 0;
|
2025-06-04 16:58:46 +02:00
|
|
|
for (size_t __i = 0; __i < _S_rank; ++__i)
|
2025-04-29 14:46:09 +02:00
|
|
|
{
|
|
|
|
|
__ret[__i] = __dyn;
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
308: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note, that on -O3 the
optimizer eliminates the comparison; and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison also for -O2. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
Which shows eliminating the trivial comparison, unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
__dyn += (_Extents[__i] == dynamic_extent);
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
__ret[_S_rank] = __dyn;
|
|
|
|
|
return __ret;
|
|
|
|
|
}();
|
|
|
|
|
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commits eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
static constexpr size_t _S_rank_dynamic = _S_dynamic_index(_S_rank);
|
2025-04-29 14:46:09 +02:00
|
|
|
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commits eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
// For __r in [0, _S_rank_dynamic), _S_dynamic_index_inv(__r) is the
|
2025-04-29 14:46:09 +02:00
|
|
|
// index of the __r-th dynamic extent in _Extents.
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commits eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
static constexpr size_t
|
|
|
|
|
_S_dynamic_index_inv(size_t __r) noexcept
|
|
|
|
|
{ return _S_dynamic_index_inv_data[__r]; }
|
|
|
|
|
|
|
|
|
|
static constexpr auto _S_dynamic_index_inv_data = [] consteval
|
2025-04-29 14:46:09 +02:00
|
|
|
{
|
|
|
|
|
array<size_t, _S_rank_dynamic> __ret;
|
|
|
|
|
for (size_t __i = 0, __r = 0; __i < _S_rank; ++__i)
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
308: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note, that on -O3 the
optimizer eliminates the comparison; and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison also for -O2. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
Which shows eliminating the trivial comparison, unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
if (_Extents[__i] == dynamic_extent)
|
2025-04-29 14:46:09 +02:00
|
|
|
__ret[__r++] = __i;
|
|
|
|
|
return __ret;
|
|
|
|
|
}();
|
|
|
|
|
|
|
|
|
|
static constexpr size_t
|
|
|
|
|
_S_static_extent(size_t __r) noexcept
|
|
|
|
|
{ return _Extents[__r]; }
|
2025-08-03 22:57:23 +02:00
|
|
|
};
|
|
|
|
|
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
// Partial specialization of _StaticExtents that is selected when
// __all_dynamic<_Extents>() is true. Every member below is a constant
// or a trivial function; nothing is looked up in a table.
template<array _Extents>
|
|
|
|
|
requires (__all_dynamic<_Extents>())
|
|
|
|
|
class _StaticExtents<_Extents>
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
// Number of entries in _Extents.
static constexpr size_t _S_rank = _Extents.size();
|
|
|
|
|
|
|
|
|
|
// Returns __r unchanged.
static constexpr size_t
|
|
|
|
|
_S_dynamic_index(size_t __r) noexcept
|
|
|
|
|
{ return __r; }
|
|
|
|
|
|
|
|
|
|
// In this specialization the dynamic rank equals the rank.
static constexpr size_t _S_rank_dynamic = _S_rank;
|
|
|
|
|
|
|
|
|
|
// Returns __k unchanged.
static constexpr size_t
|
|
|
|
|
_S_dynamic_index_inv(size_t __k) noexcept
|
|
|
|
|
{ return __k; }
|
|
|
|
|
|
|
|
|
|
// Returns dynamic_extent regardless of the (unnamed) index argument.
static constexpr size_t
|
|
|
|
|
_S_static_extent(size_t) noexcept
|
|
|
|
|
{ return dynamic_extent; }
|
|
|
|
|
};
|
|
|
|
|
|
2025-08-03 22:57:23 +02:00
|
|
|
template<typename _IndexType, array _Extents>
|
|
|
|
|
class _ExtentsStorage : public _StaticExtents<_Extents>
|
|
|
|
|
{
|
|
|
|
|
private:
|
2025-09-10 12:10:29 +02:00
|
|
|
using _Base = _StaticExtents<_Extents>;
|
2025-08-03 22:57:23 +02:00
|
|
|
|
|
|
|
|
public:
|
2025-09-10 12:10:29 +02:00
|
|
|
using _Base::_S_rank;
|
|
|
|
|
using _Base::_S_rank_dynamic;
|
|
|
|
|
using _Base::_S_dynamic_index;
|
|
|
|
|
using _Base::_S_dynamic_index_inv;
|
|
|
|
|
using _Base::_S_static_extent;
|
2025-08-03 22:57:23 +02:00
|
|
|
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note that on -O3 the
optimizer eliminates the comparison and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison for -O2 as well. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
This shows that eliminating the trivial comparison unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
// Returns whether the extent at index __r is dynamic.
// The first two branches are chosen with if constexpr and do not look at
// __r: the result is false when __all_static(_Extents) holds and true when
// __all_dynamic(_Extents) holds. Otherwise _Extents[__r] is compared
// against dynamic_extent.
static constexpr bool
|
|
|
|
|
_S_is_dynamic(size_t __r) noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (__all_static(_Extents))
|
|
|
|
|
return false;
|
|
|
|
|
else if constexpr (__all_dynamic(_Extents))
|
|
|
|
|
return true;
|
|
|
|
|
else
|
|
|
|
|
return _Extents[__r] == dynamic_extent;
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-03 22:57:23 +02:00
|
|
|
// Converts __other to _IndexType with an explicit functional cast.
template<typename _OIndexType>
|
|
|
|
|
static constexpr _IndexType
|
|
|
|
|
_S_int_cast(const _OIndexType& __other) noexcept
|
|
|
|
|
{ return _IndexType(__other); }
|
2025-04-29 14:46:09 +02:00
|
|
|
|
|
|
|
|
constexpr _IndexType
|
|
|
|
|
_M_extent(size_t __r) const noexcept
|
|
|
|
|
{
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note that on -O3 the
optimizer eliminates the comparison and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison for -O2 as well. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
This shows that eliminating the trivial comparison unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
if (_S_is_dynamic(__r))
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
return _M_dyn_exts[_S_dynamic_index(__r)];
|
2025-04-29 14:46:09 +02:00
|
|
|
else
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note that on -O3 the
optimizer eliminates the comparison and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison for -O2 as well. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
This shows that eliminating the trivial comparison unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
return _S_static_extent(__r);
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
|
2025-07-04 10:29:44 +02:00
|
|
|
template<size_t _OtherRank, typename _GetOtherExtent>
|
|
|
|
|
static constexpr bool
|
|
|
|
|
_S_is_compatible_extents(_GetOtherExtent __get_extent) noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_OtherRank == _S_rank)
|
|
|
|
|
for (size_t __i = 0; __i < _S_rank; ++__i)
|
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note that on -O3 the
optimizer eliminates the comparison and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison for -O2 as well. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
This shows that eliminating the trivial comparison unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:28 +02:00
|
|
|
if (!_S_is_dynamic(__i)
|
2025-07-04 10:29:44 +02:00
|
|
|
&& !cmp_equal(_Extents[__i], _S_int_cast(__get_extent(__i))))
|
|
|
|
|
return false;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-29 14:46:09 +02:00
|
|
|
template<size_t _OtherRank, typename _GetOtherExtent>
|
|
|
|
|
constexpr void
|
|
|
|
|
_M_init_dynamic_extents(_GetOtherExtent __get_extent) noexcept
|
|
|
|
|
{
|
2025-07-04 10:29:44 +02:00
|
|
|
__glibcxx_assert(_S_is_compatible_extents<_OtherRank>(__get_extent));
|
2025-06-04 16:58:46 +02:00
|
|
|
for (size_t __i = 0; __i < _S_rank_dynamic; ++__i)
|
2025-04-29 14:46:09 +02:00
|
|
|
{
|
|
|
|
|
size_t __di = __i;
|
|
|
|
|
if constexpr (_OtherRank != _S_rank_dynamic)
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
__di = _S_dynamic_index_inv(__i);
|
2025-06-04 16:58:46 +02:00
|
|
|
_M_dyn_exts[__i] = _S_int_cast(__get_extent(__di));
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Defaulted, non-throwing default constructor.
constexpr
|
|
|
|
|
_ExtentsStorage() noexcept = default;
|
|
|
|
|
|
|
|
|
|
// Converting constructor from another _ExtentsStorage specialization
// (index type _OIndexType, static extents _OExtents).
// Forwards to _M_init_dynamic_extents<_S_rank> with a callable that
// returns __other._M_extent(__i) for the requested index.
template<typename _OIndexType, array _OExtents>
|
|
|
|
|
constexpr
|
|
|
|
|
_ExtentsStorage(const _ExtentsStorage<_OIndexType, _OExtents>&
|
|
|
|
|
__other) noexcept
|
|
|
|
|
{
|
|
|
|
|
_M_init_dynamic_extents<_S_rank>([&__other](size_t __i)
|
|
|
|
|
{ return __other._M_extent(__i); });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Constructs from a span holding _Nm values of type _OIndexType.
// Forwards to _M_init_dynamic_extents<_Nm> with a callable that returns
// a const reference to __exts[__i].
template<typename _OIndexType, size_t _Nm>
|
|
|
|
|
constexpr
|
|
|
|
|
_ExtentsStorage(span<const _OIndexType, _Nm> __exts) noexcept
|
|
|
|
|
{
|
|
|
|
|
_M_init_dynamic_extents<_Nm>(
|
|
|
|
|
[&__exts](size_t __i) -> const _OIndexType&
|
|
|
|
|
{ return __exts[__i]; });
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-03 22:57:23 +02:00
|
|
|
// Returns a const reference to the _Extents array.
static constexpr const array<size_t, _S_rank>&
|
|
|
|
|
_S_static_extents() noexcept
|
|
|
|
|
{ return _Extents; }
|
2025-06-04 16:58:47 +02:00
|
|
|
|
|
|
|
|
constexpr span<const _IndexType>
|
|
|
|
|
_M_dynamic_extents(size_t __begin, size_t __end) const noexcept
|
|
|
|
|
requires (_Extents.size() > 0)
|
|
|
|
|
{
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
return {_M_dyn_exts + _S_dynamic_index(__begin),
|
|
|
|
|
_M_dyn_exts + _S_dynamic_index(__end)};
|
2025-06-04 16:58:47 +02:00
|
|
|
}
|
|
|
|
|
|
2025-04-29 14:46:09 +02:00
|
|
|
private:
|
2025-09-10 12:10:29 +02:00
|
|
|
using _Storage = __array_traits<_IndexType, _S_rank_dynamic>::_Type;
|
|
|
|
|
[[no_unique_address]] _Storage _M_dyn_exts{};
|
2025-04-29 14:46:09 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Satisfied when _OIndexType is convertible to _SIndexType and
// _SIndexType is nothrow-constructible from _OIndexType.
template<typename _OIndexType, typename _SIndexType>
|
|
|
|
|
concept __valid_index_type =
|
|
|
|
|
is_convertible_v<_OIndexType, _SIndexType> &&
|
|
|
|
|
is_nothrow_constructible_v<_SIndexType, _OIndexType>;
|
|
|
|
|
|
|
|
|
|
// Satisfied when _Extent is dynamic_extent, or when _Extent does not
// exceed __gnu_cxx::__int_traits<_IndexType>::__max.
// NOTE(review): __max is presumably the largest value of _IndexType; its
// definition is outside this view -- confirm.
template<size_t _Extent, typename _IndexType>
|
|
|
|
|
concept
|
|
|
|
|
__valid_static_extent = _Extent == dynamic_extent
|
2025-08-03 22:57:30 +02:00
|
|
|
|| _Extent <= __gnu_cxx::__int_traits<_IndexType>::__max;
|
2025-04-29 14:46:09 +02:00
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
// Returns, by const reference, the array produced by
// _Extents::_Storage::_S_static_extents(). Its length is _Extents::rank().
template<typename _Extents>
|
2025-08-03 22:57:23 +02:00
|
|
|
constexpr const array<size_t, _Extents::rank()>&
|
|
|
|
|
__static_extents() noexcept
|
2025-09-10 12:10:29 +02:00
|
|
|
{ return _Extents::_Storage::_S_static_extents(); }
|
2025-06-04 16:58:47 +02:00
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
// Returns a span over the static extents of _Extents with indices in
// [__begin, __end): it starts at data() + __begin and holds
// __end - __begin elements. No bounds checking is done here.
template<typename _Extents>
|
|
|
|
|
constexpr span<const size_t>
|
|
|
|
|
__static_extents(size_t __begin, size_t __end) noexcept
|
|
|
|
|
{
|
|
|
|
|
const auto& __sta_exts = __static_extents<_Extents>();
|
|
|
|
|
return span<const size_t>(__sta_exts.data() + __begin, __end - __begin);
|
|
|
|
|
}
|
|
|
|
|
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] the k-th element of the (possibly
empty) array of dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for generic,
(artificially) high-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Compute \prod_{i = k}^d D[i] in a loop, where d is the number of
dynamic extents.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on the immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
// Pre-compute: \prod_{i = 0}^{r-1} _Extents[i] for r = 0, ..., n-1, skipping dynamic extents
|
|
|
|
|
template<array _Extents>
|
|
|
|
|
constexpr auto __fwd_partial_prods = [] consteval
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t __rank = _Extents.size();
|
2025-08-11 22:14:54 +02:00
|
|
|
std::array<size_t, __rank> __ret;
|
2025-08-11 22:14:55 +02:00
|
|
|
size_t __prod = 1;
|
|
|
|
|
for (size_t __r = 0; __r < __rank; ++__r)
|
|
|
|
|
{
|
|
|
|
|
__ret[__r] = __prod;
|
|
|
|
|
if (size_t __ext = _Extents[__r]; __ext != dynamic_extent)
|
|
|
|
|
__prod *= __ext;
|
|
|
|
|
}
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
return __ret;
|
|
|
|
|
}();
|
|
|
|
|
|
|
|
|
|
// Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i]
|
|
|
|
|
template<array _Extents>
|
|
|
|
|
constexpr auto __rev_partial_prods = [] consteval
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t __rank = _Extents.size();
|
|
|
|
|
std::array<size_t, __rank> __ret;
|
2025-08-11 22:14:55 +02:00
|
|
|
size_t __prod = 1;
|
|
|
|
|
for (size_t __r = __rank; __r > 0; --__r)
|
|
|
|
|
{
|
|
|
|
|
__ret[__r - 1] = __prod;
|
|
|
|
|
if (size_t __ext = _Extents[__r - 1]; __ext != dynamic_extent)
|
|
|
|
|
__prod *= __ext;
|
|
|
|
|
}
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
return __ret;
|
|
|
|
|
}();
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
// Returns a span of const index_type obtained from the storage member of
// __exts (its _M_exts) by calling _M_dynamic_extents(__begin, __end).
// __begin defaults to 0 and __end defaults to _Extents::rank().
// NOTE(review): presumably the result covers the dynamic extents among
// dimensions [__begin, __end) -- confirm against _M_dynamic_extents.
template<typename _Extents>
|
|
|
|
|
constexpr span<const typename _Extents::index_type>
|
|
|
|
|
__dynamic_extents(const _Extents& __exts, size_t __begin = 0,
|
|
|
|
|
size_t __end = _Extents::rank()) noexcept
|
2025-07-27 14:40:10 +02:00
|
|
|
{ return __exts._M_exts._M_dynamic_extents(__begin, __end); }
|
2025-06-04 16:58:47 +02:00
|
|
|
}
|
|
|
|
|
|
2025-10-02 11:27:14 +02:00
|
|
|
#if __glibcxx_submdspan
|
2025-10-02 11:27:15 +02:00
|
|
|
// Empty tag type, only available when __glibcxx_submdspan is defined.
// The default constructor is explicit, so an object of this type cannot be
// created by copy-list-initialization from an empty braced list.
struct full_extent_t
|
|
|
|
|
{
|
|
|
|
|
explicit full_extent_t() = default;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
// The tag object of type full_extent_t.
inline constexpr full_extent_t full_extent{};
|
|
|
|
|
|
2025-10-02 11:27:14 +02:00
|
|
|
// Aggregate bundling an offset, an extent and a stride, each with its own
// type. Every one of the three types must either be a standard integer
// type or model __detail::__integral_constant_like.
template<typename _OffsetType, typename _ExtentType, typename _StrideType>
|
|
|
|
|
struct strided_slice {
|
|
|
|
|
// Reject any template argument that is neither a standard integer type
// nor integral-constant-like.
static_assert(__is_standard_integer<_OffsetType>::value
|
|
|
|
|
|| __detail::__integral_constant_like<_OffsetType>);
|
|
|
|
|
static_assert(__is_standard_integer<_ExtentType>::value
|
|
|
|
|
|| __detail::__integral_constant_like<_ExtentType>);
|
|
|
|
|
static_assert(__is_standard_integer<_StrideType>::value
|
|
|
|
|
|| __detail::__integral_constant_like<_StrideType>);
|
|
|
|
|
|
|
|
|
|
|
using offset_type = _OffsetType;
|
|
|
|
|
using extent_type = _ExtentType;
|
|
|
|
|
using stride_type = _StrideType;
|
|
|
|
|
|
|
|
|
|
|
// All three members are value-initialized by default.
// [[no_unique_address]] permits a member of empty class type to occupy
// no storage of its own.
[[no_unique_address]] offset_type offset{};
|
|
|
|
|
[[no_unique_address]] extent_type extent{};
|
|
|
|
|
[[no_unique_address]] stride_type stride{};
|
|
|
|
|
};
|
2025-10-02 11:27:16 +02:00
|
|
|
|
|
|
|
|
// Aggregate pairing a layout mapping with an offset of type size_t.
// NOTE(review): presumably the result type of a submdspan mapping
// computation -- confirm against the callers.
template<typename _Mapping>
|
|
|
|
|
struct submdspan_mapping_result
|
|
|
|
|
{
|
|
|
|
|
// Defaults to a value-initialized _Mapping; may occupy no storage of its
// own when _Mapping is an empty class type.
[[no_unique_address]] _Mapping mapping = _Mapping();
|
|
|
|
|
// Defaults to zero.
size_t offset{};
|
|
|
|
|
};
|
2025-10-02 11:27:14 +02:00
|
|
|
#endif
|
|
|
|
|
|
2025-04-29 14:46:09 +02:00
|
|
|
template<typename _IndexType, size_t... _Extents>
|
|
|
|
|
class extents
|
|
|
|
|
{
|
2025-05-14 21:13:52 +02:00
|
|
|
static_assert(__is_standard_integer<_IndexType>::value,
|
|
|
|
|
"IndexType must be a signed or unsigned integer type");
|
2025-04-29 14:46:09 +02:00
|
|
|
static_assert(
|
|
|
|
|
(__mdspan::__valid_static_extent<_Extents, _IndexType> && ...),
|
2025-05-14 21:13:52 +02:00
|
|
|
"Extents must either be dynamic or representable as IndexType");
|
2025-04-29 14:46:09 +02:00
|
|
|
public:
|
|
|
|
|
using index_type = _IndexType;
|
|
|
|
|
using size_type = make_unsigned_t<index_type>;
|
|
|
|
|
using rank_type = size_t;
|
|
|
|
|
|
|
|
|
|
static constexpr rank_type
|
2025-09-10 12:10:29 +02:00
|
|
|
rank() noexcept { return _Storage::_S_rank; }
|
2025-04-29 14:46:09 +02:00
|
|
|
|
|
|
|
|
static constexpr rank_type
|
2025-09-10 12:10:29 +02:00
|
|
|
rank_dynamic() noexcept { return _Storage::_S_rank_dynamic; }
|
2025-04-29 14:46:09 +02:00
|
|
|
|
|
|
|
|
// Returns the static extent of dimension __r as reported by
// _Storage::_S_static_extent(__r).
// Precondition (checked by __glibcxx_assert): __r < rank().
static constexpr size_t
|
|
|
|
|
static_extent(rank_type __r) noexcept
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(__r < rank());
|
|
|
|
|
// With rank() == 0 no index can satisfy the precondition, so the call
// traps and the storage lookup below is discarded at compile time.
if constexpr (rank() == 0)
|
|
|
|
|
__builtin_trap();
|
|
|
|
|
else
|
2025-09-10 12:10:29 +02:00
|
|
|
return _Storage::_S_static_extent(__r);
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns the extent of dimension __r as reported by the storage member,
// _M_exts._M_extent(__r).
// Precondition (checked by __glibcxx_assert): __r < rank().
constexpr index_type
|
|
|
|
|
extent(rank_type __r) const noexcept
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(__r < rank());
|
|
|
|
|
// With rank() == 0 no index can satisfy the precondition, so the call
// traps and the storage lookup below is discarded at compile time.
if constexpr (rank() == 0)
|
|
|
|
|
__builtin_trap();
|
|
|
|
|
else
|
2025-06-04 16:58:46 +02:00
|
|
|
return _M_exts._M_extent(__r);
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
extents() noexcept = default;
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
static consteval bool
|
|
|
|
|
_S_is_less_dynamic(size_t __ext, size_t __oext)
|
|
|
|
|
{ return (__ext != dynamic_extent) && (__oext == dynamic_extent); }
|
|
|
|
|
|
|
|
|
|
// Compile-time predicate used in the explicit(...) specifier of the
// converting constructor from extents<_OIndexType, _OExtents...>.
// It is true when either
//  - for some dimension, this type has a static extent where the source
//    type has dynamic_extent (see _S_is_less_dynamic), or
//  - the maximum value of index_type is smaller than the maximum value of
//    _OIndexType.
template<typename _OIndexType, size_t... _OExtents>
|
|
|
|
|
static consteval bool
|
|
|
|
|
_S_ctor_explicit()
|
|
|
|
|
{
|
|
|
|
|
// Fold over the pairs (_Extents, _OExtents), dimension by dimension.
return (_S_is_less_dynamic(_Extents, _OExtents) || ...)
|
2025-08-03 22:57:30 +02:00
|
|
|
|| (__gnu_cxx::__int_traits<index_type>::__max
|
|
|
|
|
< __gnu_cxx::__int_traits<_OIndexType>::__max);
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Compile-time predicate: true when the static extents _OExtents... are
// compatible with this type's _Extents..., i.e. both packs have the same
// length and, in every dimension, at least one of the two extents is
// dynamic_extent or both are equal.
template<size_t... _OExtents>
|
|
|
|
|
static consteval bool
|
|
|
|
|
_S_is_compatible_extents()
|
|
|
|
|
{
|
|
|
|
|
// A rank mismatch is decided first; the fold below is then discarded, so
// the two packs are never expanded together with different lengths.
if constexpr (sizeof...(_OExtents) != rank())
|
|
|
|
|
return false;
|
|
|
|
|
else
|
|
|
|
|
return ((_OExtents == dynamic_extent || _Extents == dynamic_extent
|
|
|
|
|
|| _OExtents == _Extents) && ...);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
template<typename _OIndexType, size_t... _OExtents>
|
|
|
|
|
requires (_S_is_compatible_extents<_OExtents...>())
|
|
|
|
|
constexpr explicit(_S_ctor_explicit<_OIndexType, _OExtents...>())
|
|
|
|
|
extents(const extents<_OIndexType, _OExtents...>& __other) noexcept
|
2025-06-04 16:58:46 +02:00
|
|
|
: _M_exts(__other._M_exts)
|
2025-04-29 14:46:09 +02:00
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type>... _OIndexTypes>
|
|
|
|
|
requires (sizeof...(_OIndexTypes) == rank()
|
|
|
|
|
|| sizeof...(_OIndexTypes) == rank_dynamic())
|
|
|
|
|
constexpr explicit extents(_OIndexTypes... __exts) noexcept
|
2025-06-04 16:58:46 +02:00
|
|
|
: _M_exts(span<const _IndexType, sizeof...(_OIndexTypes)>(
|
2025-07-16 15:45:45 +02:00
|
|
|
initializer_list{static_cast<_IndexType>(std::move(__exts))...}))
|
2025-04-29 14:46:09 +02:00
|
|
|
{ }
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType, size_t _Nm>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
|
|
|
|
&& (_Nm == rank() || _Nm == rank_dynamic())
|
2025-04-29 14:46:09 +02:00
|
|
|
constexpr explicit(_Nm != rank_dynamic())
|
|
|
|
|
extents(span<_OIndexType, _Nm> __exts) noexcept
|
2025-06-04 16:58:46 +02:00
|
|
|
: _M_exts(span<const _OIndexType, _Nm>(__exts))
|
2025-04-29 14:46:09 +02:00
|
|
|
{ }
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType, size_t _Nm>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
|
|
|
|
&& (_Nm == rank() || _Nm == rank_dynamic())
|
2025-04-29 14:46:09 +02:00
|
|
|
constexpr explicit(_Nm != rank_dynamic())
|
|
|
|
|
extents(const array<_OIndexType, _Nm>& __exts) noexcept
|
2025-06-04 16:58:46 +02:00
|
|
|
: _M_exts(span<const _OIndexType, _Nm>(__exts))
|
2025-04-29 14:46:09 +02:00
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OIndexType, size_t... _OExtents>
|
|
|
|
|
friend constexpr bool
|
|
|
|
|
operator==(const extents& __self,
|
|
|
|
|
const extents<_OIndexType, _OExtents...>& __other) noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (!_S_is_compatible_extents<_OExtents...>())
|
|
|
|
|
return false;
|
|
|
|
|
else
|
|
|
|
|
{
|
libstdc++: Improve extents::operator==.
An interesting case to consider is:
bool same11(const std::extents<int, dyn, 2, 3>& e1,
const std::extents<int, dyn, dyn, 3>& e2)
{ return e1 == e2; }
Which has the following properties:
- There are no mismatching static extents, preventing any
short-circuiting.
- There's a comparison between dynamic and static extents.
- There's one trivial comparison: ... && 3 == 3.
Let E[i] denote the array of static extents, D[k] denote the array of
dynamic extents and k[i] be the index of the i-th extent in D.
(Naturally, k[i] is only meaningful if i is a dynamic extent).
The previous implementation results in assembly that's more or less a
literal translation of:
for (i = 0; i < 3; ++i)
e1 = E1[i] == -1 ? D1[k1[i]] : E1[i];
e2 = E2[i] == -1 ? D2[k2[i]] : E2[i];
if e1 != e2:
return false
return true;
While the proposed method results in assembly for
if(D1[0] != D2[0]) return false;
return 2 == D2[1];
i.e.
110: 8b 17 mov edx,DWORD PTR [rdi]
112: 31 c0 xor eax,eax
114: 39 16 cmp DWORD PTR [rsi],edx
116: 74 08 je 120 <same11+0x10>
118: c3 ret
119: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
120: 83 7e 04 02 cmp DWORD PTR [rsi+0x4],0x2
124: 0f 94 c0 sete al
127: c3 ret
It has the following nice properties:
- It eliminated the indirection D[k[i]], because k[i] is known at
compile time. Saving us a comparison E[i] == -1 and conditionally
loading k[i].
- It eliminated the trivial condition 3 == 3.
The result is code that only loads the required values and performs
exactly the number of comparisons needed by the algorithm. It also
results in smaller object files. Therefore, this seems like a sensible
change. We've checked several other examples, including fully statically
determined cases and high-rank examples. The example given above
illustrates the other cases well.
The constexpr condition:
if constexpr (!_S_is_compatible_extents<...>)
return false;
is no longer needed, because the optimizer correctly handles this case.
However, it's retained for clarity/certainty.
libstdc++-v3/ChangeLog:
* include/std/mdspan (extents::operator==): Replace loop with
pack expansion.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:29 +02:00
|
|
|
auto __impl = [&__self, &__other]<size_t... _Counts>(
|
|
|
|
|
index_sequence<_Counts...>)
|
|
|
|
|
{ return (cmp_equal(__self.extent(_Counts),
|
|
|
|
|
__other.extent(_Counts)) && ...); };
|
|
|
|
|
return __impl(make_index_sequence<__self.rank()>());
|
2025-04-29 14:46:09 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
2025-08-03 22:57:23 +02:00
|
|
|
friend const array<size_t, rank()>&
|
|
|
|
|
__mdspan::__static_extents<extents>();
|
2025-06-04 16:58:47 +02:00
|
|
|
|
|
|
|
|
friend span<const index_type>
|
|
|
|
|
__mdspan::__dynamic_extents<extents>(const extents&, size_t, size_t);
|
|
|
|
|
|
2025-09-10 12:10:29 +02:00
|
|
|
using _Storage = __mdspan::_ExtentsStorage<
|
2025-04-29 14:46:09 +02:00
|
|
|
_IndexType, array<size_t, sizeof...(_Extents)>{_Extents...}>;
|
2025-09-10 12:10:29 +02:00
|
|
|
[[no_unique_address]] _Storage _M_exts;
|
2025-04-29 14:46:09 +02:00
|
|
|
|
|
|
|
|
template<typename _OIndexType, size_t... _OExtents>
|
|
|
|
|
friend class extents;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
2025-06-04 16:58:47 +02:00
|
|
|
// Returns true if any element of the span __exts compares equal to 0,
// and false otherwise (including for an empty span).
template<typename _Tp, size_t _Nm>
|
|
|
|
|
constexpr bool
|
|
|
|
|
__contains_zero(span<_Tp, _Nm> __exts) noexcept
|
|
|
|
|
{
|
|
|
|
|
// Linear scan that stops at the first zero.
for (size_t __i = 0; __i < __exts.size(); ++__i)
|
|
|
|
|
if (__exts[__i] == 0)
|
|
|
|
|
return true;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-03 22:57:23 +02:00
|
|
|
// Compile-time-only (consteval) overload for arrays: views the array as a
// span<const _Tp> and forwards to the span overload of __contains_zero.
template<typename _Tp, size_t _Nm>
|
|
|
|
|
consteval bool
|
|
|
|
|
__contains_zero(const array<_Tp, _Nm>& __exts) noexcept
|
|
|
|
|
{ return __contains_zero(span<const _Tp>(__exts)); }
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
// Returns true if __exts has at least one extent equal to zero.
// The decision is made in three steps:
//  1. if the array returned by __static_extents<_Extents>() contains a
//     zero, the answer is true at compile time;
//  2. otherwise, if there are dynamic extents, they are scanned at run time;
//  3. otherwise the answer is false at compile time.
template<typename _Extents>
|
|
|
|
|
constexpr bool
|
|
|
|
|
__empty(const _Extents& __exts) noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (__contains_zero(__static_extents<_Extents>()))
|
|
|
|
|
return true;
|
|
|
|
|
else if constexpr (_Extents::rank_dynamic() > 0)
|
|
|
|
|
return __contains_zero(__dynamic_extents(__exts));
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents>
|
|
|
|
|
constexpr typename _Extents::index_type
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
__extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin,
|
|
|
|
|
size_t __end) noexcept
|
2025-06-04 16:58:47 +02:00
|
|
|
{
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
if (__sta_prod == 0)
|
|
|
|
|
return 0;
|
2025-06-04 16:58:47 +02:00
|
|
|
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
size_t __ret = __sta_prod;
|
2025-06-04 16:58:47 +02:00
|
|
|
if constexpr (_Extents::rank_dynamic() > 0)
|
|
|
|
|
for (auto __factor : __dynamic_extents(__exts, __begin, __end))
|
|
|
|
|
__ret *= size_t(__factor);
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
return static_cast<typename _Extents::index_type>(__ret);
|
2025-06-04 16:58:47 +02:00
|
|
|
}
|
|
|
|
|
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there's no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
// Preconditions (for the __fwd_prod(__exts, __r) overload): __r < _Extents::rank()
|
2025-09-29 08:00:18 +02:00
|
|
|
template<typename _Extents>
|
|
|
|
|
constexpr typename _Extents::index_type
|
|
|
|
|
__fwd_prod(const _Extents& __exts, size_t __begin, size_t __end) noexcept
|
|
|
|
|
{
|
|
|
|
|
size_t __sta_prod = [__begin, __end] {
|
|
|
|
|
span<const size_t> __sta_exts = __static_extents<_Extents>(__begin, __end);
|
|
|
|
|
size_t __ret = 1;
|
|
|
|
|
for(auto __ext : __sta_exts)
|
|
|
|
|
if (__ext != dynamic_extent)
|
|
|
|
|
__ret *= __ext;
|
|
|
|
|
return __ret;
|
|
|
|
|
}();
|
|
|
|
|
return __extents_prod(__exts, __sta_prod, __begin, __end);
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
template<typename _Extents>
|
|
|
|
|
constexpr typename _Extents::index_type
|
|
|
|
|
__fwd_prod(const _Extents& __exts, size_t __r) noexcept
|
libstdc++: Precompute products of static extents.
Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
{
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
constexpr size_t __rank = _Extents::rank();
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
constexpr auto& __sta_exts = __static_extents<_Extents>();
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
if constexpr (__rank == 1)
|
|
|
|
|
return 1;
|
|
|
|
|
else if constexpr (__rank == 2)
|
|
|
|
|
return __r == 0 ? 1 : __exts.extent(0);
|
libstdc++: Improve nearly fully dynamic extents in mdspan.
One previous commit optimized fully dynamic extents; and another
refactored __size such that __fwd_prod is valid for __r = 0, ..., rank
(exclusive).
Therefore, by noticing that __rev_prod (and __fwd_prod) never accesses
the first (or last) extent, one can avoid pre-computing partial products
of static extents in those cases, if all other extents are dynamic.
We check that the size of the reference object file decreases further
and the .rodata sections for
__fwd_prod<dyn, ..., dyn, 11>
__rev_prod<3, dyn, ..., dyn>
are absent.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Relax condition for fully-dynamic
extents to cover (dyn, ..., dyn, X).
(__mdspan::__rev_prod): Analogous for (X, dyn, ..., dyn).
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:27 +02:00
|
|
|
else if constexpr (__all_dynamic(std::span(__sta_exts).first(__rank-1)))
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
we don't see the NTTP _Extents, or anything (anymore) related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
return __extents_prod(__exts, 1, 0, __r);
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
|
|
|
|
|
return __extents_prod(__exts, __sta_prod, 0, __r);
|
|
|
|
|
}
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
}
|
2025-06-04 16:58:47 +02:00
|
|
|
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
// Preconditions: __r < _Extents::rank()
|
2025-06-04 16:58:47 +02:00
|
|
|
template<typename _Extents>
|
|
|
|
|
constexpr typename _Extents::index_type
|
|
|
|
|
__rev_prod(const _Extents& __exts, size_t __r) noexcept
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
On the immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
{
|
|
|
|
|
constexpr size_t __rank = _Extents::rank();
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
// Static extents of _Extents; not read by the rank 1 and rank 2 branches.
constexpr auto& __sta_exts = __static_extents<_Extents>();
|
|
|
|
|
// Rank 1: the result is always 1.
if constexpr (__rank == 1)
|
|
|
|
|
return 1;
|
|
|
|
|
// Rank 2: the result is extent 1 when __r == 0, otherwise 1.
else if constexpr (__rank == 2)
|
|
|
|
|
return __r == 0 ? __exts.extent(1) : 1;
|
libstdc++: Improve nearly fully dynamic extents in mdspan.
One previous commit optimized fully dynamic extents; and another
refactored __size such that __fwd_prod is valid for __r = 0, ..., rank
(exclusive).
Therefore, by noticing that __rev_prod (and __fwd_prod) never accesses
the first (or last) extent, one can avoid pre-computing partial products
of static extents in those cases, if all other extents are dynamic.
We check that the size of the reference object file decreases further
and the .rodata sections for
__fwd_prod<dyn, ..., dyn, 11>
__rev_prod<3, dyn, ..., dyn>
are absent.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__fwd_prod): Relax condition for fully-dynamic
extents to cover (dyn, ..., dyn, X).
(__rev_prod): Analogous for (X, dyn, ..., dyn).
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:27 +02:00
|
|
|
else if constexpr (__all_dynamic(std::span(__sta_exts).last(__rank-1)))
|
libstdc++: Improve fully dynamic extents in mdspan.
In mdspan related code, for extents with no static extents, i.e. only
dynamic extents, the following simplifications can be made:
- The array of dynamic extents has size rank.
- The two arrays dynamic-index and dynamic-index-inv become
trivial, e.g. k[i] == i.
- All elements of the arrays __{fwd,rev}_partial_prods are 1.
This commit eliminates the arrays for dynamic-index, dynamic-index-inv
and __{fwd,rev}_partial_prods. It also removes the indirection k[i] == i
from the source code, which isn't as relevant because the optimizer is
(often) capable of eliminating the indirection.
To check if it's working we look at:
using E2 = std::extents<int, dyn, dyn, dyn, dyn>;
int stride_left_E2(const std::layout_left::mapping<E2>& m, size_t r)
{ return m.stride(r); }
which generates the following
0000000000000190 <stride_left_E2>:
190: 48 c1 e6 02 shl rsi,0x2
194: 74 22 je 1b8 <stride_left_E2+0x28>
196: 48 01 fe add rsi,rdi
199: b8 01 00 00 00 mov eax,0x1
19e: 66 90 xchg ax,ax
1a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
1a3: 48 83 c7 04 add rdi,0x4
1a7: 48 0f af c2 imul rax,rdx
1ab: 48 39 fe cmp rsi,rdi
1ae: 75 f0 jne 1a0 <stride_left_E2+0x10>
1b0: c3 ret
1b1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1b8: b8 01 00 00 00 mov eax,0x1
1bd: c3 ret
We see that:
- There's no code to load the partial product of static extents.
- There's no indirection D[k[i]], it's just D[i] (as before).
On a test file which computes both mapping::stride(r) and
mapping::required_span_size, we check for static storage with
objdump -h
and no longer see the NTTP _Extents or anything related to
_StaticExtents, __fwd_partial_prods or __rev_partial_prods. We also
check that the size of the reference object file (described three
commits prior) reduced by a few percent from 41.9kB to 39.4kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_dynamic): New function.
(__mdspan::_StaticExtents::_S_dynamic_index): Convert to method.
(__mdspan::_StaticExtents::_S_dynamic_index_inv): Ditto.
(__mdspan::_StaticExtents): New specialization for fully dynamic
extents.
(__mdspan::__fwd_prod): New constexpr if branch to avoid
instantiating __fwd_partial_prods.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:26 +02:00
|
|
|
return __extents_prod(__exts, 1, __r + 1, __rank);
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
|
|
|
|
|
return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
|
|
|
|
|
}
|
libstdc++: Precompute products of static extents.
Let E denote a multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
On the immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__rev_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of the pre-computed static product and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:24 +02:00
|
|
|
}
|
2025-06-04 16:58:47 +02:00
|
|
|
|
2025-07-04 10:29:46 +02:00
|
|
|
template<typename _Extents>
|
|
|
|
|
constexpr typename _Extents::index_type
|
|
|
|
|
__size(const _Extents& __exts) noexcept
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
{
|
2025-08-11 22:14:55 +02:00
|
|
|
// Product of all static (non-dynamic) extents, evaluated at compile time
// by an immediately invoked lambda.
constexpr size_t __sta_prod = [] {
|
|
|
|
|
span<const size_t> __sta_exts = __static_extents<_Extents>();
|
|
|
|
|
size_t __ret = 1;
|
|
|
|
|
for(auto __ext : __sta_exts)
|
|
|
|
|
// Entries equal to dynamic_extent are skipped rather than multiplied in.
if (__ext != dynamic_extent)
|
|
|
|
|
__ret *= __ext;
|
|
|
|
|
return __ret;
|
|
|
|
|
}();
|
|
|
|
|
// Hand the static product and the bounds 0 and rank() to __extents_prod.
// NOTE(review): presumably it multiplies in the dynamic extents -- confirm
// against the definition of __extents_prod.
return __extents_prod(__exts, __sta_prod, 0, _Extents::rank());
|
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined
as
\prod_{i = 0}^r E[i]
\prod_{i = r+1}^n E[i]
This is computed as the product of a precomputed static product and the
product of the required dynamic extents.
Disassembly shows that even for low-rank extents, i.e. rank == 1 and
rank == 2, with at least one dynamic extent, the generated code loads
two values; and then runs the loop over at most one element, e.g. for
stride_left_d5 defined below the generated code is:
220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
227: 00
228: 31 d2 xor edx,edx
22a: 48 85 c0 test rax,rax
22d: 74 23 je 252 <stride_left_d5+0x32>
22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
236: 00
237: 48 c1 e1 02 shl rcx,0x2
23b: 74 13 je 250 <stride_left_d5+0x30>
23d: 48 01 f9 add rcx,rdi
240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
243: 48 83 c7 04 add rdi,0x4
247: 48 0f af c2 imul rax,rdx
24b: 48 39 f9 cmp rcx,rdi
24e: 75 f0 jne 240 <stride_left_d5+0x20>
250: 89 c2 mov edx,eax
252: 89 d0 mov eax,edx
254: c3 ret
If there are no dynamic extents, it simply loads the precomputed product
of static extents.
For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
extents.extent(k), with k == 0 for layout_left and k == 1 for
layout_right.
Consider,
using Ed = std::extents<int, dyn>;
int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
{ return m.stride(r); }
using E3d = std::extents<int, 3, dyn>;
int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
{ return m.stride(r); }
using Ed5 = std::extents<int, dyn, 5>;
int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
{ return m.stride(r); }
The optimized code for these three cases is:
0000000000000060 <stride_left_d>:
60: b8 01 00 00 00 mov eax,0x1
65: c3 ret
0000000000000090 <stride_left_3d>:
90: 48 83 fe 01 cmp rsi,0x1
94: 19 c0 sbb eax,eax
96: 83 e0 fe and eax,0xfffffffe
99: 83 c0 03 add eax,0x3
9c: c3 ret
00000000000000a0 <stride_left_d5>:
a0: b8 01 00 00 00 mov eax,0x1
a5: 48 85 f6 test rsi,rsi
a8: 74 02 je ac <stride_left_d5+0xc>
aa: 8b 07 mov eax,DWORD PTR [rdi]
ac: c3 ret
For rank == 1 it simply returns 1 (as expected). For rank == 2, it
either implements a branchless formula, or conditionally loads one
value. In all cases involving a dynamic extent this seems like it's
always doing clearly less work, both in terms of computation and loads.
In cases not involving a dynamic extent, it replaces loading one value
with a branchless sequence of four instructions.
This commit also refactors __size to not use any of the precomputed
arrays. This prevents instantiating __{fwd,rev}_partial_prods for
low-rank extents. This results in a further size reduction of a
reference object file (described two commits prior) by 9% from 46.0kB to
41.9kB.
In a prior commit we optimized __size to produce better object code by
precomputing the static products. This refactor enables the optimizer to
generate the same optimized code.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Optimize
for rank <= 2.
(__mdspan::__rev_prod): Ditto.
(__mdspan::__size): Refactor to use a pre-computed product, not
a partial product.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-08-03 22:57:25 +02:00
|
|
|
}
|
2025-07-04 10:29:46 +02:00
|
|
|
|
2025-04-29 14:46:09 +02:00
|
|
|
template<typename _IndexType, size_t... _Counts>
|
|
|
|
|
auto __build_dextents_type(integer_sequence<size_t, _Counts...>)
|
|
|
|
|
-> extents<_IndexType, ((void) _Counts, dynamic_extent)...>;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _IndexType, size_t _Rank>
|
|
|
|
|
using dextents = decltype(__mdspan::__build_dextents_type<_IndexType>(
|
|
|
|
|
make_index_sequence<_Rank>()));
|
|
|
|
|
|
2025-07-21 17:50:31 +02:00
|
|
|
#if __glibcxx_mdspan >= 202406L
// Same type as dextents, but the rank comes first and the index type
// defaults to size_t.
template<size_t _Rank, typename _IndexType = size_t>
  using dims = dextents<_IndexType, _Rank>;
#endif
|
|
|
|
|
|
2025-04-29 14:46:09 +02:00
|
|
|
template<typename... _Integrals>
|
|
|
|
|
requires (is_convertible_v<_Integrals, size_t> && ...)
|
|
|
|
|
explicit extents(_Integrals...) ->
|
libstdc++: Better CTAD for span and mdspan [PR120914].
This implements P3029R1. In P3029R1, the CTAD for span is refined to
permit deducing the extent of the span from an integral constant, e.g.
span((T*) ptr, integral_constant<size_t, 5>{});
is deduced as span<T, 5>. Similarly, in
auto exts = extents(integral_constant<int, 2>{});
auto md = mdspan((T*) ptr, integral_constant<int, 2>{});
exts and md have types extents<size_t, 2> and mdspan<T,
extents<size_t, 2>>, respectively.
PR libstdc++/120914
libstdc++-v3/ChangeLog:
* include/std/span (span): Update CTAD to enable
integral constants [P3029R1].
* include/std/mdspan (extents): ditto.
(mdspan): ditto.
* testsuite/23_containers/span/deduction.cc: Test deduction
guide.
* testsuite/23_containers/mdspan/extents/misc.cc: ditto.
* testsuite/23_containers/mdspan/mdspan.cc: ditto.
Reviewed-by: Jonathan Wakely <jwakely@redhat.com>
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-07-08 11:49:21 +02:00
|
|
|
extents<size_t, __detail::__maybe_static_ext<_Integrals>...>;
|
2025-04-29 14:46:08 +02:00
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
// Layout tag type.  The nested mapping template is only declared here
// and is defined out of class.
struct layout_left
{
  template<typename _Extents>
    class mapping;
};
|
|
|
|
|
|
2025-06-04 16:58:49 +02:00
|
|
|
// Layout tag type.  The nested mapping template is only declared here
// and is defined out of class.
struct layout_right
{
  template<typename _Extents>
    class mapping;
};
|
|
|
|
|
|
2025-06-04 16:58:51 +02:00
|
|
|
// Layout tag type.  The nested mapping template is only declared here
// and is defined out of class.
struct layout_stride
{
  template<typename _Extents>
    class mapping;
};
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
#ifdef __glibcxx_padded_layouts
|
|
|
|
|
// Layout tag type parameterized on a padding value.  Only the nested
// mapping template is declared here.
template<size_t _PaddingValue>
  struct layout_left_padded
  {
    template<typename _Extents>
      class mapping;
  };
|
|
|
|
|
|
|
|
|
|
// Layout tag type parameterized on a padding value.  Only the nested
// mapping template is declared here.
template<size_t _PaddingValue>
  struct layout_right_padded
  {
    template<typename _Extents>
      class mapping;
  };
|
|
|
|
|
#endif
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
|
|
|
|
template<typename _Tp>
|
|
|
|
|
constexpr bool __is_extents = false;
|
|
|
|
|
|
|
|
|
|
template<typename _IndexType, size_t... _Extents>
|
|
|
|
|
constexpr bool __is_extents<extents<_IndexType, _Extents...>> = true;
|
|
|
|
|
|
|
|
|
|
// Folds the indices from first to last into a single offset.  Each
// index is multiplied by the product of the extents of all preceding
// dimensions.  An empty pack yields 0.
template<typename _Extents, typename... _Indices>
  constexpr typename _Extents::index_type
  __linear_index_left(const _Extents& __exts, _Indices... __indices)
  noexcept
  {
    using _IndexType = typename _Extents::index_type;
    _IndexType __lin = 0;
    if constexpr (sizeof...(__indices) > 0)
      {
	// Running product of the extents already visited.
	_IndexType __stride = 1;
	// Dimension the next index belongs to.
	unsigned __dim = 0;
	auto __accumulate = [&](_IndexType __idx)
	  {
	    _GLIBCXX_DEBUG_ASSERT(cmp_less(__idx, __exts.extent(__dim)));
	    __lin += __idx * __stride;
	    __stride *= __exts.extent(__dim);
	    ++__dim;
	  };
	(__accumulate(__indices), ...);
      }
    return __lin;
  }
|
|
|
|
|
|
2025-09-23 15:10:04 +02:00
|
|
|
template<typename _IndexType>
|
2025-06-04 16:58:47 +02:00
|
|
|
consteval _IndexType
|
2025-09-23 15:10:04 +02:00
|
|
|
__static_quotient(std::span<const size_t> __sta_exts,
|
|
|
|
|
_IndexType __nom = __gnu_cxx::__int_traits<_IndexType>::__max)
|
2025-06-04 16:58:47 +02:00
|
|
|
{
|
|
|
|
|
for (auto __factor : __sta_exts)
|
|
|
|
|
{
|
|
|
|
|
if (__factor != dynamic_extent)
|
|
|
|
|
__nom /= _IndexType(__factor);
|
|
|
|
|
if (__nom == 0)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
return __nom;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-23 15:10:04 +02:00
|
|
|
template<typename _Extents,
|
|
|
|
|
typename _IndexType = typename _Extents::index_type>
|
|
|
|
|
requires __is_extents<_Extents>
|
|
|
|
|
consteval _IndexType
|
|
|
|
|
__static_quotient(_IndexType __nom
|
|
|
|
|
= __gnu_cxx::__int_traits<_IndexType>::__max)
|
|
|
|
|
{
|
|
|
|
|
std::span<const size_t> __sta_exts = __static_extents<_Extents>();
|
|
|
|
|
return __static_quotient<_IndexType>(__sta_exts, __nom);
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
template<typename _Extents>
|
|
|
|
|
constexpr bool
|
|
|
|
|
__is_representable_extents(const _Extents& __exts) noexcept
|
|
|
|
|
{
|
|
|
|
|
using _IndexType = _Extents::index_type;
|
|
|
|
|
|
|
|
|
|
if constexpr (__contains_zero(__static_extents<_Extents>()))
|
|
|
|
|
return true;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
constexpr auto __sta_quo = __static_quotient<_Extents>();
|
|
|
|
|
if constexpr (_Extents::rank_dynamic() == 0)
|
|
|
|
|
return __sta_quo != 0;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
auto __dyn_exts = __dynamic_extents(__exts);
|
|
|
|
|
if (__contains_zero(__dyn_exts))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if constexpr (__sta_quo == 0)
|
|
|
|
|
return false;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
auto __dyn_quo = _IndexType(__sta_quo);
|
|
|
|
|
for (auto __factor : __dyn_exts)
|
|
|
|
|
{
|
|
|
|
|
__dyn_quo /= __factor;
|
|
|
|
|
if (__dyn_quo == 0)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents, typename _IndexType>
|
|
|
|
|
concept __representable_size = _Extents::rank_dynamic() != 0
|
|
|
|
|
|| __contains_zero(__static_extents<_Extents>())
|
|
|
|
|
|| (__static_quotient<_Extents, _IndexType>() != 0);
|
|
|
|
|
|
|
|
|
|
template<typename _Layout, typename _Mapping>
|
|
|
|
|
concept __mapping_of =
|
2025-07-07 11:32:48 +01:00
|
|
|
is_same_v<typename _Layout::template mapping<typename _Mapping::extents_type>,
|
2025-06-04 16:58:47 +02:00
|
|
|
_Mapping>;
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
template<template<size_t> typename _Layout, typename _Mapping>
|
|
|
|
|
concept __padded_mapping_of = __mapping_of<
|
|
|
|
|
_Layout<_Mapping::padding_value>, _Mapping>;
|
|
|
|
|
|
|
|
|
|
#ifdef __glibcxx_padded_layouts
// True if _Mapping is a mapping of some layout_left_padded.
template<typename _Mapping>
  constexpr bool __is_left_padded_mapping
    = __padded_mapping_of<layout_left_padded, _Mapping>;

// True if _Mapping is a mapping of some layout_right_padded.
template<typename _Mapping>
  constexpr bool __is_right_padded_mapping
    = __padded_mapping_of<layout_right_padded, _Mapping>;
#endif
|
|
|
|
|
|
|
|
|
|
template<typename _PaddedMapping>
|
|
|
|
|
consteval size_t
|
|
|
|
|
__get_static_stride()
|
|
|
|
|
{ return _PaddedMapping::_PaddedStorage::_S_static_stride; }
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
template<typename _Mapping>
|
2025-06-04 16:58:49 +02:00
|
|
|
concept __standardized_mapping = __mapping_of<layout_left, _Mapping>
|
2025-06-04 16:58:51 +02:00
|
|
|
|| __mapping_of<layout_right, _Mapping>
|
2025-09-29 08:00:18 +02:00
|
|
|
|| __mapping_of<layout_stride, _Mapping>
|
|
|
|
|
#ifdef __glibcxx_padded_layouts
|
|
|
|
|
|| __is_left_padded_mapping<_Mapping>
|
|
|
|
|
|| __is_right_padded_mapping<_Mapping>
|
|
|
|
|
#endif
|
|
|
|
|
;
|
2025-06-04 16:58:47 +02:00
|
|
|
|
|
|
|
|
// Empty tag type.  Passing it as an extra argument selects the
// library-internal constructors of the mapping classes.
class __internal_ctor
{ };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents>
|
|
|
|
|
class layout_left::mapping
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
using extents_type = _Extents;
|
|
|
|
|
using index_type = typename extents_type::index_type;
|
|
|
|
|
using size_type = typename extents_type::size_type;
|
|
|
|
|
using rank_type = typename extents_type::rank_type;
|
|
|
|
|
using layout_type = layout_left;
|
|
|
|
|
|
|
|
|
|
static_assert(__mdspan::__representable_size<extents_type, index_type>,
|
|
|
|
|
"The size of extents_type must be representable as index_type");
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping() noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __extents) noexcept
|
|
|
|
|
: _M_extents(__extents)
|
|
|
|
|
{ __glibcxx_assert(__mdspan::__is_representable_extents(_M_extents)); }
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
|
|
|
|
|
mapping(const mapping<_OExtents>& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{ }
|
|
|
|
|
|
2025-06-04 16:58:49 +02:00
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires (extents_type::rank() <= 1)
|
|
|
|
|
&& is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
|
|
|
|
|
mapping(const layout_right::mapping<_OExtents>& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{ }
|
|
|
|
|
|
2025-06-04 16:58:53 +02:00
|
|
|
// noexcept for consistency with other layouts.
|
2025-06-04 16:58:51 +02:00
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(extents_type::rank() > 0)
|
2025-06-04 16:58:53 +02:00
|
|
|
mapping(const layout_stride::mapping<_OExtents>& __other) noexcept
|
2025-06-04 16:58:51 +02:00
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{ __glibcxx_assert(*this == __other); }
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
#if __glibcxx_padded_layouts
|
|
|
|
|
template<typename _LeftpadMapping>
|
|
|
|
|
requires __mdspan::__is_left_padded_mapping<_LeftpadMapping>
|
|
|
|
|
&& is_constructible_v<extents_type,
|
|
|
|
|
typename _LeftpadMapping::extents_type>
|
|
|
|
|
constexpr
|
|
|
|
|
explicit(!is_convertible_v<typename _LeftpadMapping::extents_type,
|
|
|
|
|
extents_type>)
|
|
|
|
|
mapping(const _LeftpadMapping& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr size_t __ostride_sta
|
|
|
|
|
= __mdspan::__get_static_stride<_LeftpadMapping>();
|
2025-09-29 08:00:18 +02:00
|
|
|
|
|
|
|
|
if constexpr (extents_type::rank() > 1)
|
|
|
|
|
{
|
|
|
|
|
if constexpr (extents_type::static_extent(0) != dynamic_extent
|
|
|
|
|
&& __ostride_sta != dynamic_extent)
|
|
|
|
|
static_assert(extents_type::static_extent(0) == __ostride_sta);
|
|
|
|
|
else
|
|
|
|
|
__glibcxx_assert(__other.stride(1)
|
|
|
|
|
== __other.extents().extent(0));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif // __glibcxx_padded_layouts
|
|
|
|
|
|
2025-06-04 16:58:47 +02:00
|
|
|
constexpr mapping&
|
|
|
|
|
operator=(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr const extents_type&
|
|
|
|
|
extents() const noexcept { return _M_extents; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
required_span_size() const noexcept
|
2025-07-04 10:29:46 +02:00
|
|
|
{ return __mdspan::__size(_M_extents); }
|
2025-06-04 16:58:47 +02:00
|
|
|
|
2025-09-03 17:28:27 +02:00
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4314. Missing move in mdspan layout mapping::operator()
|
2025-06-04 16:58:47 +02:00
|
|
|
template<__mdspan::__valid_index_type<index_type>... _Indices>
|
|
|
|
|
requires (sizeof...(_Indices) == extents_type::rank())
|
|
|
|
|
constexpr index_type
|
|
|
|
|
operator()(_Indices... __indices) const noexcept
|
|
|
|
|
{
|
|
|
|
|
return __mdspan::__linear_index_left(_M_extents,
|
2025-07-16 15:45:45 +02:00
|
|
|
static_cast<index_type>(std::move(__indices))...);
|
2025-06-04 16:58:47 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_unique() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_exhaustive() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_strided() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_unique() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_exhaustive() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_strided() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
stride(rank_type __i) const noexcept
|
|
|
|
|
requires (extents_type::rank() > 0)
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(__i < extents_type::rank());
|
|
|
|
|
return __mdspan::__fwd_prod(_M_extents, __i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires (extents_type::rank() == _OExtents::rank())
|
|
|
|
|
friend constexpr bool
|
|
|
|
|
operator==(const mapping& __self, const mapping<_OExtents>& __other)
|
|
|
|
|
noexcept
|
|
|
|
|
{ return __self.extents() == __other.extents(); }
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
mapping(const _OExtents& __oexts, __mdspan::__internal_ctor) noexcept
|
|
|
|
|
: _M_extents(__oexts)
|
|
|
|
|
{
|
|
|
|
|
static_assert(__mdspan::__representable_size<_OExtents, index_type>,
|
|
|
|
|
"The size of OtherExtents must be representable as index_type");
|
|
|
|
|
__glibcxx_assert(__mdspan::__is_representable_extents(_M_extents));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
[[no_unique_address]] extents_type _M_extents{};
|
|
|
|
|
};
|
|
|
|
|
|
2025-06-04 16:58:49 +02:00
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
|
|
|
|
template<typename _Extents, typename... _Indices>
|
|
|
|
|
constexpr typename _Extents::index_type
|
|
|
|
|
__linear_index_right(const _Extents& __exts, _Indices... __indices)
|
|
|
|
|
noexcept
|
|
|
|
|
{
|
|
|
|
|
using _IndexType = typename _Extents::index_type;
|
|
|
|
|
array<_IndexType, sizeof...(__indices)> __ind_arr{__indices...};
|
|
|
|
|
_IndexType __res = 0;
|
|
|
|
|
if constexpr (sizeof...(__indices) > 0)
|
|
|
|
|
{
|
|
|
|
|
_IndexType __mult = 1;
|
|
|
|
|
auto __update = [&, __pos = __exts.rank()](_IndexType) mutable
|
|
|
|
|
{
|
|
|
|
|
--__pos;
|
2025-07-04 10:29:43 +02:00
|
|
|
_GLIBCXX_DEBUG_ASSERT(cmp_less(__ind_arr[__pos],
|
|
|
|
|
__exts.extent(__pos)));
|
2025-06-04 16:58:49 +02:00
|
|
|
__res += __ind_arr[__pos] * __mult;
|
|
|
|
|
__mult *= __exts.extent(__pos);
|
|
|
|
|
};
|
|
|
|
|
(__update(__indices), ...);
|
|
|
|
|
}
|
|
|
|
|
return __res;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents>
|
|
|
|
|
class layout_right::mapping
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
using extents_type = _Extents;
|
|
|
|
|
using index_type = typename extents_type::index_type;
|
|
|
|
|
using size_type = typename extents_type::size_type;
|
|
|
|
|
using rank_type = typename extents_type::rank_type;
|
|
|
|
|
using layout_type = layout_right;
|
|
|
|
|
|
|
|
|
|
static_assert(__mdspan::__representable_size<extents_type, index_type>,
|
|
|
|
|
"The size of extents_type must be representable as index_type");
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping() noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __extents) noexcept
|
|
|
|
|
: _M_extents(__extents)
|
|
|
|
|
{ __glibcxx_assert(__mdspan::__is_representable_extents(_M_extents)); }
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
|
|
|
|
|
mapping(const mapping<_OExtents>& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{ }
|
|
|
|
|
|
2025-06-05 10:40:10 +02:00
|
|
|
template<typename _OExtents>
|
2025-06-04 16:58:49 +02:00
|
|
|
requires (extents_type::rank() <= 1)
|
|
|
|
|
&& is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
|
|
|
|
|
mapping(const layout_left::mapping<_OExtents>& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{ }
|
|
|
|
|
|
2025-06-05 10:40:10 +02:00
|
|
|
template<typename _OExtents>
|
2025-06-04 16:58:51 +02:00
|
|
|
requires is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(extents_type::rank() > 0)
|
|
|
|
|
mapping(const layout_stride::mapping<_OExtents>& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{ __glibcxx_assert(*this == __other); }
|
|
|
|
|
|
2025-09-29 08:00:19 +02:00
|
|
|
#if __glibcxx_padded_layouts
|
|
|
|
|
template<typename _RightPaddedMapping>
|
|
|
|
|
requires __mdspan::__is_right_padded_mapping<_RightPaddedMapping>
|
|
|
|
|
&& is_constructible_v<extents_type,
|
2025-09-30 12:55:18 +02:00
|
|
|
typename _RightPaddedMapping::extents_type>
|
2025-09-29 08:00:19 +02:00
|
|
|
constexpr
|
|
|
|
|
explicit(!is_convertible_v<typename _RightPaddedMapping::extents_type,
|
|
|
|
|
extents_type>)
|
|
|
|
|
mapping(const _RightPaddedMapping& __other) noexcept
|
|
|
|
|
: mapping(__other.extents(), __mdspan::__internal_ctor{})
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t __rank = extents_type::rank();
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr size_t __ostride_sta
|
|
|
|
|
= __mdspan::__get_static_stride<_RightPaddedMapping>();
|
2025-09-29 08:00:19 +02:00
|
|
|
|
|
|
|
|
if constexpr (__rank > 1)
|
|
|
|
|
{
|
|
|
|
|
if constexpr (extents_type::static_extent(__rank - 1) != dynamic_extent
|
|
|
|
|
&& __ostride_sta != dynamic_extent)
|
|
|
|
|
static_assert(extents_type::static_extent(__rank - 1)
|
|
|
|
|
== __ostride_sta);
|
|
|
|
|
else
|
|
|
|
|
__glibcxx_assert(__other.stride(__rank - 2)
|
|
|
|
|
== __other.extents().extent(__rank - 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
2025-06-04 16:58:49 +02:00
|
|
|
constexpr mapping&
|
|
|
|
|
operator=(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr const extents_type&
|
|
|
|
|
extents() const noexcept { return _M_extents; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
required_span_size() const noexcept
|
2025-07-04 10:29:46 +02:00
|
|
|
{ return __mdspan::__size(_M_extents); }
|
2025-06-04 16:58:49 +02:00
|
|
|
|
2025-09-03 17:28:27 +02:00
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4314. Missing move in mdspan layout mapping::operator()
|
2025-06-04 16:58:49 +02:00
|
|
|
template<__mdspan::__valid_index_type<index_type>... _Indices>
|
|
|
|
|
requires (sizeof...(_Indices) == extents_type::rank())
|
|
|
|
|
constexpr index_type
|
|
|
|
|
operator()(_Indices... __indices) const noexcept
|
|
|
|
|
{
|
|
|
|
|
return __mdspan::__linear_index_right(
|
2025-07-16 15:45:45 +02:00
|
|
|
_M_extents, static_cast<index_type>(std::move(__indices))...);
|
2025-06-04 16:58:49 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_unique() noexcept
|
|
|
|
|
{ return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_exhaustive() noexcept
|
|
|
|
|
{ return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_strided() noexcept
|
|
|
|
|
{ return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_unique() noexcept
|
|
|
|
|
{ return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_exhaustive() noexcept
|
|
|
|
|
{ return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_strided() noexcept
|
|
|
|
|
{ return true; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
stride(rank_type __i) const noexcept
|
|
|
|
|
requires (extents_type::rank() > 0)
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(__i < extents_type::rank());
|
|
|
|
|
return __mdspan::__rev_prod(_M_extents, __i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires (extents_type::rank() == _OExtents::rank())
|
|
|
|
|
friend constexpr bool
|
|
|
|
|
operator==(const mapping& __self, const mapping<_OExtents>& __other)
|
|
|
|
|
noexcept
|
|
|
|
|
{ return __self.extents() == __other.extents(); }
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
mapping(const _OExtents& __oexts, __mdspan::__internal_ctor) noexcept
|
|
|
|
|
: _M_extents(__oexts)
|
|
|
|
|
{
|
|
|
|
|
static_assert(__mdspan::__representable_size<_OExtents, index_type>,
|
|
|
|
|
"The size of OtherExtents must be representable as index_type");
|
|
|
|
|
__glibcxx_assert(__mdspan::__is_representable_extents(_M_extents));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
[[no_unique_address]] extents_type _M_extents{};
|
|
|
|
|
};
|
|
|
|
|
|
2025-06-04 16:58:51 +02:00
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
2025-06-05 10:40:10 +02:00
|
|
|
template<typename _Mp>
|
2025-06-04 16:58:51 +02:00
|
|
|
concept __mapping_alike = requires
|
|
|
|
|
{
|
2025-06-05 10:40:10 +02:00
|
|
|
requires __is_extents<typename _Mp::extents_type>;
|
|
|
|
|
{ _Mp::is_always_strided() } -> same_as<bool>;
|
|
|
|
|
{ _Mp::is_always_exhaustive() } -> same_as<bool>;
|
|
|
|
|
{ _Mp::is_always_unique() } -> same_as<bool>;
|
|
|
|
|
bool_constant<_Mp::is_always_strided()>::value;
|
|
|
|
|
bool_constant<_Mp::is_always_exhaustive()>::value;
|
|
|
|
|
bool_constant<_Mp::is_always_unique()>::value;
|
2025-06-04 16:58:51 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
template<typename _Mapping>
|
|
|
|
|
constexpr typename _Mapping::index_type
|
|
|
|
|
__offset(const _Mapping& __m) noexcept
|
|
|
|
|
{
|
|
|
|
|
using _IndexType = typename _Mapping::index_type;
|
|
|
|
|
constexpr auto __rank = _Mapping::extents_type::rank();
|
|
|
|
|
|
|
|
|
|
if constexpr (__standardized_mapping<_Mapping>)
|
|
|
|
|
return 0;
|
|
|
|
|
else if (__empty(__m.extents()))
|
|
|
|
|
return 0;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
auto __impl = [&__m]<size_t... _Counts>(index_sequence<_Counts...>)
|
|
|
|
|
{ return __m(((void) _Counts, _IndexType(0))...); };
|
|
|
|
|
return __impl(make_index_sequence<__rank>());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns the sum over all dimensions k of __indices[k] * __m.stride(k).
// An empty pack yields 0.
template<typename _Mapping, typename... _Indices>
  constexpr typename _Mapping::index_type
  __linear_index_strides(const _Mapping& __m, _Indices... __indices)
  noexcept
  {
    using _IndexType = typename _Mapping::index_type;
    _IndexType __lin = 0;
    if constexpr (sizeof...(__indices) > 0)
      {
	// Dimension the next index belongs to.
	unsigned __dim = 0;
	auto __accumulate = [&](_IndexType __idx)
	  {
	    _GLIBCXX_DEBUG_ASSERT(cmp_less(__idx,
					   __m.extents().extent(__dim)));
	    __lin += __idx * __m.stride(__dim);
	    ++__dim;
	  };
	(__accumulate(__indices), ...);
      }
    return __lin;
  }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents>
|
|
|
|
|
class layout_stride::mapping
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
using extents_type = _Extents;
|
|
|
|
|
using index_type = typename extents_type::index_type;
|
|
|
|
|
using size_type = typename extents_type::size_type;
|
|
|
|
|
using rank_type = typename extents_type::rank_type;
|
|
|
|
|
using layout_type = layout_stride;
|
|
|
|
|
|
|
|
|
|
static_assert(__mdspan::__representable_size<extents_type, index_type>,
|
|
|
|
|
"The size of extents_type must be representable as index_type");
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping() noexcept
|
|
|
|
|
{
|
|
|
|
|
// The precondition is either statically asserted, or automatically
|
2025-06-05 10:40:10 +02:00
|
|
|
// satisfied because dynamic extents are zero-initialized.
|
2025-06-04 16:58:51 +02:00
|
|
|
size_t __stride = 1;
|
|
|
|
|
for (size_t __i = extents_type::rank(); __i > 0; --__i)
|
|
|
|
|
{
|
|
|
|
|
_M_strides[__i - 1] = index_type(__stride);
|
|
|
|
|
__stride *= size_t(_M_extents.extent(__i - 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const mapping&) noexcept = default;
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
2025-06-04 16:58:51 +02:00
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __exts,
|
|
|
|
|
span<_OIndexType, extents_type::rank()> __strides) noexcept
|
|
|
|
|
: _M_extents(__exts)
|
|
|
|
|
{
|
|
|
|
|
for (size_t __i = 0; __i < extents_type::rank(); ++__i)
|
|
|
|
|
_M_strides[__i] = index_type(as_const(__strides[__i]));
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
2025-06-04 16:58:51 +02:00
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __exts,
|
|
|
|
|
const array<_OIndexType, extents_type::rank()>& __strides)
|
|
|
|
|
noexcept
|
|
|
|
|
: mapping(__exts,
|
|
|
|
|
span<const _OIndexType, extents_type::rank()>(__strides))
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__mapping_alike _StridedMapping>
|
|
|
|
|
requires (is_constructible_v<extents_type,
|
|
|
|
|
typename _StridedMapping::extents_type>
|
|
|
|
|
&& _StridedMapping::is_always_unique()
|
|
|
|
|
&& _StridedMapping::is_always_strided())
|
|
|
|
|
constexpr explicit(!(
|
|
|
|
|
is_convertible_v<typename _StridedMapping::extents_type, extents_type>
|
|
|
|
|
&& __mdspan::__standardized_mapping<_StridedMapping>))
|
|
|
|
|
mapping(const _StridedMapping& __other) noexcept
|
|
|
|
|
: _M_extents(__other.extents())
|
|
|
|
|
{
|
|
|
|
|
using _OIndexType = _StridedMapping::index_type;
|
|
|
|
|
using _OExtents = _StridedMapping::extents_type;
|
|
|
|
|
|
|
|
|
|
__glibcxx_assert(__mdspan::__offset(__other) == 0);
|
|
|
|
|
static_assert(__mdspan::__representable_size<_OExtents, index_type>,
|
|
|
|
|
"The size of StridedMapping::extents_type must be representable as"
|
|
|
|
|
" index_type");
|
2025-08-03 22:57:30 +02:00
|
|
|
if constexpr (cmp_greater(__gnu_cxx::__int_traits<_OIndexType>::__max,
|
|
|
|
|
__gnu_cxx::__int_traits<index_type>::__max))
|
|
|
|
|
__glibcxx_assert(!cmp_less(
|
|
|
|
|
__gnu_cxx::__int_traits<index_type>::__max,
|
|
|
|
|
__other.required_span_size())
|
|
|
|
|
&& "other.required_span_size() must be representable"
|
|
|
|
|
" as index_type");
|
2025-06-04 16:58:51 +02:00
|
|
|
if constexpr (extents_type::rank() > 0)
|
|
|
|
|
for (size_t __i = 0; __i < extents_type::rank(); ++__i)
|
|
|
|
|
_M_strides[__i] = index_type(__other.stride(__i));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr mapping&
|
|
|
|
|
operator=(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr const extents_type&
|
|
|
|
|
extents() const noexcept { return _M_extents; }
|
|
|
|
|
|
|
|
|
|
constexpr array<index_type, extents_type::rank()>
|
|
|
|
|
strides() const noexcept
|
|
|
|
|
{
|
|
|
|
|
array<index_type, extents_type::rank()> __ret;
|
|
|
|
|
for (size_t __i = 0; __i < extents_type::rank(); ++__i)
|
|
|
|
|
__ret[__i] = _M_strides[__i];
|
|
|
|
|
return __ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
required_span_size() const noexcept
|
|
|
|
|
{
|
|
|
|
|
if (__mdspan::__empty(_M_extents))
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
index_type __ret = 1;
|
|
|
|
|
for (size_t __i = 0; __i < extents_type::rank(); ++__i)
|
|
|
|
|
__ret += (_M_extents.extent(__i) - 1) * _M_strides[__i];
|
|
|
|
|
return __ret;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-03 17:28:27 +02:00
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4314. Missing move in mdspan layout mapping::operator()
|
2025-06-04 16:58:51 +02:00
|
|
|
template<__mdspan::__valid_index_type<index_type>... _Indices>
|
|
|
|
|
requires (sizeof...(_Indices) == extents_type::rank())
|
|
|
|
|
constexpr index_type
|
|
|
|
|
operator()(_Indices... __indices) const noexcept
|
|
|
|
|
{
|
|
|
|
|
return __mdspan::__linear_index_strides(*this,
|
2025-07-16 15:45:45 +02:00
|
|
|
static_cast<index_type>(std::move(__indices))...);
|
2025-06-04 16:58:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_unique() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4266. layout_stride::mapping should treat empty mappings as exhaustive
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_exhaustive() noexcept
|
|
|
|
|
{
|
|
|
|
|
return (_Extents::rank() == 0) || __mdspan::__contains_zero(
|
|
|
|
|
__mdspan::__static_extents<extents_type>());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_strided() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_unique() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4266. layout_stride::mapping should treat empty mappings as exhaustive
|
|
|
|
|
constexpr bool
|
|
|
|
|
is_exhaustive() const noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (!is_always_exhaustive())
|
|
|
|
|
{
|
2025-07-04 10:29:46 +02:00
|
|
|
auto __size = __mdspan::__size(_M_extents);
|
2025-06-04 16:58:51 +02:00
|
|
|
if(__size > 0)
|
|
|
|
|
return __size == required_span_size();
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_strided() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
stride(rank_type __r) const noexcept { return _M_strides[__r]; }
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__mapping_alike _OMapping>
|
|
|
|
|
requires ((extents_type::rank() == _OMapping::extents_type::rank())
|
|
|
|
|
&& _OMapping::is_always_strided())
|
|
|
|
|
friend constexpr bool
|
|
|
|
|
operator==(const mapping& __self, const _OMapping& __other) noexcept
|
|
|
|
|
{
|
|
|
|
|
if (__self.extents() != __other.extents())
|
|
|
|
|
return false;
|
|
|
|
|
if constexpr (extents_type::rank() > 0)
|
|
|
|
|
for (size_t __i = 0; __i < extents_type::rank(); ++__i)
|
|
|
|
|
if (!cmp_equal(__self.stride(__i), __other.stride(__i)))
|
|
|
|
|
return false;
|
|
|
|
|
return __mdspan::__offset(__other) == 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
2025-09-10 12:10:29 +02:00
|
|
|
using _Strides = typename __array_traits<index_type,
|
|
|
|
|
extents_type::rank()>::_Type;
|
2025-06-04 16:58:51 +02:00
|
|
|
[[no_unique_address]] extents_type _M_extents;
|
2025-09-10 12:10:29 +02:00
|
|
|
[[no_unique_address]] _Strides _M_strides;
|
2025-06-04 16:58:51 +02:00
|
|
|
};
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
#ifdef __glibcxx_padded_layouts
|
|
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
|
|
|
|
// Returns the smallest multiple of __x that is not less than __y.
// When __x is 0 or 1 no rounding is applied and __y is returned as is.
constexpr size_t
__least_multiple(size_t __x, size_t __y)
{
  if (__x <= 1)
    return __y;

  const size_t __quot = __y / __x;
  const size_t __rem = __y % __x;
  // Round the quotient up when __y is not already a multiple of __x.
  const size_t __count = (__rem != 0) ? __quot + 1 : __quot;
  return __count * __x;
}
|
|
|
|
|
|
|
|
|
|
template<typename _IndexType>
|
|
|
|
|
constexpr bool
|
|
|
|
|
__is_representable_least_multiple(size_t __x, size_t __y)
|
|
|
|
|
{
|
|
|
|
|
constexpr auto __y_max = __gnu_cxx::__int_traits<_IndexType>::__max;
|
|
|
|
|
if(std::cmp_greater(__y, __y_max))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if(__x <= 1)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
auto __max_delta = __y_max - static_cast<_IndexType>(__y);
|
|
|
|
|
auto __y_mod_x = __y % __x;
|
|
|
|
|
auto __delta = (__y_mod_x == 0) ? size_t(0) : (__x - __y_mod_x);
|
|
|
|
|
return std::cmp_less_equal(__delta, __max_delta);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents, size_t _PaddingValue, typename _LayoutTraits,
|
|
|
|
|
size_t _Rank = _Extents::rank()>
|
|
|
|
|
concept __valid_static_stride = (_Extents::rank() <= 1)
|
|
|
|
|
|| (_PaddingValue == dynamic_extent)
|
|
|
|
|
|| (_Extents::static_extent(_LayoutTraits::_S_ext_idx) == dynamic_extent)
|
|
|
|
|
|| (__is_representable_least_multiple<size_t>(
|
|
|
|
|
_PaddingValue, _Extents::static_extent(_LayoutTraits::_S_ext_idx)));
|
|
|
|
|
|
|
|
|
|
template<size_t _PaddedStride, typename _Extents,
|
|
|
|
|
typename _LayoutTraits>
|
|
|
|
|
consteval bool
|
|
|
|
|
__is_representable_padded_size()
|
|
|
|
|
{
|
|
|
|
|
using _IndexType = typename _Extents::index_type;
|
|
|
|
|
auto __sta_exts = __static_extents<_Extents>(
|
|
|
|
|
_LayoutTraits::_S_unpad_begin, _LayoutTraits::_S_unpad_end);
|
|
|
|
|
size_t __max = __gnu_cxx::__int_traits<_IndexType>::__max;
|
|
|
|
|
return __static_quotient(__sta_exts, __max / _PaddedStride) != 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _Extents, size_t _PaddedStride, typename _LayoutTraits,
|
|
|
|
|
size_t _Rank = _Extents::rank()>
|
|
|
|
|
concept __valid_padded_size = (_Rank <= 1)
|
|
|
|
|
|| (_PaddedStride == dynamic_extent)
|
|
|
|
|
|| (!__all_static(__static_extents<_Extents>()))
|
|
|
|
|
|| (__contains_zero(__static_extents<_Extents>()))
|
|
|
|
|
|| (__is_representable_padded_size<_PaddedStride, _Extents,
|
|
|
|
|
_LayoutTraits>());
|
|
|
|
|
|
|
|
|
|
// Linear index for a left-padded layout:
//
//   i0 + stride * (i1 + extents.extent(1) * (i2 + ...))
//
// __stride is a rank-one extents object whose extent(0) is the padded
// stride.  With no indices the result is 0.
template<typename _Extents, typename _Stride, typename... _Indices>
  constexpr typename _Extents::index_type
  __linear_index_leftpad(const _Extents& __exts, _Stride __stride,
                         _Indices... __indices)
  {
    using _IndexType = typename _Extents::index_type;
    _IndexType __res = 0;
    if constexpr (sizeof...(__indices) > 0)
      {
        _IndexType __mult = 1;

        // Called once per index, left to right; __pos counts the calls.
        auto __fold = [&, __pos = 0u](_IndexType __idx) mutable
          {
            if (__pos == 0)
              {
                // The leftmost index has unit stride; the next one is
                // scaled by the padded stride.
                __res += __idx;
                __mult = __stride.extent(0);
              }
            else
              {
                __res += __idx * __mult;
                __mult *= __exts.extent(__pos);
              }
            ++__pos;
          };
        (__fold(__indices), ...);
      }
    return __res;
  }
|
|
|
|
|
|
2025-09-29 08:00:19 +02:00
|
|
|
template<typename _Extents, typename _Stride, typename... _Indices>
|
|
|
|
|
constexpr typename _Extents::index_type
|
|
|
|
|
__linear_index_rightpad(const _Extents& __exts, _Stride __stride,
|
|
|
|
|
_Indices... __indices)
|
|
|
|
|
{
|
|
|
|
|
// i[n-1] + stride*(i[n-2] + extents.extent(n-2])*...)
|
|
|
|
|
using _IndexType = typename _Extents::index_type;
|
|
|
|
|
_IndexType __res = 0;
|
|
|
|
|
if constexpr (sizeof...(__indices) > 0)
|
|
|
|
|
{
|
|
|
|
|
_IndexType __mult = 1;
|
|
|
|
|
array<_IndexType, sizeof...(__indices)> __ind_arr{__indices...};
|
|
|
|
|
|
|
|
|
|
auto __update_rest = [&, __pos = __exts.rank()-1](_IndexType) mutable
|
|
|
|
|
{
|
|
|
|
|
--__pos;
|
|
|
|
|
__res += __ind_arr[__pos] * __mult;
|
|
|
|
|
__mult *= __exts.extent(__pos);
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
auto __update = [&](_IndexType, auto... __rest)
|
|
|
|
|
{
|
|
|
|
|
__res += __ind_arr[__exts.rank() - 1];
|
|
|
|
|
__mult = __stride.extent(0);
|
|
|
|
|
(__update_rest(__rest), ...);
|
|
|
|
|
};
|
|
|
|
|
__update(__indices...);
|
|
|
|
|
}
|
|
|
|
|
return __res;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
template<size_t _Rank>
|
|
|
|
|
struct _LeftPaddedLayoutTraits
|
|
|
|
|
{
|
|
|
|
|
using _LayoutSame = layout_left;
|
|
|
|
|
using _LayoutOther = layout_right;
|
|
|
|
|
|
|
|
|
|
constexpr static const size_t _S_ext_idx = 0;
|
|
|
|
|
constexpr static const size_t _S_stride_idx = 1;
|
|
|
|
|
constexpr static const size_t _S_unpad_begin = 1;
|
|
|
|
|
constexpr static const size_t _S_unpad_end = _Rank;
|
|
|
|
|
|
|
|
|
|
template<typename _IndexType, size_t _StaticStride, size_t..._Extents>
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr static auto
|
|
|
|
|
_S_make_padded_extent(
|
2025-09-29 08:00:18 +02:00
|
|
|
extents<_IndexType, _StaticStride> __stride,
|
|
|
|
|
const extents<_IndexType, _Extents...>& __exts)
|
|
|
|
|
{
|
|
|
|
|
auto __impl = [&]<size_t... _Is>(integer_sequence<size_t, _Is...>)
|
|
|
|
|
{
|
|
|
|
|
return extents<_IndexType, _StaticStride,
|
|
|
|
|
(_Extents...[_Is + 1])...>{
|
|
|
|
|
__stride.extent(0), __exts.extent(_Is + 1)...};
|
|
|
|
|
};
|
|
|
|
|
return __impl(make_index_sequence<sizeof...(_Extents) - 1>());
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
2025-09-29 08:00:19 +02:00
|
|
|
template<size_t _Rank>
|
|
|
|
|
struct _RightPaddedLayoutTraits
|
|
|
|
|
{
|
|
|
|
|
using _LayoutSame = layout_right;
|
|
|
|
|
using _LayoutOther = layout_left;
|
|
|
|
|
|
|
|
|
|
constexpr static size_t _S_ext_idx = _Rank - 1;
|
|
|
|
|
constexpr static size_t _S_stride_idx = _Rank - 2;
|
|
|
|
|
constexpr static size_t _S_unpad_begin = 0;
|
|
|
|
|
constexpr static size_t _S_unpad_end = _Rank - 1;
|
|
|
|
|
|
|
|
|
|
template<typename _IndexType, size_t _StaticStride, size_t..._Extents>
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr static auto
|
|
|
|
|
_S_make_padded_extent(
|
2025-09-29 08:00:19 +02:00
|
|
|
extents<_IndexType, _StaticStride> __stride,
|
|
|
|
|
const extents<_IndexType, _Extents...>& __exts)
|
|
|
|
|
{
|
|
|
|
|
auto __impl = [&]<size_t... _Is>(integer_sequence<size_t, _Is...>)
|
|
|
|
|
{
|
|
|
|
|
return extents<_IndexType, (_Extents...[_Is])..., _StaticStride>{
|
|
|
|
|
__exts.extent(_Is)..., __stride.extent(0)};
|
|
|
|
|
};
|
|
|
|
|
return __impl(make_index_sequence<sizeof...(_Extents) - 1>());
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
2025-09-29 08:00:18 +02:00
|
|
|
template<size_t _PaddingValue, typename _Extents, typename _LayoutTraits>
|
|
|
|
|
class _PaddedStorage
|
|
|
|
|
{
|
|
|
|
|
using _LayoutSame = typename _LayoutTraits::_LayoutSame;
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
using _IndexType = typename _Extents::index_type;
|
|
|
|
|
constexpr static size_t _S_rank = _Extents::rank();
|
|
|
|
|
|
|
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4372. Weaken Mandates: for dynamic padding values in padded layouts
|
|
|
|
|
static_assert((_PaddingValue == dynamic_extent)
|
|
|
|
|
|| (cmp_less_equal(_PaddingValue,
|
|
|
|
|
__gnu_cxx::__int_traits<_IndexType>::__max)),
|
|
|
|
|
"padding_value must be representable as index_type");
|
|
|
|
|
|
|
|
|
|
static_assert(__representable_size<_Extents, _IndexType>,
|
|
|
|
|
"The size of extents_type must be representable as index_type");
|
|
|
|
|
|
|
|
|
|
static_assert(__valid_static_stride<_Extents, _PaddingValue,
|
|
|
|
|
_LayoutTraits>,
|
|
|
|
|
"The padded stride must be representable as size_t");
|
|
|
|
|
|
|
|
|
|
static constexpr size_t _S_static_stride = [] consteval
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t __rank = _Extents::rank();
|
|
|
|
|
if constexpr (__rank <= 1)
|
|
|
|
|
return 0;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t __ext_idx = _LayoutTraits::_S_ext_idx;
|
|
|
|
|
constexpr size_t __sta_ext = _Extents::static_extent(__ext_idx);
|
|
|
|
|
if constexpr (__sta_ext == 0)
|
|
|
|
|
return size_t(0);
|
|
|
|
|
else if constexpr (_PaddingValue == dynamic_extent
|
|
|
|
|
|| __sta_ext == dynamic_extent)
|
|
|
|
|
return dynamic_extent;
|
|
|
|
|
else
|
|
|
|
|
return __least_multiple(_PaddingValue, __sta_ext);
|
|
|
|
|
}
|
|
|
|
|
}();
|
|
|
|
|
|
|
|
|
|
static_assert(_S_static_stride == dynamic_extent
|
|
|
|
|
|| cmp_less_equal(_S_static_stride,
|
|
|
|
|
__gnu_cxx::__int_traits<_IndexType>::__max),
|
|
|
|
|
"Padded stride must be representable as index_type");
|
|
|
|
|
|
|
|
|
|
static_assert(__valid_padded_size<_Extents, _S_static_stride,
|
|
|
|
|
_LayoutTraits>);
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
_PaddedStorage() noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_S_rank > 1)
|
|
|
|
|
if constexpr (_S_static_stride == dynamic_extent
|
|
|
|
|
&& _S_static_padextent() != dynamic_extent)
|
|
|
|
|
_M_stride = _Stride{_S_static_padextent()};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr explicit
|
|
|
|
|
_PaddedStorage(const _Extents& __exts)
|
|
|
|
|
: _M_extents(__exts)
|
|
|
|
|
{
|
|
|
|
|
if constexpr (!__all_static(__static_extents<_Extents>()))
|
|
|
|
|
__glibcxx_assert(__is_representable_extents(_M_extents));
|
|
|
|
|
|
|
|
|
|
if constexpr (_S_rank > 1)
|
|
|
|
|
{
|
|
|
|
|
_IndexType __stride;
|
|
|
|
|
if constexpr (_PaddingValue == dynamic_extent)
|
|
|
|
|
__stride = _M_padextent();
|
|
|
|
|
else if constexpr (_S_static_padextent() != dynamic_extent)
|
|
|
|
|
return;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(
|
|
|
|
|
__is_representable_least_multiple<_IndexType>(
|
|
|
|
|
_PaddingValue, _M_padextent()));
|
|
|
|
|
|
|
|
|
|
__stride = static_cast<_IndexType>(
|
|
|
|
|
__least_multiple(_PaddingValue, _M_padextent()));
|
|
|
|
|
|
|
|
|
|
__glibcxx_assert(__is_representable_extents(
|
|
|
|
|
_LayoutTraits::_S_make_padded_extent(
|
|
|
|
|
std::dextents<_IndexType, 1>{__stride},
|
|
|
|
|
_M_extents)));
|
|
|
|
|
}
|
|
|
|
|
_M_stride = _Stride{__stride};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr explicit
|
|
|
|
|
_PaddedStorage(const _Extents& __exts, _IndexType __pad)
|
|
|
|
|
: _M_extents(__exts)
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_PaddingValue != dynamic_extent)
|
|
|
|
|
__glibcxx_assert(cmp_equal(_PaddingValue, __pad));
|
|
|
|
|
if constexpr (_S_rank > 1 && _S_static_stride == dynamic_extent)
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(
|
|
|
|
|
__is_representable_least_multiple<_IndexType>(
|
|
|
|
|
__pad, _M_padextent()));
|
|
|
|
|
|
|
|
|
|
_M_stride = _Stride{static_cast<_IndexType>(
|
|
|
|
|
__least_multiple(__pad, _M_padextent()))};
|
|
|
|
|
|
|
|
|
|
__glibcxx_assert(__is_representable_extents(
|
|
|
|
|
_LayoutTraits::_S_make_padded_extent(
|
|
|
|
|
_M_stride, _M_extents)));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
_PaddedStorage(const typename _LayoutSame::mapping<_OExtents>&
|
|
|
|
|
__other)
|
|
|
|
|
: _PaddedStorage(_Extents(__other.extents()))
|
|
|
|
|
{
|
|
|
|
|
constexpr size_t __stride_idx = _LayoutTraits::_S_stride_idx;
|
|
|
|
|
constexpr size_t __ext_idx = _LayoutTraits::_S_ext_idx;
|
|
|
|
|
if constexpr (_S_rank > 1 && _PaddingValue != dynamic_extent)
|
|
|
|
|
{
|
|
|
|
|
static_assert(_S_static_stride == dynamic_extent
|
|
|
|
|
|| _OExtents::static_extent(__ext_idx) == dynamic_extent
|
|
|
|
|
|| _S_static_stride == _OExtents::static_extent(__ext_idx),
|
|
|
|
|
"The padded stride must be compatible with other");
|
|
|
|
|
|
|
|
|
|
if constexpr (_S_static_stride == dynamic_extent
|
|
|
|
|
|| _OExtents::static_extent(__stride_idx) == dynamic_extent)
|
|
|
|
|
__glibcxx_assert(std::cmp_equal(_M_padstride(),
|
|
|
|
|
_M_padextent()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
_PaddedStorage(const typename layout_stride::mapping<_OExtents>&
|
|
|
|
|
__other)
|
|
|
|
|
: _M_extents(__other.extents())
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(cmp_less_equal(__other.required_span_size(),
|
|
|
|
|
__gnu_cxx::__int_traits<_IndexType>
|
|
|
|
|
::__max));
|
|
|
|
|
|
|
|
|
|
constexpr size_t __stride_idx = _LayoutTraits::_S_stride_idx;
|
|
|
|
|
if constexpr (_S_rank > 1)
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_PaddingValue != dynamic_extent)
|
|
|
|
|
__glibcxx_assert(cmp_equal(__other.stride(__stride_idx),
|
|
|
|
|
_M_calc_padstride())
|
|
|
|
|
&& "The padded stride must be compatible with other");
|
|
|
|
|
if constexpr (_S_static_stride == dynamic_extent)
|
|
|
|
|
_M_stride = _Stride{__other.stride(__stride_idx)};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _SamePaddedMapping>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
_PaddedStorage(_LayoutTraits::_LayoutSame,
|
|
|
|
|
const _SamePaddedMapping& __other)
|
|
|
|
|
: _M_extents(__other.extents())
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_S_rank > 1)
|
|
|
|
|
{
|
|
|
|
|
static_assert(_PaddingValue == dynamic_extent
|
|
|
|
|
|| _SamePaddedMapping::padding_value == dynamic_extent
|
|
|
|
|
|| _PaddingValue == _SamePaddedMapping::padding_value,
|
|
|
|
|
"If neither PaddingValue is dynamic_extent, then they must "
|
|
|
|
|
"be equal");
|
|
|
|
|
|
|
|
|
|
constexpr size_t __stride_idx = _LayoutTraits::_S_stride_idx;
|
|
|
|
|
if constexpr (_PaddingValue != dynamic_extent)
|
|
|
|
|
__glibcxx_assert(cmp_equal(__other.stride(__stride_idx),
|
|
|
|
|
_M_calc_padstride())
|
|
|
|
|
&& "The padded stride must be compatible with other");
|
|
|
|
|
if constexpr (_S_static_stride == dynamic_extent)
|
|
|
|
|
_M_stride = _Stride{__other.stride(__stride_idx)};
|
|
|
|
|
}
|
|
|
|
|
__glibcxx_assert(cmp_less_equal(__other.required_span_size(),
|
|
|
|
|
__gnu_cxx::__int_traits<_IndexType>::__max));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _OtherPaddedMapping>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
_PaddedStorage(_LayoutTraits::_LayoutOther,
|
|
|
|
|
const _OtherPaddedMapping& __other) noexcept
|
|
|
|
|
: _M_extents(__other.extents())
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(cmp_less_equal(__other.required_span_size(),
|
|
|
|
|
__gnu_cxx::__int_traits<_IndexType>::__max));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
_M_is_always_exhaustive() noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_S_rank <= 1)
|
|
|
|
|
return true;
|
|
|
|
|
else
|
|
|
|
|
return _S_static_padextent() != dynamic_extent
|
|
|
|
|
&& _S_static_stride != dynamic_extent
|
|
|
|
|
&& _S_static_padextent() == _S_static_stride;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr bool
|
|
|
|
|
_M_is_exhaustive() const noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_M_is_always_exhaustive())
|
|
|
|
|
return true;
|
|
|
|
|
else
|
|
|
|
|
return cmp_equal(_M_padextent(), _M_padstride());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr static size_t
|
|
|
|
|
_S_static_padextent() noexcept
|
|
|
|
|
{ return _Extents::static_extent(_LayoutTraits::_S_ext_idx); }
|
|
|
|
|
|
|
|
|
|
constexpr _IndexType
|
|
|
|
|
_M_padextent() const noexcept
|
|
|
|
|
{ return _M_extents.extent(_LayoutTraits::_S_ext_idx); }
|
|
|
|
|
|
|
|
|
|
constexpr _IndexType
|
|
|
|
|
_M_calc_padstride() const noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_S_static_stride != dynamic_extent)
|
|
|
|
|
return _S_static_stride;
|
|
|
|
|
else if constexpr (_PaddingValue != dynamic_extent)
|
|
|
|
|
return __least_multiple(_PaddingValue, _M_padextent());
|
|
|
|
|
else
|
|
|
|
|
return _M_padextent();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr _IndexType
|
|
|
|
|
_M_padstride() const noexcept
|
|
|
|
|
{ return _M_stride.extent(0); }
|
|
|
|
|
|
|
|
|
|
constexpr _IndexType
|
|
|
|
|
_M_required_span_size() const noexcept
|
|
|
|
|
{
|
|
|
|
|
if constexpr (_S_rank == 0)
|
|
|
|
|
return 1;
|
|
|
|
|
else if (__mdspan::__empty(_M_extents))
|
|
|
|
|
return 0;
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
size_t __stride = static_cast<size_t>(_M_padstride());
|
|
|
|
|
size_t __prod_rest = __mdspan::__fwd_prod(_M_extents,
|
|
|
|
|
_LayoutTraits::_S_unpad_begin, _LayoutTraits::_S_unpad_end);
|
|
|
|
|
size_t __delta = _M_padstride() - _M_padextent();
|
|
|
|
|
return static_cast<_IndexType>(__stride * __prod_rest - __delta);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _SamePaddedMapping>
|
|
|
|
|
constexpr bool
|
|
|
|
|
_M_equal(const _SamePaddedMapping& __other) const noexcept
|
|
|
|
|
{
|
|
|
|
|
return _M_extents == __other.extents()
|
|
|
|
|
&& (_S_rank < 2
|
|
|
|
|
|| cmp_equal(_M_stride.extent(0),
|
|
|
|
|
__other.stride(_LayoutTraits::_S_stride_idx)));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
using _Stride = std::extents<_IndexType, _S_static_stride>;
|
|
|
|
|
[[no_unique_address]] _Stride _M_stride;
|
|
|
|
|
[[no_unique_address]] _Extents _M_extents;
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<size_t _PaddingValue>
|
|
|
|
|
template<typename _Extents>
|
|
|
|
|
class layout_left_padded<_PaddingValue>::mapping
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
static constexpr size_t padding_value = _PaddingValue;
|
|
|
|
|
|
|
|
|
|
using extents_type = _Extents;
|
|
|
|
|
using index_type = typename extents_type::index_type;
|
|
|
|
|
using size_type = typename extents_type::size_type;
|
|
|
|
|
using rank_type = typename extents_type::rank_type;
|
|
|
|
|
using layout_type = layout_left_padded<padding_value>;
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
static constexpr size_t _S_rank = extents_type::rank();
|
|
|
|
|
using _PaddedStorage = __mdspan::_PaddedStorage<_PaddingValue,
|
|
|
|
|
_Extents, __mdspan::_LeftPaddedLayoutTraits<_S_rank>>;
|
|
|
|
|
[[no_unique_address]] _PaddedStorage _M_storage;
|
|
|
|
|
|
|
|
|
|
consteval friend size_t
|
|
|
|
|
__mdspan::__get_static_stride<mapping>();
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
_M_extent(size_t __r) const noexcept
|
|
|
|
|
{ return _M_storage._M_extents.extent(__r); }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
_M_padstride() const noexcept
|
|
|
|
|
{ return _M_storage._M_stride.extent(0); }
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
constexpr
|
|
|
|
|
mapping() noexcept
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __exts)
|
|
|
|
|
: _M_storage(__exts)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type> _OIndexType>
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __exts, _OIndexType __pad)
|
2025-09-29 08:00:18 +02:00
|
|
|
: _M_storage(__exts,
|
|
|
|
|
__mdspan::__index_type_cast<index_type>(std::move(__pad)))
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
|
|
|
|
|
mapping(const layout_left::mapping<_OExtents>& __other)
|
|
|
|
|
: _M_storage(__other)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<_OExtents, extents_type>
|
|
|
|
|
constexpr explicit(_OExtents::rank() > 0)
|
|
|
|
|
mapping(const typename layout_stride::mapping<_OExtents>& __other)
|
|
|
|
|
: _M_storage(__other)
|
|
|
|
|
{ __glibcxx_assert(*this == __other); }
|
|
|
|
|
|
|
|
|
|
template<typename _LeftpadMapping>
|
|
|
|
|
requires __mdspan::__is_left_padded_mapping<_LeftpadMapping>
|
|
|
|
|
&& is_constructible_v<extents_type,
|
|
|
|
|
typename _LeftpadMapping::extents_type>
|
|
|
|
|
constexpr explicit(_S_rank > 1 && (padding_value != dynamic_extent
|
|
|
|
|
|| _LeftpadMapping::padding_value == dynamic_extent))
|
|
|
|
|
mapping(const _LeftpadMapping& __other)
|
|
|
|
|
: _M_storage(layout_left{}, __other)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _RightPaddedMapping>
|
|
|
|
|
requires (__mdspan::__is_right_padded_mapping<_RightPaddedMapping>
|
|
|
|
|
|| __mdspan::__mapping_of<layout_right, _RightPaddedMapping>)
|
|
|
|
|
&& (_S_rank <= 1)
|
|
|
|
|
&& is_constructible_v<extents_type,
|
|
|
|
|
typename _RightPaddedMapping::extents_type>
|
|
|
|
|
constexpr explicit(!is_convertible_v<
|
|
|
|
|
typename _RightPaddedMapping::extents_type, extents_type>)
|
|
|
|
|
mapping(const _RightPaddedMapping& __other) noexcept
|
|
|
|
|
: _M_storage(layout_right{}, __other)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr mapping&
|
|
|
|
|
operator=(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr const extents_type&
|
|
|
|
|
extents() const noexcept { return _M_storage._M_extents; }
|
|
|
|
|
|
|
|
|
|
constexpr array<index_type, _S_rank>
|
|
|
|
|
strides() const noexcept
|
|
|
|
|
{
|
|
|
|
|
array<index_type, _S_rank> __ret;
|
|
|
|
|
if constexpr (_S_rank > 0)
|
|
|
|
|
__ret[0] = 1;
|
|
|
|
|
if constexpr (_S_rank > 1)
|
|
|
|
|
__ret[1] = _M_padstride();
|
|
|
|
|
if constexpr (_S_rank > 2)
|
|
|
|
|
for(size_t __i = 2; __i < _S_rank; ++__i)
|
|
|
|
|
__ret[__i] = __ret[__i - 1] * _M_extent(__i - 1);
|
|
|
|
|
return __ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
required_span_size() const noexcept
|
|
|
|
|
{ return _M_storage._M_required_span_size(); }
|
|
|
|
|
|
|
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4314. Missing move in mdspan layout mapping::operator()
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type>... _Indices>
|
|
|
|
|
requires (sizeof...(_Indices) == _S_rank)
|
|
|
|
|
constexpr index_type
|
|
|
|
|
operator()(_Indices... __indices) const noexcept
|
|
|
|
|
{
|
|
|
|
|
return __mdspan::__linear_index_leftpad(
|
|
|
|
|
extents(), _M_storage._M_stride,
|
|
|
|
|
static_cast<index_type>(std::move(__indices))...);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_exhaustive() noexcept
|
|
|
|
|
{ return _PaddedStorage::_M_is_always_exhaustive(); }
|
|
|
|
|
|
|
|
|
|
constexpr bool
|
|
|
|
|
is_exhaustive() noexcept
|
|
|
|
|
{ return _M_storage._M_is_exhaustive(); }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_unique() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
2025-09-30 12:55:18 +02:00
|
|
|
is_unique() noexcept { return true; }
|
2025-09-29 08:00:18 +02:00
|
|
|
|
|
|
|
|
static constexpr bool
|
2025-09-30 12:55:18 +02:00
|
|
|
is_always_strided() noexcept { return true; }
|
2025-09-29 08:00:18 +02:00
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_strided() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
stride(rank_type __r) const noexcept
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(__r < _S_rank);
|
|
|
|
|
if (__r == 0)
|
|
|
|
|
return 1;
|
|
|
|
|
else
|
|
|
|
|
return static_cast<index_type>(
|
|
|
|
|
static_cast<size_t>(_M_padstride()) *
|
|
|
|
|
static_cast<size_t>(__mdspan::__fwd_prod(extents(), 1, __r)));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _LeftpadMapping>
|
|
|
|
|
requires(__mdspan::__is_left_padded_mapping<_LeftpadMapping>
|
|
|
|
|
&& _LeftpadMapping::extents_type::rank() == _S_rank)
|
|
|
|
|
friend constexpr bool
|
|
|
|
|
operator==(const mapping& __self, const _LeftpadMapping& __other)
|
|
|
|
|
noexcept
|
|
|
|
|
{ return __self._M_storage._M_equal(__other); }
|
|
|
|
|
};
|
2025-09-29 08:00:19 +02:00
|
|
|
|
|
|
|
|
template<size_t _PaddingValue>
|
|
|
|
|
template<typename _Extents>
|
|
|
|
|
class layout_right_padded<_PaddingValue>::mapping {
|
|
|
|
|
public:
|
|
|
|
|
static constexpr size_t padding_value = _PaddingValue;
|
|
|
|
|
using extents_type = _Extents;
|
|
|
|
|
using index_type = typename extents_type::index_type;
|
|
|
|
|
using size_type = typename extents_type::size_type;
|
|
|
|
|
using rank_type = typename extents_type::rank_type;
|
|
|
|
|
using layout_type = layout_right_padded<_PaddingValue>;
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
static constexpr size_t _S_rank = extents_type::rank();
|
|
|
|
|
using _PaddedStorage = __mdspan::_PaddedStorage<_PaddingValue,
|
|
|
|
|
_Extents, __mdspan::_RightPaddedLayoutTraits<_S_rank>>;
|
|
|
|
|
[[no_unique_address]] _PaddedStorage _M_storage;
|
|
|
|
|
|
|
|
|
|
consteval friend size_t
|
|
|
|
|
__mdspan::__get_static_stride<mapping>();
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
_M_extent(size_t __r) const noexcept
|
|
|
|
|
{ return _M_storage._M_extents.extent(__r); }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
_M_padstride() const noexcept
|
|
|
|
|
{ return _M_storage._M_stride.extent(0); }
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
constexpr
|
|
|
|
|
mapping() noexcept
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __exts)
|
|
|
|
|
: _M_storage(__exts)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type> _OIndexType>
|
2025-09-30 12:55:18 +02:00
|
|
|
constexpr
|
|
|
|
|
mapping(const extents_type& __exts, _OIndexType __pad)
|
2025-09-29 08:00:19 +02:00
|
|
|
: _M_storage(__exts,
|
|
|
|
|
__mdspan::__index_type_cast<index_type>(std::move(__pad)))
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<extents_type, _OExtents>
|
|
|
|
|
constexpr explicit(!is_convertible_v<_OExtents, extents_type>)
|
|
|
|
|
mapping(const layout_right::mapping<_OExtents>& __other)
|
|
|
|
|
: _M_storage(__other)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OExtents>
|
|
|
|
|
requires is_constructible_v<_OExtents, extents_type>
|
|
|
|
|
constexpr explicit(_OExtents::rank() > 0)
|
|
|
|
|
mapping(const typename layout_stride::mapping<_OExtents>& __other)
|
|
|
|
|
: _M_storage(__other)
|
|
|
|
|
{ __glibcxx_assert(*this == __other); }
|
|
|
|
|
|
|
|
|
|
template<typename _RightPaddedMapping>
|
|
|
|
|
requires __mdspan::__is_right_padded_mapping<_RightPaddedMapping>
|
|
|
|
|
&& is_constructible_v<extents_type,
|
|
|
|
|
typename _RightPaddedMapping::extents_type>
|
|
|
|
|
constexpr explicit(_S_rank > 1 && (padding_value != dynamic_extent
|
|
|
|
|
|| _RightPaddedMapping::padding_value == dynamic_extent))
|
|
|
|
|
mapping(const _RightPaddedMapping& __other)
|
|
|
|
|
: _M_storage(layout_right{}, __other)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _LeftPaddedMapping>
|
|
|
|
|
requires (__mdspan::__is_left_padded_mapping<_LeftPaddedMapping>
|
|
|
|
|
|| __mdspan::__mapping_of<layout_left, _LeftPaddedMapping>)
|
|
|
|
|
&& (_S_rank <= 1)
|
|
|
|
|
&& is_constructible_v<extents_type,
|
|
|
|
|
typename _LeftPaddedMapping::extents_type>
|
|
|
|
|
constexpr explicit(!is_convertible_v<
|
|
|
|
|
typename _LeftPaddedMapping::extents_type, extents_type>)
|
|
|
|
|
mapping(const _LeftPaddedMapping& __other) noexcept
|
|
|
|
|
: _M_storage(layout_left{}, __other)
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr mapping& operator=(const mapping&) noexcept = default;
|
|
|
|
|
|
|
|
|
|
constexpr const extents_type&
|
|
|
|
|
extents() const noexcept { return _M_storage._M_extents; }
|
|
|
|
|
|
|
|
|
|
constexpr array<index_type, _S_rank>
|
|
|
|
|
strides() const noexcept
|
|
|
|
|
{
|
|
|
|
|
array<index_type, _S_rank> __ret;
|
|
|
|
|
if constexpr (_S_rank > 0)
|
|
|
|
|
__ret[_S_rank - 1] = 1;
|
|
|
|
|
if constexpr (_S_rank > 1)
|
|
|
|
|
__ret[_S_rank - 2] = _M_padstride();
|
|
|
|
|
if constexpr (_S_rank > 2)
|
|
|
|
|
for(size_t __i = _S_rank - 2; __i > 0; --__i)
|
|
|
|
|
__ret[__i - 1] = __ret[__i] * _M_extent(__i);
|
|
|
|
|
return __ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
required_span_size() const noexcept
|
|
|
|
|
{ return _M_storage._M_required_span_size(); }
|
|
|
|
|
|
|
|
|
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
|
|
|
|
// 4314. Missing move in mdspan layout mapping::operator()
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type>... _Indices>
|
|
|
|
|
requires (sizeof...(_Indices) == _S_rank)
|
|
|
|
|
constexpr index_type
|
|
|
|
|
operator()(_Indices... __indices) const noexcept
|
|
|
|
|
{
|
|
|
|
|
return __mdspan::__linear_index_rightpad(
|
|
|
|
|
extents(), _M_storage._M_stride,
|
|
|
|
|
static_cast<index_type>(std::move(__indices))...);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_exhaustive() noexcept
|
|
|
|
|
{ return _PaddedStorage::_M_is_always_exhaustive(); }
|
|
|
|
|
|
|
|
|
|
constexpr bool
|
|
|
|
|
is_exhaustive() noexcept
|
|
|
|
|
{ return _M_storage._M_is_exhaustive(); }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_always_unique() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
static constexpr bool
|
2025-09-30 12:55:18 +02:00
|
|
|
is_unique() noexcept { return true; }
|
2025-09-29 08:00:19 +02:00
|
|
|
|
|
|
|
|
static constexpr bool
|
2025-09-30 12:55:18 +02:00
|
|
|
is_always_strided() noexcept { return true; }
|
2025-09-29 08:00:19 +02:00
|
|
|
|
|
|
|
|
static constexpr bool
|
|
|
|
|
is_strided() noexcept { return true; }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
stride(rank_type __r) const noexcept
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(__r < _S_rank);
|
|
|
|
|
if constexpr (_S_rank <= 1)
|
|
|
|
|
return 1;
|
|
|
|
|
else if (__r == _S_rank - 1)
|
|
|
|
|
return 1;
|
|
|
|
|
else if (__r == _S_rank - 2)
|
|
|
|
|
return _M_padstride();
|
|
|
|
|
else
|
|
|
|
|
return static_cast<index_type>(
|
|
|
|
|
static_cast<size_t>(_M_padstride()) *
|
|
|
|
|
static_cast<size_t>(__mdspan::__fwd_prod(
|
|
|
|
|
extents(), __r + 1, _S_rank - 1)));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _RightPaddedMapping>
|
|
|
|
|
requires(__mdspan::__is_right_padded_mapping<_RightPaddedMapping>
|
|
|
|
|
&& _RightPaddedMapping::extents_type::rank() == _S_rank)
|
|
|
|
|
friend constexpr bool
|
|
|
|
|
operator==(const mapping& __self, const _RightPaddedMapping& __other)
|
|
|
|
|
noexcept
|
|
|
|
|
{ return __self._M_storage._M_equal(__other); }
|
|
|
|
|
};
|
2025-09-29 08:00:18 +02:00
|
|
|
#endif // __glibcxx_padded_layouts
|
|
|
|
|
|
2025-06-30 09:23:16 +02:00
|
|
|
template<typename _ElementType>
|
|
|
|
|
struct default_accessor
|
|
|
|
|
{
|
|
|
|
|
static_assert(!is_array_v<_ElementType>,
|
|
|
|
|
"ElementType must not be an array type");
|
|
|
|
|
static_assert(!is_abstract_v<_ElementType>,
|
|
|
|
|
"ElementType must not be an abstract class type");
|
|
|
|
|
|
|
|
|
|
using offset_policy = default_accessor;
|
|
|
|
|
using element_type = _ElementType;
|
|
|
|
|
using reference = element_type&;
|
|
|
|
|
using data_handle_type = element_type*;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
default_accessor() noexcept = default;
|
|
|
|
|
|
|
|
|
|
template<typename _OElementType>
|
|
|
|
|
requires is_convertible_v<_OElementType(*)[], element_type(*)[]>
|
|
|
|
|
constexpr
|
|
|
|
|
default_accessor(default_accessor<_OElementType>) noexcept
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr reference
|
|
|
|
|
access(data_handle_type __p, size_t __i) const noexcept
|
|
|
|
|
{ return __p[__i]; }
|
|
|
|
|
|
|
|
|
|
constexpr data_handle_type
|
|
|
|
|
offset(data_handle_type __p, size_t __i) const noexcept
|
|
|
|
|
{ return __p + __i; }
|
|
|
|
|
};
|
|
|
|
|
|
2025-08-04 12:59:27 +02:00
|
|
|
#ifdef __glibcxx_aligned_accessor
|
|
|
|
|
template<typename _ElementType, size_t _ByteAlignment>
|
|
|
|
|
struct aligned_accessor
|
|
|
|
|
{
|
|
|
|
|
static_assert(has_single_bit(_ByteAlignment),
|
|
|
|
|
"ByteAlignment must be a power of two");
|
|
|
|
|
static_assert(_ByteAlignment >= alignof(_ElementType));
|
|
|
|
|
|
|
|
|
|
using offset_policy = default_accessor<_ElementType>;
|
|
|
|
|
using element_type = _ElementType;
|
|
|
|
|
using reference = element_type&;
|
|
|
|
|
using data_handle_type = element_type*;
|
|
|
|
|
|
|
|
|
|
static constexpr size_t byte_alignment = _ByteAlignment;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
aligned_accessor() noexcept = default;
|
|
|
|
|
|
|
|
|
|
template<typename _OElementType, size_t _OByteAlignment>
|
2025-09-03 17:28:27 +02:00
|
|
|
requires (_OByteAlignment >= byte_alignment)
|
2025-08-04 12:59:27 +02:00
|
|
|
&& is_convertible_v<_OElementType(*)[], element_type(*)[]>
|
|
|
|
|
constexpr
|
|
|
|
|
aligned_accessor(aligned_accessor<_OElementType, _OByteAlignment>)
|
|
|
|
|
noexcept
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OElementType>
|
|
|
|
|
requires is_convertible_v<_OElementType(*)[], element_type(*)[]>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
aligned_accessor(default_accessor<_OElementType>) noexcept
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OElementType>
|
|
|
|
|
requires is_convertible_v<element_type(*)[], _OElementType(*)[]>
|
|
|
|
|
constexpr
|
|
|
|
|
operator default_accessor<_OElementType>() const noexcept
|
|
|
|
|
{ return {}; }
|
|
|
|
|
|
|
|
|
|
constexpr reference
|
|
|
|
|
access(data_handle_type __p, size_t __i) const noexcept
|
|
|
|
|
{ return std::assume_aligned<byte_alignment>(__p)[__i]; }
|
|
|
|
|
|
|
|
|
|
constexpr typename offset_policy::data_handle_type
|
|
|
|
|
offset(data_handle_type __p, size_t __i) const noexcept
|
|
|
|
|
{ return std::assume_aligned<byte_alignment>(__p) + __i; }
|
|
|
|
|
};
|
|
|
|
|
#endif
|
|
|
|
|
|
2025-07-08 10:24:26 +02:00
|
|
|
namespace __mdspan
|
|
|
|
|
{
|
|
|
|
|
template<typename _Extents, typename _IndexType, size_t _Nm>
|
|
|
|
|
constexpr bool
|
|
|
|
|
__is_multi_index(const _Extents& __exts, span<_IndexType, _Nm> __indices)
|
|
|
|
|
{
|
|
|
|
|
static_assert(__exts.rank() == _Nm);
|
|
|
|
|
for (size_t __i = 0; __i < __exts.rank(); ++__i)
|
|
|
|
|
if (__indices[__i] >= __exts.extent(__i))
|
|
|
|
|
return false;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename _ElementType, typename _Extents,
|
|
|
|
|
typename _LayoutPolicy = layout_right,
|
|
|
|
|
typename _AccessorPolicy = default_accessor<_ElementType>>
|
|
|
|
|
class mdspan
|
|
|
|
|
{
|
|
|
|
|
static_assert(!is_array_v<_ElementType>,
|
|
|
|
|
"ElementType must not be an array type");
|
|
|
|
|
static_assert(!is_abstract_v<_ElementType>,
|
|
|
|
|
"ElementType must not be an abstract class type");
|
|
|
|
|
static_assert(__mdspan::__is_extents<_Extents>,
|
|
|
|
|
"Extents must be a specialization of std::extents");
|
|
|
|
|
static_assert(is_same_v<_ElementType,
|
|
|
|
|
typename _AccessorPolicy::element_type>);
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
using extents_type = _Extents;
|
|
|
|
|
using layout_type = _LayoutPolicy;
|
|
|
|
|
using accessor_type = _AccessorPolicy;
|
|
|
|
|
using mapping_type = typename layout_type::template mapping<extents_type>;
|
|
|
|
|
using element_type = _ElementType;
|
|
|
|
|
using value_type = remove_cv_t<element_type>;
|
|
|
|
|
using index_type = typename extents_type::index_type;
|
|
|
|
|
using size_type = typename extents_type::size_type;
|
|
|
|
|
using rank_type = typename extents_type::rank_type;
|
|
|
|
|
using data_handle_type = typename accessor_type::data_handle_type;
|
|
|
|
|
using reference = typename accessor_type::reference;
|
|
|
|
|
|
|
|
|
|
static constexpr rank_type
|
|
|
|
|
rank() noexcept { return extents_type::rank(); }
|
|
|
|
|
|
|
|
|
|
static constexpr rank_type
|
|
|
|
|
rank_dynamic() noexcept { return extents_type::rank_dynamic(); }
|
|
|
|
|
|
|
|
|
|
static constexpr size_t
|
|
|
|
|
static_extent(rank_type __r) noexcept
|
|
|
|
|
{ return extents_type::static_extent(__r); }
|
|
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
extent(rank_type __r) const noexcept { return extents().extent(__r); }
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mdspan()
|
|
|
|
|
requires (rank_dynamic() > 0)
|
2025-07-21 13:07:37 +02:00
|
|
|
&& is_default_constructible_v<data_handle_type>
|
2025-07-08 10:24:26 +02:00
|
|
|
&& is_default_constructible_v<mapping_type>
|
2025-07-21 13:07:37 +02:00
|
|
|
&& is_default_constructible_v<accessor_type> = default;
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mdspan(const mdspan& __other) = default;
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mdspan(mdspan&& __other) = default;
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type>... _OIndexTypes>
|
|
|
|
|
requires (sizeof...(_OIndexTypes) == rank()
|
|
|
|
|
|| sizeof...(_OIndexTypes) == rank_dynamic())
|
|
|
|
|
&& is_constructible_v<mapping_type, extents_type>
|
|
|
|
|
&& is_default_constructible_v<accessor_type>
|
|
|
|
|
constexpr explicit
|
|
|
|
|
mdspan(data_handle_type __handle, _OIndexTypes... __exts)
|
|
|
|
|
: _M_accessor(),
|
|
|
|
|
_M_mapping(_Extents(static_cast<index_type>(std::move(__exts))...)),
|
|
|
|
|
_M_handle(std::move(__handle))
|
|
|
|
|
{ }
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType, size_t _Nm>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
|
|
|
|
&& (_Nm == rank() || _Nm == rank_dynamic())
|
2025-07-08 10:24:26 +02:00
|
|
|
&& is_constructible_v<mapping_type, extents_type>
|
|
|
|
|
&& is_default_constructible_v<accessor_type>
|
|
|
|
|
constexpr explicit(_Nm != rank_dynamic())
|
|
|
|
|
mdspan(data_handle_type __handle, span<_OIndexType, _Nm> __exts)
|
|
|
|
|
: _M_accessor(), _M_mapping(extents_type(__exts)),
|
|
|
|
|
_M_handle(std::move(__handle))
|
|
|
|
|
{ }
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType, size_t _Nm>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
|
|
|
|
&& (_Nm == rank() || _Nm == rank_dynamic())
|
2025-07-08 10:24:26 +02:00
|
|
|
&& is_constructible_v<mapping_type, extents_type>
|
|
|
|
|
&& is_default_constructible_v<accessor_type>
|
|
|
|
|
constexpr explicit(_Nm != rank_dynamic())
|
|
|
|
|
mdspan(data_handle_type __handle, const array<_OIndexType, _Nm>& __exts)
|
|
|
|
|
: _M_accessor(), _M_mapping(extents_type(__exts)),
|
|
|
|
|
_M_handle(std::move(__handle))
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mdspan(data_handle_type __handle, const extents_type& __exts)
|
|
|
|
|
requires is_constructible_v<mapping_type, const extents_type&>
|
|
|
|
|
&& is_default_constructible_v<accessor_type>
|
|
|
|
|
: _M_accessor(), _M_mapping(__exts), _M_handle(std::move(__handle))
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mdspan(data_handle_type __handle, const mapping_type& __mapping)
|
|
|
|
|
requires is_default_constructible_v<accessor_type>
|
|
|
|
|
: _M_accessor(), _M_mapping(__mapping), _M_handle(std::move(__handle))
|
|
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
constexpr
|
|
|
|
|
mdspan(data_handle_type __handle, const mapping_type& __mapping,
|
|
|
|
|
const accessor_type& __accessor)
|
|
|
|
|
: _M_accessor(__accessor), _M_mapping(__mapping),
|
2025-09-29 08:00:18 +02:00
|
|
|
_M_handle(std::move(__handle))
|
2025-07-08 10:24:26 +02:00
|
|
|
{ }
|
|
|
|
|
|
|
|
|
|
template<typename _OElementType, typename _OExtents, typename _OLayout,
|
|
|
|
|
typename _OAccessor>
|
|
|
|
|
requires is_constructible_v<mapping_type,
|
2025-07-08 22:04:29 +01:00
|
|
|
const typename _OLayout::template mapping<_OExtents>&>
|
2025-07-08 10:24:26 +02:00
|
|
|
&& is_constructible_v<accessor_type, const _OAccessor&>
|
|
|
|
|
constexpr explicit(!is_convertible_v<
|
2025-07-08 22:04:29 +01:00
|
|
|
const typename _OLayout::template mapping<_OExtents>&, mapping_type>
|
2025-07-08 10:24:26 +02:00
|
|
|
|| !is_convertible_v<const _OAccessor&, accessor_type>)
|
|
|
|
|
mdspan(const mdspan<_OElementType, _OExtents, _OLayout, _OAccessor>&
|
|
|
|
|
__other)
|
|
|
|
|
: _M_accessor(__other.accessor()), _M_mapping(__other.mapping()),
|
|
|
|
|
_M_handle(__other.data_handle())
|
|
|
|
|
{
|
|
|
|
|
static_assert(is_constructible_v<data_handle_type,
|
|
|
|
|
const typename _OAccessor::data_handle_type&>);
|
|
|
|
|
static_assert(is_constructible_v<extents_type, _OExtents>);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr mdspan&
|
|
|
|
|
operator=(const mdspan& __other) = default;
|
|
|
|
|
|
|
|
|
|
constexpr mdspan&
|
|
|
|
|
operator=(mdspan&& __other) = default;
|
|
|
|
|
|
|
|
|
|
template<__mdspan::__valid_index_type<index_type>... _OIndexTypes>
|
|
|
|
|
requires (sizeof...(_OIndexTypes) == rank())
|
|
|
|
|
constexpr reference
|
|
|
|
|
operator[](_OIndexTypes... __indices) const
|
|
|
|
|
{
|
|
|
|
|
auto __checked_call = [this](auto... __idxs) -> index_type
|
|
|
|
|
{
|
|
|
|
|
if constexpr (sizeof...(__idxs) > 0)
|
|
|
|
|
__glibcxx_assert(__mdspan::__is_multi_index(extents(),
|
|
|
|
|
span<const index_type, sizeof...(__idxs)>({__idxs...})));
|
|
|
|
|
return _M_mapping(__idxs...);
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
auto __index = __checked_call(
|
|
|
|
|
static_cast<index_type>(std::move(__indices))...);
|
|
|
|
|
return _M_accessor.access(_M_handle, __index);
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
2025-07-08 10:24:26 +02:00
|
|
|
constexpr reference
|
|
|
|
|
operator[](span<_OIndexType, rank()> __indices) const
|
|
|
|
|
{
|
|
|
|
|
auto __call = [&]<size_t... _Counts>(index_sequence<_Counts...>)
|
|
|
|
|
-> reference
|
|
|
|
|
{ return (*this)[index_type(as_const(__indices[_Counts]))...]; };
|
|
|
|
|
return __call(make_index_sequence<rank()>());
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-16 15:45:44 +02:00
|
|
|
template<typename _OIndexType>
|
|
|
|
|
requires __mdspan::__valid_index_type<const _OIndexType&, index_type>
|
2025-07-08 10:24:26 +02:00
|
|
|
constexpr reference
|
|
|
|
|
operator[](const array<_OIndexType, rank()>& __indices) const
|
|
|
|
|
{ return (*this)[span<const _OIndexType, rank()>(__indices)]; }
|
|
|
|
|
|
|
|
|
|
constexpr size_type
|
|
|
|
|
size() const noexcept
|
|
|
|
|
{
|
|
|
|
|
__glibcxx_assert(cmp_less_equal(_M_mapping.required_span_size(),
|
2025-08-03 22:57:30 +02:00
|
|
|
__gnu_cxx::__int_traits<size_t>
|
|
|
|
|
::__max));
|
2025-07-08 10:24:26 +02:00
|
|
|
return size_type(__mdspan::__size(extents()));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
|
constexpr bool
|
|
|
|
|
empty() const noexcept
|
2025-07-27 14:40:10 +02:00
|
|
|
{ return __mdspan::__empty(extents()); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
friend constexpr void
|
|
|
|
|
swap(mdspan& __x, mdspan& __y) noexcept
|
|
|
|
|
{
|
|
|
|
|
using std::swap;
|
|
|
|
|
swap(__x._M_mapping, __y._M_mapping);
|
|
|
|
|
swap(__x._M_accessor, __y._M_accessor);
|
|
|
|
|
swap(__x._M_handle, __y._M_handle);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr const extents_type&
|
|
|
|
|
extents() const noexcept { return _M_mapping.extents(); }
|
|
|
|
|
|
|
|
|
|
constexpr const data_handle_type&
|
|
|
|
|
data_handle() const noexcept { return _M_handle; }
|
|
|
|
|
|
|
|
|
|
constexpr const mapping_type&
|
|
|
|
|
mapping() const noexcept { return _M_mapping; }
|
|
|
|
|
|
|
|
|
|
constexpr const accessor_type&
|
|
|
|
|
accessor() const noexcept { return _M_accessor; }
|
|
|
|
|
|
2025-07-21 13:07:36 +02:00
|
|
|
// Strengthened noexcept for all `is_*` methods.
|
|
|
|
|
|
2025-07-08 10:24:26 +02:00
|
|
|
static constexpr bool
|
2025-07-21 13:07:36 +02:00
|
|
|
is_always_unique() noexcept(noexcept(mapping_type::is_always_unique()))
|
|
|
|
|
{ return mapping_type::is_always_unique(); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
static constexpr bool
|
2025-07-21 13:07:36 +02:00
|
|
|
is_always_exhaustive()
|
|
|
|
|
noexcept(noexcept(mapping_type::is_always_exhaustive()))
|
|
|
|
|
{ return mapping_type::is_always_exhaustive(); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
static constexpr bool
|
2025-07-21 13:07:36 +02:00
|
|
|
is_always_strided()
|
|
|
|
|
noexcept(noexcept(mapping_type::is_always_strided()))
|
|
|
|
|
{ return mapping_type::is_always_strided(); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
constexpr bool
|
2025-07-21 13:07:36 +02:00
|
|
|
is_unique() const noexcept(noexcept(_M_mapping.is_unique()))
|
|
|
|
|
{ return _M_mapping.is_unique(); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
constexpr bool
|
2025-07-21 13:07:36 +02:00
|
|
|
is_exhaustive() const noexcept(noexcept(_M_mapping.is_exhaustive()))
|
|
|
|
|
{ return _M_mapping.is_exhaustive(); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
constexpr bool
|
2025-07-21 13:07:36 +02:00
|
|
|
is_strided() const noexcept(noexcept(_M_mapping.is_strided()))
|
2025-07-27 14:40:10 +02:00
|
|
|
{ return _M_mapping.is_strided(); }
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
constexpr index_type
|
|
|
|
|
stride(rank_type __r) const { return _M_mapping.stride(__r); }
|
|
|
|
|
|
|
|
|
|
private:
|
2025-07-21 13:07:37 +02:00
|
|
|
[[no_unique_address]] accessor_type _M_accessor = accessor_type();
|
|
|
|
|
[[no_unique_address]] mapping_type _M_mapping = mapping_type();
|
|
|
|
|
[[no_unique_address]] data_handle_type _M_handle = data_handle_type();
|
2025-07-08 10:24:26 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
template<typename _CArray>
|
|
|
|
|
requires is_array_v<_CArray> && (rank_v<_CArray> == 1)
|
|
|
|
|
mdspan(_CArray&)
|
|
|
|
|
-> mdspan<remove_all_extents_t<_CArray>,
|
|
|
|
|
extents<size_t, extent_v<_CArray, 0>>>;
|
|
|
|
|
|
|
|
|
|
template<typename _Pointer>
|
|
|
|
|
requires is_pointer_v<remove_reference_t<_Pointer>>
|
|
|
|
|
mdspan(_Pointer&&)
|
|
|
|
|
-> mdspan<remove_pointer_t<remove_reference_t<_Pointer>>, extents<size_t>>;
|
|
|
|
|
|
|
|
|
|
template<typename _ElementType, typename... _Integrals>
|
|
|
|
|
requires (is_convertible_v<_Integrals, size_t> && ...)
|
|
|
|
|
&& (sizeof...(_Integrals) > 0)
|
|
|
|
|
explicit mdspan(_ElementType*, _Integrals...)
|
|
|
|
|
-> mdspan<_ElementType,
|
libstdc++: Better CTAD for span and mdspan [PR120914].
This implements P3029R1. In P3029R1, the CTAD for span is refined to
permit deducing the extent of the span from an integral constant, e.g.
span((T*) ptr, integral_constant<size_t, 5>{});
is deduced as span<T, 5>. Similarly, in
auto exts = extents(integral_constant<int, 2>);
auto md = mdspan((T*) ptr, integral_constant<int, 2>);
exts and md have types extents<size_t, 2> and mdspan<double,
extents<size_t, 2>>, respectively.
PR libstdc++/120914
libstdc++-v3/ChangeLog:
* include/std/span (span): Update CTAD to enable
integral constants [P3029R1].
* include/std/mdspan (extents): ditto.
(mdspan): ditto.
* testsuite/23_containers/span/deduction.cc: Test deduction
guide.
* testsuite/23_containers/mdspan/extents/misc.cc: ditto.
* testsuite/23_containers/mdspan/mdspan.cc: ditto.
Reviewed-by: Jonathan Wakely <jwakely@redhat.com>
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
2025-07-08 11:49:21 +02:00
|
|
|
extents<size_t, __detail::__maybe_static_ext<_Integrals>...>>;
|
2025-07-08 10:24:26 +02:00
|
|
|
|
|
|
|
|
template<typename _ElementType, typename _OIndexType, size_t _Nm>
|
|
|
|
|
mdspan(_ElementType*, span<_OIndexType, _Nm>)
|
|
|
|
|
-> mdspan<_ElementType, dextents<size_t, _Nm>>;
|
|
|
|
|
|
|
|
|
|
template<typename _ElementType, typename _OIndexType, size_t _Nm>
|
|
|
|
|
mdspan(_ElementType*, const array<_OIndexType, _Nm>&)
|
|
|
|
|
-> mdspan<_ElementType, dextents<size_t, _Nm>>;
|
|
|
|
|
|
|
|
|
|
template<typename _ElementType, typename _IndexType, size_t... _ExtentsPack>
|
|
|
|
|
mdspan(_ElementType*, const extents<_IndexType, _ExtentsPack...>&)
|
|
|
|
|
-> mdspan<_ElementType, extents<_IndexType, _ExtentsPack...>>;
|
|
|
|
|
|
|
|
|
|
template<typename _ElementType, typename _MappingType>
|
|
|
|
|
mdspan(_ElementType*, const _MappingType&)
|
|
|
|
|
-> mdspan<_ElementType, typename _MappingType::extents_type,
|
|
|
|
|
typename _MappingType::layout_type>;
|
|
|
|
|
|
|
|
|
|
template<typename _MappingType, typename _AccessorType>
|
|
|
|
|
mdspan(const typename _AccessorType::data_handle_type&, const _MappingType&,
|
|
|
|
|
const _AccessorType&)
|
|
|
|
|
-> mdspan<typename _AccessorType::element_type,
|
|
|
|
|
typename _MappingType::extents_type,
|
|
|
|
|
typename _MappingType::layout_type, _AccessorType>;
|
|
|
|
|
|
2025-04-29 14:46:08 +02:00
|
|
|
_GLIBCXX_END_NAMESPACE_VERSION
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
#endif
|