mirror of
https://github.com/gcc-mirror/gcc.git
synced 2026-05-06 14:59:39 +02:00
aarch64: Fix SVE vec_perm for VL2048 VNx16QI
SVE's vec_perm pattern is restricted to constant VLs. There are two
expansions: one for when the selector is known to refer to only the
first vector, and one for the general case.
The first expansion uses a single TBL whereas the fallback uses a
five-instruction sequence that includes a SUB of nunits and two TBLs.
Normally the first expansion is purely an optimisation. However,
in the specific case of a VL2048 permutation of bytes, the first
form is needed for correctness, since the SUB of nunits (256)
would be truncated to a SUB of zero.
For example, in:
svint8_t f(svint8_t x, svint8_t y, svint8_t z) {
return __builtin_shuffle(x, y, z);
}
"z" can only select from "x" for VL2048. The testcase previously
generated:
tbl z0.b, {z0.b}, z2.b
tbl z1.b, {z1.b}, z2.b
orr z0.d, z0.d, z1.d
ret
where the SUB is optimised away. This sequence is equivalent to:
return __builtin_shuffle(x | y, x | y, z);
even though "y" should be entirely ignored.
I used "<= nunits - 1U" rather than "< nunits" to match the existing
check and as a hopefully natural way of making the rhs unsigned.
gcc/
* config/aarch64/aarch64.cc (aarch64_expand_sve_vec_perm): Check
whether all indices of a variable selector refer to the first
values vector.
gcc/testsuite/
* gcc.target/aarch64/sve/vec_perm_2.c: New test.
* gcc.target/aarch64/sve/vec_perm_3.c: Likewise.
This commit is contained in:
@@ -27344,8 +27344,9 @@ aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
|
||||
rtx sel_reg = force_reg (sel_mode, sel);
|
||||
|
||||
/* Check if the sel only references the first values vector. */
|
||||
- if (CONST_VECTOR_P (sel)
- && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
+ if (GET_MODE_MASK (GET_MODE_INNER (sel_mode)) <= nunits - 1U
+ || (CONST_VECTOR_P (sel)
+ && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1)))
|
||||
{
|
||||
emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
|
||||
return;
|
||||
|
||||
26
gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c
Normal file
26
gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c
Normal file
@@ -0,0 +1,26 @@
|
||||
/* { dg-options "-O2 -msve-vector-bits=2048" } */
|
||||
/* { dg-final { check-function-bodies "**" "" } } */
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
/*
|
||||
** test8:
|
||||
** tbl z0\.b, {z0\.b}, z2\.b
|
||||
** ret
|
||||
*/
|
||||
/* Shuffle of 8-bit elements using a variable (non-constant) selector Z.
   This file is compiled with -msve-vector-bits=2048, and the expected
   body listed for test8 is a single TBL on z0 followed by RET, i.e. the
   second input Y must not contribute to the result.  */
svint8_t
|
||||
test8 (svint8_t x, svint8_t y, svint8_t z)
|
||||
{
|
||||
return __builtin_shuffle (x, y, z);
|
||||
}
|
||||
|
||||
/* Shuffle of 16-bit elements using a variable selector Z.  No expected
   body is pinned for this function; its output is covered only by the
   file-wide AND/SUB/TBL/ORR instruction counts that follow.  */
svint16_t
|
||||
test16 (svint16_t x, svint16_t y, svint16_t z)
|
||||
{
|
||||
return __builtin_shuffle (x, y, z);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tand\t} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tsub\t} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\ttbl\t} 3 } } */
|
||||
/* { dg-final { scan-assembler-times {\torr\t} 1 } } */
|
||||
22
gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c
Normal file
22
gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c
Normal file
@@ -0,0 +1,22 @@
|
||||
/* { dg-options "-O2 -msve-vector-bits=1024" } */
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
/* Shuffle of 8-bit elements using a variable selector Z, compiled with
   -msve-vector-bits=1024 per this file's dg-options.  No expected body
   is pinned; only the file-wide instruction counts at the end of the
   file are checked.  */
svint8_t
|
||||
test8 (svint8_t x, svint8_t y, svint8_t z)
|
||||
{
|
||||
return __builtin_shuffle (x, y, z);
|
||||
}
|
||||
|
||||
/* Shuffle of 16-bit elements using a variable selector Z, compiled with
   -msve-vector-bits=1024 per this file's dg-options.  Checked only via
   the file-wide instruction counts at the end of the file.  */
svint16_t
|
||||
test16 (svint16_t x, svint16_t y, svint16_t z)
|
||||
{
|
||||
return __builtin_shuffle (x, y, z);
|
||||
}
|
||||
|
||||
/* test8 does not need an AND. In principle, its subtraction of 128
|
||||
from the selector can be rendered as a SUB, an ADD, or an EOR. */
|
||||
/* { dg-final { scan-assembler-times {\tand\t} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\t(?:sub|add|eor)\t} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {\ttbl\t} 4 } } */
|
||||
/* { dg-final { scan-assembler-times {\torr\t} 2 } } */
|
||||
Reference in New Issue
Block a user