From 3268942a8b08ed15f7e09ad834cd9e87b56dbf6b Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 4 May 2026 21:06:53 +0100 Subject: [PATCH] aarch64: Fix SVE vec_perm for VL2048 VNx16QI SVE's vec_perm pattern is restricted to constant VLs. There are two expansions: one for when the selector is known to refer to only the first vector, and one for the general case. The first expansion uses a single TBL whereas the fallback uses a five-instruction sequence that includes a SUB of nunits and two TBLs. Normally the first expansion is purely an optimisation. However, in the specific case of a VL2048 permutation of bytes, the first form is needed for correctness, since the SUB of nunits (256) would be truncated to a SUB of zero. For example, in: svint8_t f(svint8_t x, svint8_t y, svint8_t z) { return __builtin_shuffle(x, y, z); } "z" can only select from "x" for VL2048. The testcase previously generated: tbl z0.b, {z0.b}, z2.b tbl z1.b, {z1.b}, z2.b orr z0.d, z0.d, z1.d ret where the SUB is optimised away. This sequence is equivalent to: return __builtin_shuffle(x | y, x | y, z); even though "y" should be entirely ignored. I used "<= nunits - 1U" rather than "< nunits" to match the existing check and as a hopefully natural way of making the rhs unsigned. gcc/ * config/aarch64/aarch64.cc (aarch64_expand_sve_vec_perm): Check whether all indices of a variable selector refer to the first values vector. gcc/testsuite/ * gcc.target/aarch64/sve/vec_perm_2.c: New test. * gcc.target/aarch64/sve/vec_perm_3.c: Likewise. 
--- gcc/config/aarch64/aarch64.cc | 5 ++-- .../gcc.target/aarch64/sve/vec_perm_2.c | 26 +++++++++++++++++++ .../gcc.target/aarch64/sve/vec_perm_3.c | 22 ++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 162947b047b..3816df92b18 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -27344,8 +27344,9 @@ aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) rtx sel_reg = force_reg (sel_mode, sel); /* Check if the sel only references the first values vector. */ - if (CONST_VECTOR_P (sel) - && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1)) + if (GET_MODE_MASK (GET_MODE_INNER (sel_mode)) <= nunits - 1U + || (CONST_VECTOR_P (sel) + && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))) { emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg); return; diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c new file mode 100644 index 00000000000..42fa5f1f97f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c @@ -0,0 +1,26 @@ +/* { dg-options "-O2 -msve-vector-bits=2048" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include <arm_sve.h> + +/* +** test8: +** tbl z0\.b, {z0\.b}, z2\.b +** ret +*/ +svint8_t +test8 (svint8_t x, svint8_t y, svint8_t z) +{ + return __builtin_shuffle (x, y, z); +} + +svint16_t +test16 (svint16_t x, svint16_t y, svint16_t z) +{ + return __builtin_shuffle (x, y, z); +} + +/* { dg-final { scan-assembler-times {\tand\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tsub\t} 1 } } */ +/* { dg-final { scan-assembler-times {\ttbl\t} 3 } } */ +/* { dg-final { scan-assembler-times {\torr\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c new file mode 100644 index 00000000000..2fead0890c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c @@ -0,0 +1,22 @@ +/* { dg-options "-O2 -msve-vector-bits=1024" } */ + +#include <arm_sve.h> + +svint8_t +test8 (svint8_t x, svint8_t y, svint8_t z) +{ + return __builtin_shuffle (x, y, z); +} + +svint16_t +test16 (svint16_t x, svint16_t y, svint16_t z) +{ + return __builtin_shuffle (x, y, z); +} + +/* test8 does not need an AND. In principle, its subtraction of 128 + from the selector can be rendered as a SUB, an ADD, or an EOR. */ +/* { dg-final { scan-assembler-times {\tand\t} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:sub|add|eor)\t} 2 } } */ +/* { dg-final { scan-assembler-times {\ttbl\t} 4 } } */ +/* { dg-final { scan-assembler-times {\torr\t} 2 } } */