From 3268942a8b08ed15f7e09ad834cd9e87b56dbf6b Mon Sep 17 00:00:00 2001
From: Richard Sandiford <rdsandiford@googlemail.com>
Date: Mon, 4 May 2026 21:06:53 +0100
Subject: [PATCH] aarch64: Fix SVE vec_perm for VL2048 VNx16QI

SVE's vec_perm pattern is restricted to constant VLs.  There are two
expansions: one for when the selector is known to refer to only the
first vector, and one for the general case.

The first expansion uses a single TBL whereas the fallback uses a
five-instruction sequence that includes a SUB of nunits and two TBLs.

Normally the first expansion is purely an optimisation.  However,
in the specific case of a VL2048 permutation of bytes, the first
form is needed for correctness: nunits is 256, which does not fit
in a byte element, so the SUB of nunits would be truncated to a
SUB of zero.

For example, in:

  svint8_t f(svint8_t x, svint8_t y, svint8_t z) {
    return __builtin_shuffle(x, y, z);
  }

"z" can only select from "x" for VL2048.  The testcase previously
generated:

        tbl     z0.b, {z0.b}, z2.b
        tbl     z1.b, {z1.b}, z2.b
        orr     z0.d, z0.d, z1.d
        ret

where the SUB of zero has been optimised away, so that both TBLs use
the same indices.  This sequence is equivalent to:

    return __builtin_shuffle(x | y, x | y, z);

even though "y" should be entirely ignored.

I used "<= nunits - 1U" rather than "< nunits" to match the existing
check and as a hopefully natural way of making the rhs unsigned.

gcc/
	* config/aarch64/aarch64.cc (aarch64_expand_sve_vec_perm): Check
	whether all indices of a variable selector refer to the first
	values vector.

gcc/testsuite/
	* gcc.target/aarch64/sve/vec_perm_2.c: New test.
	* gcc.target/aarch64/sve/vec_perm_3.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc                 |  5 ++--
 .../gcc.target/aarch64/sve/vec_perm_2.c       | 26 +++++++++++++++++++
 .../gcc.target/aarch64/sve/vec_perm_3.c       | 22 ++++++++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 162947b047b..3816df92b18 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27344,8 +27344,9 @@ aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
   rtx sel_reg = force_reg (sel_mode, sel);
 
   /* Check if the sel only references the first values vector.  */
-  if (CONST_VECTOR_P (sel)
-      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
+  if (GET_MODE_MASK (GET_MODE_INNER (sel_mode)) <= nunits - 1U
+      || (CONST_VECTOR_P (sel)
+	  && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1)))
     {
       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
       return;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c
new file mode 100644
index 00000000000..42fa5f1f97f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_2.c
@@ -0,0 +1,26 @@
+/* { dg-options "-O2 -msve-vector-bits=2048" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** test8:
+**	tbl	z0\.b, {z0\.b}, z2\.b
+**	ret
+*/
+svint8_t
+test8 (svint8_t x, svint8_t y, svint8_t z)
+{
+  return __builtin_shuffle (x, y, z);
+}
+
+svint16_t
+test16 (svint16_t x, svint16_t y, svint16_t z)
+{
+  return __builtin_shuffle (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times {\tand\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tsub\t} 1 } } */
+/* { dg-final { scan-assembler-times {\ttbl\t} 3 } } */
+/* { dg-final { scan-assembler-times {\torr\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c
new file mode 100644
index 00000000000..2fead0890c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_perm_3.c
@@ -0,0 +1,22 @@
+/* { dg-options "-O2 -msve-vector-bits=1024" } */
+
+#include <arm_sve.h>
+
+svint8_t
+test8 (svint8_t x, svint8_t y, svint8_t z)
+{
+  return __builtin_shuffle (x, y, z);
+}
+
+svint16_t
+test16 (svint16_t x, svint16_t y, svint16_t z)
+{
+  return __builtin_shuffle (x, y, z);
+}
+
+/* test8 does not need an AND.  In principle, its subtraction of 128
+   from the selector can be rendered as a SUB, an ADD, or an EOR.  */
+/* { dg-final { scan-assembler-times {\tand\t} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:sub|add|eor)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\ttbl\t} 4 } } */
+/* { dg-final { scan-assembler-times {\torr\t} 2 } } */