aarch64: Model zero-high-half semantics of [SU]QXTN instructions

Split the aarch64_<su>qmovn<mode> pattern into separate scalar and
vector variants. Further split the vector RTL pattern into big- and
little-endian variants that model the zero-high-half semantics of the
underlying instruction. Modeling these semantics allows for better
RTL combinations while also removing some register allocation issues,
as the compiler now knows that the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of
this change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Split generator
	for aarch64_<su>qmovn builtins into scalar and vector
	variants.
	* config/aarch64/aarch64-simd.md (aarch64_<su>qmovn<mode>_insn_le):
	Define.
	(aarch64_<su>qmovn<mode>_insn_be): Define.
	(aarch64_<su>qmovn<mode>): Split into scalar and vector
	variants. Change vector variant to an expander that emits the
	correct instruction depending on endianness.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.
This commit is contained in:
Jonathan Wright
2021-06-14 15:09:18 +01:00
parent c86a303968
commit d0889b5d37
3 changed files with 59 additions and 4 deletions

View File

@@ -271,8 +271,10 @@
BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE)
/* Implemented by aarch64_<su>qmovn<mode>. */
BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE)
BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, NONE)
BUILTIN_VQN (UNOP, sqmovn, 0, NONE)
BUILTIN_SD_HSDI (UNOP, sqmovn, 0, NONE)
BUILTIN_VQN (UNOP, uqmovn, 0, NONE)
BUILTIN_SD_HSDI (UNOP, uqmovn, 0, NONE)
/* Implemented by aarch64_<su>qxtn2<mode>. */
BUILTIN_VQN (BINOP, sqxtn2, 0, NONE)

View File

@@ -4875,10 +4875,54 @@
;; Saturating narrow ([SU]QXTN) expressed as a plain SAT_TRUNC of operand 1
;; into a <VNARROWQ> register.
;; NOTE(review): this listing carries two operand-1 lines (VSQN_HSDI and
;; SD_HSDI) and two identical set_attr lines; presumably the SD_HSDI line is
;; the post-change scalar form -- confirm against the applied file.
(define_insn "aarch64_<su>qmovn<mode>"
[(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
(SAT_TRUNC:<VNARROWQ>
(match_operand:VSQN_HSDI 1 "register_operand" "w")))]
(match_operand:SD_HSDI 1 "register_operand" "w")))]
"TARGET_SIMD"
"<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
;; Little-endian vector form of [SU]QXTN (enabled only when
;; !BYTES_BIG_ENDIAN).  The <VNARROWQ2> result is a vec_concat whose first
;; element is the saturating truncation of operand 1 and whose second element
;; is operand 2, which must satisfy aarch64_simd_or_scalar_imm_zero.  Writing
;; the zero half into the RTL is what records that the instruction clears the
;; rest of the destination register.
(define_insn "aarch64_<su>qmovn<mode>_insn_le"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
(SAT_TRUNC:<VNARROWQ>
(match_operand:VQN 1 "register_operand" "w"))
(match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")))]
"TARGET_SIMD && !BYTES_BIG_ENDIAN"
"<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
;; Big-endian vector form of [SU]QXTN (enabled only when BYTES_BIG_ENDIAN).
;; Same operands and same assembly template as the _le pattern, but the
;; vec_concat operands are swapped: the zero operand 2 comes first and the
;; saturating truncation of operand 1 comes second.
(define_insn "aarch64_<su>qmovn<mode>_insn_be"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
(match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")
(SAT_TRUNC:<VNARROWQ>
(match_operand:VQN 1 "register_operand" "w"))))]
"TARGET_SIMD && BYTES_BIG_ENDIAN"
"<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
;; Expander for the vector (VQN) form of [SU]QMOVN.  It allocates a fresh
;; <VNARROWQ2> pseudo, emits either the _insn_be or the _insn_le pattern into
;; it (selected by BYTES_BIG_ENDIAN) with CONST0_RTX (<VNARROWQ>mode) as the
;; zero operand, and then moves the <VNARROWQ> lowpart of that pseudo into
;; operand 0, so callers still receive a narrow result.
(define_expand "aarch64_<su>qmovn<mode>"
[(set (match_operand:<VNARROWQ> 0 "register_operand")
(SAT_TRUNC:<VNARROWQ>
(match_operand:VQN 1 "register_operand")))]
"TARGET_SIMD"
{
rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
if (BYTES_BIG_ENDIAN)
emit_insn (gen_aarch64_<su>qmovn<mode>_insn_be (tmp, operands[1],
CONST0_RTX (<VNARROWQ>mode)));
else
emit_insn (gen_aarch64_<su>qmovn<mode>_insn_le (tmp, operands[1],
CONST0_RTX (<VNARROWQ>mode)));
/* The intrinsic expects a narrow result, so emit a subreg that will get
optimized away as appropriate. */
emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
<VNARROWQ2>mode));
DONE;
}
)
(define_insn "aarch64_<su>qxtn2<mode>_le"

View File

@@ -67,6 +67,13 @@ TEST_UNARY (vqmovun, uint8x16_t, int16x8_t, s16, u8)
TEST_UNARY (vqmovun, uint16x8_t, int32x4_t, s32, u16)
TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
TEST_UNARY (vqmovn, int8x16_t, int16x8_t, s16, s8)
TEST_UNARY (vqmovn, int16x8_t, int32x4_t, s32, s16)
TEST_UNARY (vqmovn, int32x4_t, int64x2_t, s64, s32)
TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
/* { dg-final { scan-assembler-not "dup\\t" } } */
/* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} } */
@@ -79,3 +86,5 @@ TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
/* { dg-final { scan-assembler-times "\\tsqrshrun\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\txtn\\tv" 6} } */
/* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} } */