mirror of
https://github.com/gcc-mirror/gcc.git
synced 2026-05-06 23:25:24 +02:00
x86_64: Start TImode STV chains from zero-extension or *concatditi.
Currently x86_64's TImode STV pass has the restriction that candidate
chains must start with a TImode load from memory. This patch improves
the functionality of STV to allow zero-extensions and construction of
TImode pseudos from two DImode values (i.e. *concatditi) to both be
considered candidate chain initiators. For example, this allows chains
starting from an __int128 function argument to be processed by STV.
Compiled with -O2 on x86_64:
__int128 m0,m1,m2,m3;
void foo(__int128 m)
{
m0 = m;
m1 = m;
m2 = m;
m3 = m;
}
Previously generated:
foo: xchgq %rdi, %rsi
movq %rsi, m0(%rip)
movq %rdi, m0+8(%rip)
movq %rsi, m1(%rip)
movq %rdi, m1+8(%rip)
movq %rsi, m2(%rip)
movq %rdi, m2+8(%rip)
movq %rsi, m3(%rip)
movq %rdi, m3+8(%rip)
ret
With the patch, we now generate:
foo: movq %rdi, %xmm0
movq %rsi, %xmm1
punpcklqdq %xmm1, %xmm0
movaps %xmm0, m0(%rip)
movaps %xmm0, m1(%rip)
movaps %xmm0, m2(%rip)
movaps %xmm0, m3(%rip)
ret
or with -mavx2:
foo: vmovq %rdi, %xmm1
vpinsrq $1, %rsi, %xmm1, %xmm0
vmovdqa %xmm0, m0(%rip)
vmovdqa %xmm0, m1(%rip)
vmovdqa %xmm0, m2(%rip)
vmovdqa %xmm0, m3(%rip)
ret
Likewise, for zero-extension:
__int128 m0,m1,m2,m3;
void bar(unsigned long x)
{
__int128 m = x;
m0 = m;
m1 = m;
m2 = m;
m3 = m;
}
Previously with -O2:
bar: movq %rdi, m0(%rip)
movq $0, m0+8(%rip)
movq %rdi, m1(%rip)
movq $0, m1+8(%rip)
movq %rdi, m2(%rip)
movq $0, m2+8(%rip)
movq %rdi, m3(%rip)
movq $0, m3+8(%rip)
ret
with this patch:
bar: movq %rdi, %xmm0
movaps %xmm0, m0(%rip)
movaps %xmm0, m1(%rip)
movaps %xmm0, m2(%rip)
movaps %xmm0, m3(%rip)
ret
As shown in the examples above, the scalar-to-vector (STV) conversion of
*concatditi has an overhead [treating two DImode registers as a TImode
value is free on x86_64], but specifying this penalty allows the STV
pass to make an informed decision if the total cost/gain of the chain
is a net win.
2025-10-21 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (timode_concatdi_p): New
function to recognize the various variants of *concatditi3_[1-7].
(scalar_chain::add_insn): Like VEC_SELECT, ZERO_EXTEND and
timode_concatdi_p instructions don't require their input
operands to be converted (to TImode).
(timode_scalar_chain::compute_convert_gain): Split/clone XOR and
IOR cases from AND case, to handle timode_concatdi_p costs.
<case PLUS>: Handle timode_concatdi_p conversion costs.
<case ZERO_EXTEND>: Provide costs of DImode to TImode extension.
(timode_convert_concatdi): Helper function to transform
a *concatditi3 instruction into a vec_concatv2di instruction.
(timode_scalar_chain::convert_insn): Split/clone XOR and IOR
cases from AND case, to handle timode_concatdi_p using the new
timode_convert_concatdi helper function.
<case ZERO_EXTEND>: Convert zero_extendditi2 to *vec_concatv2di_0.
<case PLUS>: Handle timode_concatdi_p using the new
timode_convert_concatdi helper function.
(timode_scalar_to_vector_candidate_p): Support timode_concatdi_p
instructions in IOR, XOR and PLUS cases.
<case ZERO_EXTEND>: Consider zero extension of a register from
DImode to TImode to be a candidate.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-10.c: New test case.
* gcc.target/i386/sse4_1-stv-11.c: Likewise.
* gcc.target/i386/sse4_1-stv-12.c: Likewise.
This commit is contained in:
@@ -449,6 +449,30 @@ scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Check whether X is a convertible *concatditi_? variant. X is known
|
||||
to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. */
|
||||
|
||||
static bool
|
||||
timode_concatdi_p (rtx x)
|
||||
{
|
||||
rtx op0 = XEXP (x, 0);
|
||||
rtx op1 = XEXP (x, 1);
|
||||
|
||||
if (GET_CODE (op1) == ASHIFT)
|
||||
std::swap (op0, op1);
|
||||
|
||||
return GET_CODE (op0) == ASHIFT
|
||||
&& GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
|
||||
&& GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
|
||||
&& REG_P (XEXP (XEXP (op0, 0), 0))
|
||||
&& CONST_INT_P (XEXP (op0, 1))
|
||||
&& INTVAL (XEXP (op0, 1)) == 64
|
||||
&& GET_CODE (op1) == ZERO_EXTEND
|
||||
&& GET_MODE (XEXP (op1, 0)) == DImode
|
||||
&& REG_P (XEXP (op1, 0));
|
||||
}
|
||||
|
||||
|
||||
/* Add instruction into a chain. Return true if OK, false if the search
|
||||
was aborted. */
|
||||
|
||||
@@ -477,9 +501,26 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
|
||||
if (!analyze_register_chain (candidates, ref, disallowed))
|
||||
return false;
|
||||
|
||||
/* The operand(s) of VEC_SELECT don't need to be converted/convertible. */
|
||||
if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
|
||||
return true;
|
||||
/* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
|
||||
to be converted/convertible. */
|
||||
if (def_set)
|
||||
switch (GET_CODE (SET_SRC (def_set)))
|
||||
{
|
||||
case VEC_SELECT:
|
||||
return true;
|
||||
case ZERO_EXTEND:
|
||||
if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
|
||||
return true;
|
||||
break;
|
||||
case PLUS:
|
||||
case IOR:
|
||||
case XOR:
|
||||
if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
|
||||
return true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
|
||||
if (!DF_REF_REG_MEM_P (ref))
|
||||
@@ -1628,14 +1669,34 @@ timode_scalar_chain::compute_convert_gain ()
|
||||
break;
|
||||
|
||||
case AND:
|
||||
case XOR:
|
||||
case IOR:
|
||||
if (!MEM_P (dst))
|
||||
igain = COSTS_N_INSNS (1);
|
||||
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
|
||||
igain += timode_immed_const_gain (XEXP (src, 1), bb);
|
||||
break;
|
||||
|
||||
case XOR:
|
||||
case IOR:
|
||||
if (timode_concatdi_p (src))
|
||||
{
|
||||
/* vmovq;vpinsrq (11 bytes). */
|
||||
igain = speed_p ? -2 * ix86_cost->sse_to_integer
|
||||
: -COSTS_N_BYTES (11);
|
||||
break;
|
||||
}
|
||||
if (!MEM_P (dst))
|
||||
igain = COSTS_N_INSNS (1);
|
||||
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
|
||||
igain += timode_immed_const_gain (XEXP (src, 1), bb);
|
||||
break;
|
||||
|
||||
case PLUS:
|
||||
if (timode_concatdi_p (src))
|
||||
/* vmovq;vpinsrq (11 bytes). */
|
||||
igain = speed_p ? -2 * ix86_cost->sse_to_integer
|
||||
: -COSTS_N_BYTES (11);
|
||||
break;
|
||||
|
||||
case ASHIFT:
|
||||
case LSHIFTRT:
|
||||
/* See ix86_expand_v1ti_shift. */
|
||||
@@ -1794,6 +1855,13 @@ timode_scalar_chain::compute_convert_gain ()
|
||||
igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
|
||||
break;
|
||||
|
||||
case ZERO_EXTEND:
|
||||
if (GET_MODE (XEXP (src, 0)) == DImode)
|
||||
/* xor (2 bytes) vs. vmovq (5 bytes). */
|
||||
igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
|
||||
: -COSTS_N_BYTES (3);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -1858,6 +1926,28 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
|
||||
Insert this before INSN, and return the result as a V1TImode subreg. */
|
||||
|
||||
static rtx
|
||||
timode_convert_concatdi (rtx src, rtx_insn *insn)
|
||||
{
|
||||
rtx hi, lo;
|
||||
rtx tmp = gen_reg_rtx (V2DImode);
|
||||
if (GET_CODE (XEXP (src, 0)) == ASHIFT)
|
||||
{
|
||||
hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
|
||||
lo = XEXP (XEXP (src, 1), 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
|
||||
lo = XEXP (XEXP (src, 0), 0);
|
||||
}
|
||||
emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
|
||||
return gen_rtx_SUBREG (V1TImode, tmp, 0);
|
||||
}
|
||||
|
||||
/* Convert INSN from TImode to V1TImode. */
|
||||
|
||||
void
|
||||
@@ -1967,10 +2057,24 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
|
||||
PUT_MODE (src, V1TImode);
|
||||
break;
|
||||
}
|
||||
/* FALLTHRU */
|
||||
convert_op (&XEXP (src, 0), insn);
|
||||
convert_op (&XEXP (src, 1), insn);
|
||||
PUT_MODE (src, V1TImode);
|
||||
if (MEM_P (dst))
|
||||
{
|
||||
tmp = gen_reg_rtx (V1TImode);
|
||||
emit_insn_before (gen_rtx_SET (tmp, src), insn);
|
||||
src = tmp;
|
||||
}
|
||||
break;
|
||||
|
||||
case XOR:
|
||||
case IOR:
|
||||
if (timode_concatdi_p (src))
|
||||
{
|
||||
src = timode_convert_concatdi (src, insn);
|
||||
break;
|
||||
}
|
||||
convert_op (&XEXP (src, 0), insn);
|
||||
convert_op (&XEXP (src, 1), insn);
|
||||
PUT_MODE (src, V1TImode);
|
||||
@@ -2010,6 +2114,26 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
|
||||
PUT_MODE (src, V1TImode);
|
||||
break;
|
||||
|
||||
case ZERO_EXTEND:
|
||||
if (GET_MODE (XEXP (src, 0)) == DImode)
|
||||
{
|
||||
/* Convert to *vec_concatv2di_0. */
|
||||
rtx tmp = gen_reg_rtx (V2DImode);
|
||||
rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
|
||||
emit_insn_before (gen_move_insn (tmp, pat), insn);
|
||||
src = gen_rtx_SUBREG (vmode, tmp, 0);
|
||||
}
|
||||
else
|
||||
gcc_unreachable ();
|
||||
break;
|
||||
|
||||
case PLUS:
|
||||
if (timode_concatdi_p (src))
|
||||
src = timode_convert_concatdi (src, insn);
|
||||
else
|
||||
gcc_unreachable ();
|
||||
break;
|
||||
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
@@ -2389,6 +2513,8 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
|
||||
|
||||
case IOR:
|
||||
case XOR:
|
||||
if (timode_concatdi_p (src))
|
||||
return true;
|
||||
return (REG_P (XEXP (src, 0))
|
||||
|| timode_mem_p (XEXP (src, 0)))
|
||||
&& (REG_P (XEXP (src, 1))
|
||||
@@ -2408,6 +2534,13 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
|
||||
&& CONST_INT_P (XEXP (src, 1))
|
||||
&& (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
|
||||
|
||||
case PLUS:
|
||||
return timode_concatdi_p (src);
|
||||
|
||||
case ZERO_EXTEND:
|
||||
return REG_P (XEXP (src, 0))
|
||||
&& GET_MODE (XEXP (src, 0)) == DImode;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
13
gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c
Normal file
13
gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c
Normal file
@@ -0,0 +1,13 @@
|
||||
/* { dg-do compile { target int128 } } */
|
||||
/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
|
||||
|
||||
__int128 m0,m1,m2,m3;
|
||||
/* Copy one __int128 argument into each of the four globals m0..m3; the
   dg-final directive after this function expects exactly four "movaps"
   matches in the generated assembly.  */
void foo(__int128 m)
|
||||
{
|
||||
m0 = m;
|
||||
m1 = m;
|
||||
m2 = m;
|
||||
m3 = m;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "movaps" 4 } } */
|
||||
14
gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c
Normal file
14
gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c
Normal file
@@ -0,0 +1,14 @@
|
||||
/* { dg-do compile { target int128 } } */
|
||||
/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
|
||||
|
||||
__int128 m0,m1,m2,m3;
|
||||
/* Widen an unsigned long argument to __int128 (a zero-extension) and copy
   the result into each of the four globals m0..m3; the dg-final directive
   after this function expects exactly four "movaps" matches.  */
void foo(unsigned long x)
|
||||
{
|
||||
__int128 m = x;
|
||||
m0 = m;
|
||||
m1 = m;
|
||||
m2 = m;
|
||||
m3 = m;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "movaps" 4 } } */
|
||||
14
gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c
Normal file
14
gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c
Normal file
@@ -0,0 +1,14 @@
|
||||
/* { dg-do compile { target int128 } } */
|
||||
/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
|
||||
|
||||
__int128 m0,m1,m2,m3;
|
||||
/* Widen an unsigned int argument to __int128 (a zero-extension) and copy
   the result into each of the four globals m0..m3; the dg-final directive
   after this function expects exactly four "movaps" matches.  */
void foo(unsigned int x)
|
||||
{
|
||||
__int128 m = x;
|
||||
m0 = m;
|
||||
m1 = m;
|
||||
m2 = m;
|
||||
m3 = m;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "movaps" 4 } } */
|
||||
Reference in New Issue
Block a user