x86_64: Start TImode STV chains from zero-extension or *concatditi.

Currently x86_64's TImode STV pass has the restriction that candidate
chains must start with a TImode load from memory.  This patch improves
the functionality of STV to allow zero-extensions and construction of
TImode pseudos from two DImode values (i.e. *concatditi) to both be
considered candidate chain initiators.  For example, this allows chains
starting from an __int128 function argument to be processed by STV.

Compiled with -O2 on x86_64:

__int128 m0,m1,m2,m3;
void foo(__int128 m)
{
    m0 = m;
    m1 = m;
    m2 = m;
    m3 = m;
}

Previously generated:

foo:    xchgq   %rdi, %rsi
        movq    %rsi, m0(%rip)
        movq    %rdi, m0+8(%rip)
        movq    %rsi, m1(%rip)
        movq    %rdi, m1+8(%rip)
        movq    %rsi, m2(%rip)
        movq    %rdi, m2+8(%rip)
        movq    %rsi, m3(%rip)
        movq    %rdi, m3+8(%rip)
        ret

With the patch, we now generate:

foo:	movq    %rdi, %xmm0
        movq    %rsi, %xmm1
        punpcklqdq      %xmm1, %xmm0
        movaps  %xmm0, m0(%rip)
        movaps  %xmm0, m1(%rip)
        movaps  %xmm0, m2(%rip)
        movaps  %xmm0, m3(%rip)
        ret

or with -mavx2:

foo:	vmovq   %rdi, %xmm1
        vpinsrq $1, %rsi, %xmm1, %xmm0
        vmovdqa %xmm0, m0(%rip)
        vmovdqa %xmm0, m1(%rip)
        vmovdqa %xmm0, m2(%rip)
        vmovdqa %xmm0, m3(%rip)
        ret

Likewise, for zero-extension:

__int128 m0,m1,m2,m3;
void bar(unsigned long x)
{
    __int128 m = x;
    m0 = m;
    m1 = m;
    m2 = m;
    m3 = m;
}

Previously with -O2:

bar:    movq    %rdi, m0(%rip)
        movq    $0, m0+8(%rip)
        movq    %rdi, m1(%rip)
        movq    $0, m1+8(%rip)
        movq    %rdi, m2(%rip)
        movq    $0, m2+8(%rip)
        movq    %rdi, m3(%rip)
        movq    $0, m3+8(%rip)
        ret

with this patch:

bar:	movq    %rdi, %xmm0
        movaps  %xmm0, m0(%rip)
        movaps  %xmm0, m1(%rip)
        movaps  %xmm0, m2(%rip)
        movaps  %xmm0, m3(%rip)
        ret

As shown in the examples above, the scalar-to-vector (STV) conversion of
*concatditi has an overhead [treating two DImode registers as a TImode
value is free on x86_64], but specifying this penalty allows the STV
pass to make an informed decision about whether the total cost/gain of
the chain is a net win.

2025-10-21  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/i386-features.cc (timode_concatdi_p): New
	function to recognize the various variants of *concatditi3_[1-7].
	(scalar_chain::add_insn): Like VEC_SELECT, ZERO_EXTEND and
	timode_concatdi_p instructions don't require their input
	operands to be converted (to TImode).
	(timode_scalar_chain::compute_convert_gain): Split/clone XOR and
	IOR cases from AND case, to handle timode_concatdi_p costs.
	<case PLUS>: Handle timode_concatdi_p conversion costs.
	<case ZERO_EXTEND>: Provide costs of DImode to TImode extension.
	(timode_convert_concatdi): Helper function to transform
	a *concatditi3 instruction into a vec_concatv2di instruction.
	(timode_scalar_chain::convert_insn): Split/clone XOR and IOR
	cases from AND case, to handle timode_concatdi_p using the new
	timode_convert_concatdi helper function.
	<case ZERO_EXTEND>: Convert zero_extendditi2 to *vec_concatv2di_0.
	<case PLUS>: Handle timode_concatdi_p using the new
	timode_convert_concatdi helper function.
	(timode_scalar_to_vector_candidate_p): Support timode_concatdi_p
	instructions in IOR, XOR and PLUS cases.
	<case ZERO_EXTEND>: Consider zero extension of a register from
	DImode to TImode to be a candidate.

gcc/testsuite/ChangeLog
	* gcc.target/i386/sse4_1-stv-10.c: New test case.
	* gcc.target/i386/sse4_1-stv-11.c: Likewise.
	* gcc.target/i386/sse4_1-stv-12.c: Likewise.
This commit is contained in:
Roger Sayle
2025-10-21 13:14:58 +01:00
parent 794ec42354
commit f4afefbbbe
4 changed files with 180 additions and 6 deletions

View File

@@ -449,6 +449,30 @@ scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
return true;
}
/* Return true if X is one of the *concatditi_? variants that can be
   converted.  The caller guarantees that X is any_or_plus:TI, i.e. one
   of PLUS:TI, IOR:TI or XOR:TI.  */

static bool
timode_concatdi_p (rtx x)
{
  rtx shifted = XEXP (x, 0);
  rtx plain = XEXP (x, 1);

  /* Accept the operands in either order: when the second operand is the
     shift, exchange the two before matching.  */
  if (GET_CODE (plain) == ASHIFT)
    std::swap (shifted, plain);

  /* One operand must be (ashift (zero_extend (reg:DI)) (const_int 64)).  */
  if (GET_CODE (shifted) != ASHIFT)
    return false;

  rtx hi_ext = XEXP (shifted, 0);
  if (GET_CODE (hi_ext) != ZERO_EXTEND)
    return false;

  rtx hi_reg = XEXP (hi_ext, 0);
  if (GET_MODE (hi_reg) != DImode || !REG_P (hi_reg))
    return false;

  rtx count = XEXP (shifted, 1);
  if (!CONST_INT_P (count) || INTVAL (count) != 64)
    return false;

  /* The other operand must be (zero_extend (reg:DI)).  */
  if (GET_CODE (plain) != ZERO_EXTEND)
    return false;

  rtx lo_reg = XEXP (plain, 0);
  return GET_MODE (lo_reg) == DImode && REG_P (lo_reg);
}
/* Add instruction into a chain. Return true if OK, false if the search
was aborted. */
@@ -477,9 +501,26 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
if (!analyze_register_chain (candidates, ref, disallowed))
return false;
/* The operand(s) of VEC_SELECT don't need to be converted/convertible. */
if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
return true;
/* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
to be converted/convertible. */
if (def_set)
switch (GET_CODE (SET_SRC (def_set)))
{
case VEC_SELECT:
return true;
case ZERO_EXTEND:
if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
return true;
break;
case PLUS:
case IOR:
case XOR:
if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
return true;
break;
default:
break;
}
for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
if (!DF_REF_REG_MEM_P (ref))
@@ -1628,14 +1669,34 @@ timode_scalar_chain::compute_convert_gain ()
break;
case AND:
case XOR:
case IOR:
if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
igain += timode_immed_const_gain (XEXP (src, 1), bb);
break;
case XOR:
case IOR:
if (timode_concatdi_p (src))
{
/* vmovq;vpinsrq (11 bytes). */
igain = speed_p ? -2 * ix86_cost->sse_to_integer
: -COSTS_N_BYTES (11);
break;
}
if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
igain += timode_immed_const_gain (XEXP (src, 1), bb);
break;
case PLUS:
if (timode_concatdi_p (src))
/* vmovq;vpinsrq (11 bytes). */
igain = speed_p ? -2 * ix86_cost->sse_to_integer
: -COSTS_N_BYTES (11);
break;
case ASHIFT:
case LSHIFTRT:
/* See ix86_expand_v1ti_shift. */
@@ -1794,6 +1855,13 @@ timode_scalar_chain::compute_convert_gain ()
igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
break;
case ZERO_EXTEND:
if (GET_MODE (XEXP (src, 0)) == DImode)
/* xor (2 bytes) vs. vmovq (5 bytes). */
igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
: -COSTS_N_BYTES (3);
break;
default:
break;
}
@@ -1858,6 +1926,28 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
}
}
/* SRC is a *concatditi3 pattern.  Emit the equivalent vec_concatv2di
   instruction immediately before INSN, writing a fresh V2DImode pseudo,
   and return that pseudo wrapped in a V1TImode subreg.  */

static rtx
timode_convert_concatdi (rtx src, rtx_insn *insn)
{
  rtx vec = gen_reg_rtx (V2DImode);

  /* The shifted operand supplies the high half and may appear as either
     operand of SRC; the remaining operand supplies the low half.  */
  const int shift_idx = GET_CODE (XEXP (src, 0)) == ASHIFT ? 0 : 1;
  rtx shifted = XEXP (src, shift_idx);
  rtx plain = XEXP (src, 1 - shift_idx);

  /* Strip the shift and the extension(s) to reach the DImode values.  */
  rtx hi = XEXP (XEXP (shifted, 0), 0);
  rtx lo = XEXP (plain, 0);

  emit_insn_before (gen_vec_concatv2di (vec, lo, hi), insn);
  return gen_rtx_SUBREG (V1TImode, vec, 0);
}
/* Convert INSN from TImode to V1TImode. */
void
@@ -1967,10 +2057,24 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
PUT_MODE (src, V1TImode);
break;
}
/* FALLTHRU */
convert_op (&XEXP (src, 0), insn);
convert_op (&XEXP (src, 1), insn);
PUT_MODE (src, V1TImode);
if (MEM_P (dst))
{
tmp = gen_reg_rtx (V1TImode);
emit_insn_before (gen_rtx_SET (tmp, src), insn);
src = tmp;
}
break;
case XOR:
case IOR:
if (timode_concatdi_p (src))
{
src = timode_convert_concatdi (src, insn);
break;
}
convert_op (&XEXP (src, 0), insn);
convert_op (&XEXP (src, 1), insn);
PUT_MODE (src, V1TImode);
@@ -2010,6 +2114,26 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
PUT_MODE (src, V1TImode);
break;
case ZERO_EXTEND:
if (GET_MODE (XEXP (src, 0)) == DImode)
{
/* Convert to *vec_concatv2di_0. */
rtx tmp = gen_reg_rtx (V2DImode);
rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
emit_insn_before (gen_move_insn (tmp, pat), insn);
src = gen_rtx_SUBREG (vmode, tmp, 0);
}
else
gcc_unreachable ();
break;
case PLUS:
if (timode_concatdi_p (src))
src = timode_convert_concatdi (src, insn);
else
gcc_unreachable ();
break;
default:
gcc_unreachable ();
}
@@ -2389,6 +2513,8 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
case IOR:
case XOR:
if (timode_concatdi_p (src))
return true;
return (REG_P (XEXP (src, 0))
|| timode_mem_p (XEXP (src, 0)))
&& (REG_P (XEXP (src, 1))
@@ -2408,6 +2534,13 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
&& CONST_INT_P (XEXP (src, 1))
&& (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
case PLUS:
return timode_concatdi_p (src);
case ZERO_EXTEND:
return REG_P (XEXP (src, 0))
&& GET_MODE (XEXP (src, 0)) == DImode;
default:
return false;
}

View File

@@ -0,0 +1,13 @@
/* { dg-do compile { target int128 } } */
/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
__int128 m0,m1,m2,m3;
void foo(__int128 m)
{
m0 = m;
m1 = m;
m2 = m;
m3 = m;
}
/* { dg-final { scan-assembler-times "movaps" 4 } } */

View File

@@ -0,0 +1,14 @@
/* { dg-do compile { target int128 } } */
/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
__int128 m0,m1,m2,m3;
void foo(unsigned long x)
{
__int128 m = x;
m0 = m;
m1 = m;
m2 = m;
m3 = m;
}
/* { dg-final { scan-assembler-times "movaps" 4 } } */

View File

@@ -0,0 +1,14 @@
/* { dg-do compile { target int128 } } */
/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
__int128 m0,m1,m2,m3;
void foo(unsigned int x)
{
__int128 m = x;
m0 = m;
m1 = m;
m2 = m;
m3 = m;
}
/* { dg-final { scan-assembler-times "movaps" 4 } } */