asf: Fix calling of emit_move_insn on registers of different modes [PR119884]

This patch uses `lowpart_subreg` for the base register initialization,
instead of zero-extending it. We had tried this solution before, but it
left undefined bytes in the upper part of the register. That should not
happen, as we are supposed to write the whole register when the load is
eliminated. It was happening because, when multiple stores had the same
offset as the load, a register move was generated for each of them, with
each move overwriting the bit inserts emitted before it.
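
For illustration, the base register is now set up roughly as follows (a
simplified sketch rather than the exact committed hunk; `dest` and
`it->mov_reg` are the names used in process_store_forwarding):

    /* Take the low part of the narrow register in the destination's
       mode instead of zero-extending it.  lowpart_subreg can return
       NULL_RTX when the subreg cannot be formed, so the move is
       guarded.  */
    rtx base_reg = lowpart_subreg (GET_MODE (dest), it->mov_reg,
                                   GET_MODE (it->mov_reg));
    if (base_reg)
      emit_move_insn (dest, base_reg);

With this, emit_move_insn is no longer called on registers of different
modes.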

In order to overcome this, we remove redundant stores from the
sequence, i.e. stores whose bytes are completely overwritten by stores
that come after them in the sequence. We reuse the bitmap that is used
for the load elimination check to keep track of the bytes written by
each store.
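
As a hypothetical example (not the committed testcase), the first store
below is fully overwritten by the second one, so it is dropped from the
sequence before the transformation:

    int x;

    int
    foo (void)
    {
      x = 1;      /* Redundant: every byte is overwritten below.  */
      x = 2;
      return x;   /* Load is eliminated; the value 2 is forwarded.  */
    }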

Also, we now allow the load to be eliminated even when the stores
overlap, as there is no obvious reason not to; we only need the stores
to cover all of the load's bytes.
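
For instance (again a hypothetical snippet), the two stores below
overlap in bytes 2-3 but together cover all four bytes of the load, so
the load can now be eliminated, whereas previously any overlap between
the stores disabled the elimination:

    union U { int i; short h[2]; } u;

    int
    foo (void)
    {
      u.i = 0;      /* Writes bytes 0-3.  */
      u.h[1] = 2;   /* Writes bytes 2-3, overlapping the store above.  */
      return u.i;   /* All bytes covered, so the load can go away.  */
    }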

Bootstrapped/regtested on AArch64 and x86_64.

	PR rtl-optimization/119884

gcc/ChangeLog:

	* avoid-store-forwarding.cc (process_store_forwarding):
	Use `lowpart_subreg` for the base register initialization
	and remove redundant stores from the store/load sequence.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr119884.c: New test.
Author:    Konstantinos Eleftheriou
Date:      2025-05-19 13:00:05 +02:00
Committer: Philipp Tomsich
Parent:    764ea9ea3e
Commit:    ec5349c37a

2 changed files with 53 additions and 11 deletions

--- a/gcc/avoid-store-forwarding.cc
+++ b/gcc/avoid-store-forwarding.cc

@@ -176,20 +176,28 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
   /* Memory sizes should be constants at this stage.  */
   HOST_WIDE_INT load_size = MEM_SIZE (load_mem).to_constant ();
 
-  /* If the stores cover all the bytes of the load without overlap then we can
-     eliminate the load entirely and use the computed value instead.  */
+  /* If the stores cover all the bytes of the load, then we can eliminate
+     the load entirely and use the computed value instead.
+     We can also eliminate stores on addresses that are overwritten
+     by later stores.  */
 
   sbitmap forwarded_bytes = sbitmap_alloc (load_size);
   bitmap_clear (forwarded_bytes);
 
   unsigned int i;
   store_fwd_info* it;
+  auto_vec<store_fwd_info> redundant_stores;
+  auto_vec<int> store_ind_to_remove;
   FOR_EACH_VEC_ELT (stores, i, it)
     {
       HOST_WIDE_INT store_size = MEM_SIZE (it->store_mem).to_constant ();
-      if (bitmap_bit_in_range_p (forwarded_bytes, it->offset,
-                                 it->offset + store_size - 1))
-        break;
+      if (bitmap_all_bits_in_range_p (forwarded_bytes, it->offset,
+                                      it->offset + store_size - 1))
+        {
+          redundant_stores.safe_push (*it);
+          store_ind_to_remove.safe_push (i);
+          continue;
+        }
 
       bitmap_set_range (forwarded_bytes, it->offset, store_size);
     }
@@ -215,6 +223,15 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
         fprintf (dump_file, "(Load elimination candidate)\n");
     }
 
+  /* Remove redundant stores from the vector.  Although this is quadratic,
+     there doesn't seem to be much point optimizing it.  The number of
+     redundant stores is expected to be low and the length of the list is
+     limited by a --param.  The dependence checking that we did earlier is
+     also quadratic in the size of this list.  */
+  store_ind_to_remove.reverse ();
+  for (int i : store_ind_to_remove)
+    stores.ordered_remove (i);
+
   rtx load = single_set (load_insn);
   rtx dest;
@@ -231,18 +248,16 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
     {
       it->mov_reg = gen_reg_rtx (GET_MODE (it->store_mem));
       rtx_insn *insns = NULL;
+      const bool has_zero_offset = it->offset == 0;
 
       /* If we're eliminating the load then find the store with zero offset
          and use it as the base register to avoid a bit insert if possible.  */
-      if (load_elim && it->offset == 0)
+      if (load_elim && has_zero_offset)
         {
           start_sequence ();
 
-          machine_mode dest_mode = GET_MODE (dest);
-          rtx base_reg = it->mov_reg;
-          if (known_gt (GET_MODE_BITSIZE (dest_mode),
-                        GET_MODE_BITSIZE (GET_MODE (it->mov_reg))))
-            base_reg = gen_rtx_ZERO_EXTEND (dest_mode, it->mov_reg);
+          rtx base_reg = lowpart_subreg (GET_MODE (dest), it->mov_reg,
+                                         GET_MODE (it->mov_reg));
 
           if (base_reg)
             {
@@ -380,6 +395,16 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
               print_rtl_single (dump_file, insn);
             }
         }
+
+      if (redundant_stores.length () > 0)
+        {
+          fprintf (dump_file, "\nRedundant stores that have been removed:\n");
+          FOR_EACH_VEC_ELT (redundant_stores, i, it)
+            {
+              fprintf (dump_file, " ");
+              print_rtl_single (dump_file, it->store_insn);
+            }
+        }
     }
 
   stats_sf_avoided++;
@@ -399,6 +424,10 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
         delete_insn (it->store_insn);
     }
 
+  /* Delete redundant stores.  */
+  FOR_EACH_VEC_ELT (redundant_stores, i, it)
+    delete_insn (it->store_insn);
+
   df_insn_rescan (load_insn);
 
   if (load_elim)

--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr119884.c

@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-dse -favoid-store-forwarding" } */
+
+typedef __attribute__((__vector_size__(64))) char V;
+char c;
+V v;
+
+char
+foo()
+{
+  v *= c;
+  return v[0];
+}