avoid-store-forwarding: Reject overlapping stores [PR124476]

The redundant-store tracking introduced in ec5349c37a replaced a
safe bitmap_bit_in_range_p check (which bailed out on any overlap) with
bitmap_all_bits_in_range_p (which only flags fully covered stores as
redundant and lets partial overlaps through).  This broke "last writer
wins" semantics for partially overlapping stores: when an earlier
store's bit-field insertion (BFI) was applied after the base store's
value, it overwrote bytes that should have belonged to the later store.
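
A minimal sketch of the problematic shape (simplified from the PR
testcase; the function name is illustrative): two stores with partially
overlapping byte ranges feed a wider load, and the load must observe
the later store's bytes in the overlap.

    long
    overlap_last_writer_wins (void)
    {
      long x;
      /* Earlier store: bytes 2..7 of x.  */
      __builtin_memset (2 + (char *) &x, 0x55, 6);
      /* Later store: bytes 0..3 of x, overlapping bytes 2..3.  */
      __builtin_memset (&x, 0, 4);
      /* The load of x must see 0x00 in bytes 2..3 (the later store
         wrote last); composing the earlier store's bits on top of
         the later store's value puts 0x55 there instead.  */
      return x;
    }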

Restore the original overlap check from 1d8de1e93e: bail out of the
optimization when any bit in a store's byte range is already claimed
by a later store in program order.  Remove the now-unnecessary
redundant-store tracking (redundant_stores, store_ind_to_remove).
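
The effect of the restored check can be modelled outside of GCC (a
sketch only: the byte array stands in for sbitmap, store_range and
stores_forwardable_p are hypothetical names, and stores are visited
from the last one backwards so that a claimed byte always belongs to a
later store in program order):

    struct store_range { int offset; int size; };

    static int
    stores_forwardable_p (const struct store_range *stores_last_first,
                          int n_stores, int load_size)
    {
      unsigned char forwarded_bytes[64] = { 0 };
      if (load_size > 64)
        return 0;
      for (int s = 0; s < n_stores; s++)
        {
          int off = stores_last_first[s].offset;
          int len = stores_last_first[s].size;
          /* Any byte already claimed belongs to a later store:
             forwarding would break "last writer wins", so bail.  */
          for (int b = off; b < off + len; b++)
            if (forwarded_bytes[b])
              return 0;
          for (int b = off; b < off + len; b++)
            forwarded_bytes[b] = 1;
        }
      return 1;
    }

Under the old bitmap_all_bits_in_range_p check, a store whose range was
only partially claimed was neither skipped as redundant nor rejected,
which is exactly the case the fix turns into a bail-out.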

gcc/ChangeLog:

	PR rtl-optimization/124476
	* avoid-store-forwarding.cc
	(store_forwarding_analyzer::process_store_forwarding): Replace
	bitmap_all_bits_in_range_p with bitmap_any_bit_in_range_p and
	return false on partial overlap.  Remove redundant-store vectors
	and their associated removal, dump, and deletion logic.

gcc/testsuite/ChangeLog:

	PR rtl-optimization/124476
	* gcc.dg/pr124476.c: New test.
Author: Philipp Tomsich
Date:   2026-03-13 11:01:45 +01:00
parent 1f97354f55
commit a48fbdd6a8
2 changed files with 50 additions and 35 deletions

gcc/avoid-store-forwarding.cc

@@ -171,28 +171,23 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
   /* Memory sizes should be constants at this stage.  */
   HOST_WIDE_INT load_size = MEM_SIZE (load_mem).to_constant ();
 
-  /* If the stores cover all the bytes of the load, then we can eliminate
-     the load entirely and use the computed value instead.
-     We can also eliminate stores on addresses that are overwritten
-     by later stores.  */
+  /* If the stores cover all the bytes of the load without overlap then we can
+     eliminate the load entirely and use the computed value instead.
+     Bail out when partially overlapping stores are detected, as the pass
+     cannot correctly handle "last writer wins" semantics for the
+     overlapping byte ranges (see PR124476).  */
 
   auto_sbitmap forwarded_bytes (load_size);
   bitmap_clear (forwarded_bytes);
 
   unsigned int i;
   store_fwd_info* it;
-  auto_vec<store_fwd_info> redundant_stores;
-  auto_vec<int> store_ind_to_remove;
   FOR_EACH_VEC_ELT (stores, i, it)
     {
       HOST_WIDE_INT store_size = MEM_SIZE (it->store_mem).to_constant ();
-      if (bitmap_all_bits_in_range_p (forwarded_bytes, it->offset,
-                                      it->offset + store_size - 1))
-        {
-          redundant_stores.safe_push (*it);
-          store_ind_to_remove.safe_push (i);
-          continue;
-        }
+      if (bitmap_any_bit_in_range_p (forwarded_bytes, it->offset,
+                                     it->offset + store_size - 1))
+        return false;
       bitmap_set_range (forwarded_bytes, it->offset, store_size);
     }
 
@@ -218,15 +213,6 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
         fprintf (dump_file, "(Load elimination candidate)\n");
     }
 
-  /* Remove redundant stores from the vector.  Although this is quadratic,
-     there doesn't seem to be much point optimizing it.  The number of
-     redundant stores is expected to be low and the length of the list is
-     limited by a --param.  The dependence checking that we did earlier is
-     also quadratic in the size of this list.  */
-  store_ind_to_remove.reverse ();
-  for (int i : store_ind_to_remove)
-    stores.ordered_remove (i);
-
   rtx load = single_set (load_insn);
   rtx dest;
 
@@ -413,15 +399,6 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
             }
         }
 
-      if (redundant_stores.length () > 0)
-        {
-          fprintf (dump_file, "\nRedundant stores that have been removed:\n");
-          FOR_EACH_VEC_ELT (redundant_stores, i, it)
-            {
-              fprintf (dump_file, " ");
-              print_rtl_single (dump_file, it->store_insn);
-            }
-        }
     }
 
   stats_sf_avoided++;
@@ -441,10 +418,6 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
           delete_insn (it->store_insn);
         }
 
-      /* Delete redundant stores.  */
-      FOR_EACH_VEC_ELT (redundant_stores, i, it)
-        delete_insn (it->store_insn);
-
       df_insn_rescan (load_insn);
 
      if (load_elim)

gcc/testsuite/gcc.dg/pr124476.c

@@ -0,0 +1,42 @@
+/* PR rtl-optimization/124476 */
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O1 -favoid-store-forwarding" } */
+
+/* Verify that overlapping stores are handled correctly.
+   The avoid-store-forwarding pass must respect "last writer wins"
+   semantics when stores have partially overlapping byte ranges.
+   The expected value is little-endian specific: the memmove writes
+   bytes 0-3 last (zeroing them), leaving bytes 4-7 as 0x05 from the
+   earlier memset, so f = 0x0505050500000000 on little-endian.  */
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+int main () { return 0; }
+#else
+
+int y;
+
+__attribute__((noipa)) void
+foo (int a, long *ret)
+{
+  long f;
+  char c;
+
+  __builtin_memset (2 + (char *) &f, a, 6);
+  char n = *(char *) __builtin_memset (&c, a, 1);
+  __builtin_memmove (&f, &y, 4);
+  long r = f + a + n;
+  *ret = r;
+}
+
+int
+main ()
+{
+  long x;
+  foo (5, &x);
+
+  if (x != 0x050505050000000aL)
+    __builtin_abort ();
+  return 0;
+}
+#endif
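
For reference, the expected value in the test can be recomputed
independently of the pass (a standalone sketch, assuming little-endian
lp64; the zero source variable is a stand-in for y and this is not part
of the committed test):

    #include <assert.h>
    #include <string.h>

    int
    main (void)
    {
      long f = 0;
      int zero = 0;
      memset (2 + (char *) &f, 5, 6);  /* bytes 2..7 of f become 0x05 */
      memmove (&f, &zero, 4);          /* bytes 0..3 become 0x00: last writer wins */
      assert (f == 0x0505050500000000L);
      /* The test computes r = f + a + n with a == n == 5.  */
      assert (f + 5 + 5 == 0x050505050000000aL);
      return 0;
    }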