libcpp: replace SSE4.2 helper with an SSSE3 one

Since the characters we are searching for (CR, LF, '\', '?') all have
distinct ASCII codes mod 16, PSHUFB can help match them all at once.

Directly use the new helper if __SSSE3__ is defined. It makes the other
helpers unused, so mark them inline to prevent warnings.

Rewrite and simplify init_vectorized_lexer.

libcpp/ChangeLog:

	* config.in: Regenerate.
	* configure: Regenerate.
	* configure.ac: Check for SSSE3 instead of SSE4.2.
	* files.cc (read_file_guts): Bump padding to 64 if HAVE_SSSE3.
	* lex.cc (search_line_acc_char): Mark inline, not "unused".
	(search_line_sse2): Mark inline.
	(search_line_sse42): Replace with...
	(search_line_ssse3): ... this new function.  Adjust the use...
	(init_vectorized_lexer): ... here.  Simplify.
This commit is contained in:
Alexander Monakov
2024-08-06 09:47:23 +03:00
parent b8ea13ebf1
commit 20a5b48249
5 changed files with 72 additions and 109 deletions

View File

@@ -210,8 +210,8 @@
/* Define to 1 if you have the `putc_unlocked' function. */
#undef HAVE_PUTC_UNLOCKED
/* Define to 1 if you can assemble SSE4 insns. */
#undef HAVE_SSE4
/* Define to 1 if you can assemble SSSE3 insns. */
#undef HAVE_SSSE3
/* Define to 1 if you have the <stddef.h> header file. */
#undef HAVE_STDDEF_H

libcpp/configure (vendored, 4 changes)
View File

@@ -9140,14 +9140,14 @@ case $target in
int
main ()
{
asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))
asm ("pshufb %xmm0, %xmm1")
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_SSE4 1" >>confdefs.h
$as_echo "#define HAVE_SSSE3 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext

View File

@@ -197,9 +197,9 @@ fi
case $target in
i?86-* | x86_64-*)
AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))],
[AC_DEFINE([HAVE_SSE4], [1],
[Define to 1 if you can assemble SSE4 insns.])])
AC_TRY_COMPILE([], [asm ("pshufb %xmm0, %xmm1")],
[AC_DEFINE([HAVE_SSSE3], [1],
[Define to 1 if you can assemble SSSE3 insns.])])
esac
# Enable --enable-host-shared.

View File

@@ -693,7 +693,7 @@ static bool
read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
const char *input_charset)
{
ssize_t size, total, count;
ssize_t size, pad, total, count;
uchar *buf;
bool regular;
@@ -732,11 +732,14 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
the majority of C source files. */
size = 8 * 1024;
/* The + 16 here is space for the final '\n' and 15 bytes of padding,
used to quiet warnings from valgrind or Address Sanitizer, when the
optimized lexer accesses aligned 16-byte memory chunks, including
the bytes after the malloced area, and stops lexing on '\n'. */
buf = XNEWVEC (uchar, size + 16);
#ifdef HAVE_SSSE3
pad = 64;
#else
pad = 16;
#endif
/* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding,
allowing search_line_fast to use (possibly misaligned) vector loads. */
buf = XNEWVEC (uchar, size + pad);
total = 0;
while ((count = read (file->fd, buf + total, size - total)) > 0)
{
@@ -747,7 +750,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
if (regular)
break;
size *= 2;
buf = XRESIZEVEC (uchar, buf, size + 16);
buf = XRESIZEVEC (uchar, buf, size + pad);
}
}
@@ -765,7 +768,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
file->buffer = _cpp_convert_input (pfile,
input_charset,
buf, size + 16, total,
buf, size + pad, total,
&file->buffer_start,
&file->st.st_size);
file->buffer_valid = file->buffer;

View File

@@ -225,10 +225,7 @@ acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
and branches without increasing the number of arithmetic operations.
It's almost certainly going to be a win with 64-bit word size. */
static const uchar * search_line_acc_char (const uchar *, const uchar *)
ATTRIBUTE_UNUSED;
static const uchar *
static inline const uchar *
search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
const word_type repl_nl = acc_char_replicate ('\n');
@@ -293,7 +290,7 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = {
/* A version of the fast scanner using SSE2 vectorized byte compare insns. */
static const uchar *
static inline const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
@@ -344,120 +341,83 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
return (const uchar *)p + found;
}
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns. */
#ifdef HAVE_SSSE3
/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */
static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
static inline const uchar *
#ifndef __SSSE3__
__attribute__((__target__("ssse3")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
typedef char v16qi __attribute__ ((__vector_size__ (16)));
static const v16qi search = { '\n', '\r', '?', '\\' };
typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));
/* Helper vector for pshufb-based matching:
each character C we're searching for is at position (C % 16). */
v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63);
uintptr_t si = (uintptr_t)s;
uintptr_t index;
/* Check for unaligned input. */
if (si & 15)
v16qi d1, d2, t1, t2;
/* Unaligned loads. Reading beyond the final newline is safe,
since files.cc:read_file_guts pads the allocation. */
d1 = *(const v16qi_u *)s;
d2 = *(const v16qi_u *)(s + 16);
unsigned m1, m2, found;
/* Process two 16-byte chunks per iteration. */
do
{
v16qi sv;
if (__builtin_expect (end - s < 16, 0)
&& __builtin_expect ((si & 0xfff) > 0xff0, 0))
{
/* There are less than 16 bytes left in the buffer, and less
than 16 bytes left on the page. Reading 16 bytes at this
point might generate a spurious page fault. Defer to the
SSE2 implementation, which already handles alignment. */
return search_line_sse2 (s, end);
}
/* ??? The builtin doesn't understand that the PCMPESTRI read from
memory need not be aligned. */
sv = __builtin_ia32_loaddqu ((const char *) s);
index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
if (__builtin_expect (index < 16, 0))
goto found;
/* Advance the pointer to an aligned address. We will re-scan a
few bytes, but we no longer need care for reading past the
end of a page, since we're guaranteed a match. */
s = (const uchar *)((si + 15) & -16);
t1 = __builtin_ia32_pshufb128 (lut, d1);
t2 = __builtin_ia32_pshufb128 (lut, d2);
m1 = __builtin_ia32_pmovmskb128 (t1 == d1);
m2 = __builtin_ia32_pmovmskb128 (t2 == d2);
s += 32;
d1 = *(const v16qi_u *)s;
d2 = *(const v16qi_u *)(s + 16);
found = m1 + (m2 << 16);
}
/* Main loop, processing 16 bytes at a time. */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
while (1)
{
char f;
/* By using inline assembly instead of the builtin,
we can use the result, as well as the flags set. */
__asm ("%vpcmpestri\t$0, %2, %3"
: "=c"(index), "=@ccc"(f)
: "m"(*s), "x"(search), "a"(4), "d"(16));
if (f)
break;
s += 16;
}
#else
s -= 16;
/* By doing the whole loop in inline assembly,
we can make proper use of the flags set. */
__asm ( ".balign 16\n"
"0: add $16, %1\n"
" %vpcmpestri\t$0, (%1), %2\n"
" jnc 0b"
: "=&c"(index), "+r"(s)
: "x"(search), "a"(4), "d"(16));
#endif
found:
return s + index;
while (!found);
/* Prefer to compute 's - 32' here, not spend an extra instruction
to make a copy of the previous value of 's' in the loop. */
__asm__ ("" : "+r"(s));
return s - 32 + __builtin_ctz (found);
}
#else
/* Work around out-dated assemblers without sse4 support. */
#define search_line_sse42 search_line_sse2
/* Work around out-dated assemblers without SSSE3 support. */
#define search_line_ssse3 search_line_sse2
#endif
#ifdef __SSSE3__
/* No need for CPU probing, just use the best available variant. */
#define search_line_fast search_line_ssse3
#else
/* Check the CPU capabilities. */
#include "../gcc/config/i386/cpuid.h"
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
static search_line_fast_type search_line_fast
#if defined(__SSE2__)
= search_line_sse2;
#else
= search_line_acc_char;
#endif
#define HAVE_init_vectorized_lexer 1
static inline void
init_vectorized_lexer (void)
{
unsigned dummy, ecx = 0, edx = 0;
search_line_fast_type impl = search_line_acc_char;
int minimum = 0;
unsigned ax, bx, cx, dx;
#if defined(__SSE4_2__)
minimum = 3;
#elif defined(__SSE2__)
minimum = 2;
#endif
if (!__get_cpuid (1, &ax, &bx, &cx, &dx))
return;
if (minimum == 3)
impl = search_line_sse42;
else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
{
if (minimum == 3 || (ecx & bit_SSE4_2))
impl = search_line_sse42;
else if (minimum == 2 || (edx & bit_SSE2))
impl = search_line_sse2;
}
search_line_fast = impl;
if (cx & bit_SSSE3)
search_line_fast = search_line_ssse3;
else if (dx & bit_SSE2)
search_line_fast = search_line_sse2;
}
#endif
#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)