diff --git a/libcpp/config.in b/libcpp/config.in index 253ef03a3de..b2e2f4e842c 100644 --- a/libcpp/config.in +++ b/libcpp/config.in @@ -210,8 +210,8 @@ /* Define to 1 if you have the `putc_unlocked' function. */ #undef HAVE_PUTC_UNLOCKED -/* Define to 1 if you can assemble SSE4 insns. */ -#undef HAVE_SSE4 +/* Define to 1 if you can assemble SSSE3 insns. */ +#undef HAVE_SSSE3 /* Define to 1 if you have the header file. */ #undef HAVE_STDDEF_H diff --git a/libcpp/configure b/libcpp/configure index 32d6aaa3069..1391081ba09 100755 --- a/libcpp/configure +++ b/libcpp/configure @@ -9140,14 +9140,14 @@ case $target in int main () { -asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0)) +asm ("pshufb %xmm0, %xmm1") ; return 0; } _ACEOF if ac_fn_c_try_compile "$LINENO"; then : -$as_echo "#define HAVE_SSE4 1" >>confdefs.h +$as_echo "#define HAVE_SSSE3 1" >>confdefs.h fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext diff --git a/libcpp/configure.ac b/libcpp/configure.ac index b883fec776f..981f97c4abd 100644 --- a/libcpp/configure.ac +++ b/libcpp/configure.ac @@ -197,9 +197,9 @@ fi case $target in i?86-* | x86_64-*) - AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))], - [AC_DEFINE([HAVE_SSE4], [1], - [Define to 1 if you can assemble SSE4 insns.])]) + AC_TRY_COMPILE([], [asm ("pshufb %xmm0, %xmm1")], + [AC_DEFINE([HAVE_SSSE3], [1], + [Define to 1 if you can assemble SSSE3 insns.])]) esac # Enable --enable-host-shared. diff --git a/libcpp/files.cc b/libcpp/files.cc index 78f56e30bde..3775091d259 100644 --- a/libcpp/files.cc +++ b/libcpp/files.cc @@ -693,7 +693,7 @@ static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, const char *input_charset) { - ssize_t size, total, count; + ssize_t size, pad, total, count; uchar *buf; bool regular; @@ -732,11 +732,14 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, the majority of C source files. */ size = 8 * 1024; - /* The + 16 here is space for the final '\n' and 15 bytes of padding, - used to quiet warnings from valgrind or Address Sanitizer, when the - optimized lexer accesses aligned 16-byte memory chunks, including - the bytes after the malloced, area, and stops lexing on '\n'. */ - buf = XNEWVEC (uchar, size + 16); +#ifdef HAVE_SSSE3 + pad = 64; +#else + pad = 16; +#endif + /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding, + allowing search_line_fast to use (possibly misaligned) vector loads. */ + buf = XNEWVEC (uchar, size + pad); total = 0; while ((count = read (file->fd, buf + total, size - total)) > 0) { @@ -747,7 +750,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, if (regular) break; size *= 2; - buf = XRESIZEVEC (uchar, buf, size + 16); + buf = XRESIZEVEC (uchar, buf, size + pad); } } @@ -765,7 +768,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, file->buffer = _cpp_convert_input (pfile, input_charset, - buf, size + 16, total, + buf, size + pad, total, &file->buffer_start, &file->st.st_size); file->buffer_valid = file->buffer; diff --git a/libcpp/lex.cc b/libcpp/lex.cc index 1591dcdf151..daf2c770bc3 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -225,10 +225,7 @@ acc_char_index (word_type cmp ATTRIBUTE_UNUSED, and branches without increasing the number of arithmetic operations. It's almost certainly going to be a win with 64-bit word size. */ -static const uchar * search_line_acc_char (const uchar *, const uchar *) - ATTRIBUTE_UNUSED; - -static const uchar * +static inline const uchar * search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { const word_type repl_nl = acc_char_replicate ('\n'); @@ -293,7 +290,7 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = { /* A version of the fast scanner using SSE2 vectorized byte compare insns. */ -static const uchar * +static inline const uchar * #ifndef __SSE2__ __attribute__((__target__("sse2"))) #endif @@ -344,120 +341,83 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) return (const uchar *)p + found; } -#ifdef HAVE_SSE4 -/* A version of the fast scanner using SSE 4.2 vectorized string insns. */ +#ifdef HAVE_SSSE3 +/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */ -static const uchar * -#ifndef __SSE4_2__ -__attribute__((__target__("sse4.2"))) +static inline const uchar * +#ifndef __SSSE3__ +__attribute__((__target__("ssse3"))) #endif -search_line_sse42 (const uchar *s, const uchar *end) +search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { typedef char v16qi __attribute__ ((__vector_size__ (16))); - static const v16qi search = { '\n', '\r', '?', '\\' }; + typedef v16qi v16qi_u __attribute__ ((__aligned__ (1))); + /* Helper vector for pshufb-based matching: + each character C we're searching for is at position (C % 16). */ + v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }; + static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63); - uintptr_t si = (uintptr_t)s; - uintptr_t index; - - /* Check for unaligned input. */ - if (si & 15) + v16qi d1, d2, t1, t2; + /* Unaligned loads. Reading beyond the final newline is safe, + since files.cc:read_file_guts pads the allocation. */ + d1 = *(const v16qi_u *)s; + d2 = *(const v16qi_u *)(s + 16); + unsigned m1, m2, found; + /* Process two 16-byte chunks per iteration. */ + do { - v16qi sv; - - if (__builtin_expect (end - s < 16, 0) - && __builtin_expect ((si & 0xfff) > 0xff0, 0)) - { - /* There are less than 16 bytes left in the buffer, and less - than 16 bytes left on the page. Reading 16 bytes at this - point might generate a spurious page fault. Defer to the - SSE2 implementation, which already handles alignment. */ - return search_line_sse2 (s, end); - } - - /* ??? The builtin doesn't understand that the PCMPESTRI read from - memory need not be aligned. */ - sv = __builtin_ia32_loaddqu ((const char *) s); - index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0); - - if (__builtin_expect (index < 16, 0)) - goto found; - - /* Advance the pointer to an aligned address. We will re-scan a - few bytes, but we no longer need care for reading past the - end of a page, since we're guaranteed a match. */ - s = (const uchar *)((si + 15) & -16); + t1 = __builtin_ia32_pshufb128 (lut, d1); + t2 = __builtin_ia32_pshufb128 (lut, d2); + m1 = __builtin_ia32_pmovmskb128 (t1 == d1); + m2 = __builtin_ia32_pmovmskb128 (t2 == d2); + s += 32; + d1 = *(const v16qi_u *)s; + d2 = *(const v16qi_u *)(s + 16); + found = m1 + (m2 << 16); } - - /* Main loop, processing 16 bytes at a time. */ -#ifdef __GCC_ASM_FLAG_OUTPUTS__ - while (1) - { - char f; - - /* By using inline assembly instead of the builtin, - we can use the result, as well as the flags set. */ - __asm ("%vpcmpestri\t$0, %2, %3" - : "=c"(index), "=@ccc"(f) - : "m"(*s), "x"(search), "a"(4), "d"(16)); - if (f) - break; - - s += 16; - } -#else - s -= 16; - /* By doing the whole loop in inline assembly, - we can make proper use of the flags set. */ - __asm ( ".balign 16\n" - "0: add $16, %1\n" - " %vpcmpestri\t$0, (%1), %2\n" - " jnc 0b" - : "=&c"(index), "+r"(s) - : "x"(search), "a"(4), "d"(16)); -#endif - - found: - return s + index; + while (!found); + /* Prefer to compute 's - 32' here, not spend an extra instruction + to make a copy of the previous value of 's' in the loop. */ + __asm__ ("" : "+r"(s)); + return s - 32 + __builtin_ctz (found); } #else -/* Work around out-dated assemblers without sse4 support. */ -#define search_line_sse42 search_line_sse2 +/* Work around out-dated assemblers without SSSE3 support. */ +#define search_line_ssse3 search_line_sse2 #endif +#ifdef __SSSE3__ +/* No need for CPU probing, just use the best available variant. */ +#define search_line_fast search_line_ssse3 +#else /* Check the CPU capabilities. */ #include "../gcc/config/i386/cpuid.h" typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *); -static search_line_fast_type search_line_fast; +static search_line_fast_type search_line_fast +#if defined(__SSE2__) + = search_line_sse2; +#else + = search_line_acc_char; +#endif #define HAVE_init_vectorized_lexer 1 static inline void init_vectorized_lexer (void) { - unsigned dummy, ecx = 0, edx = 0; - search_line_fast_type impl = search_line_acc_char; - int minimum = 0; + unsigned ax, bx, cx, dx; -#if defined(__SSE4_2__) - minimum = 3; -#elif defined(__SSE2__) - minimum = 2; -#endif + if (!__get_cpuid (1, &ax, &bx, &cx, &dx)) + return; - if (minimum == 3) - impl = search_line_sse42; - else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) - { - if (minimum == 3 || (ecx & bit_SSE4_2)) - impl = search_line_sse42; - else if (minimum == 2 || (edx & bit_SSE2)) - impl = search_line_sse2; - } - - search_line_fast = impl; + if (cx & bit_SSSE3) + search_line_fast = search_line_ssse3; + else if (dx & bit_SSE2) + search_line_fast = search_line_sse2; } +#endif #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)