From 070e48f66edebdde0606e97c5022bb8d4b8aa57b Mon Sep 17 00:00:00 2001 From: Brick <6098371+0x1F9F1@users.noreply.github.com> Date: Wed, 3 Jul 2024 15:36:50 +0100 Subject: [PATCH] Tidy up/optimize BlitNtoNPixelAlpha_AVX2 --- src/video/SDL_blit.h | 10 ++- src/video/SDL_blit_A.c | 2 +- src/video/SDL_blit_A_avx2.c | 171 ++++++++++-------------------------- 3 files changed, 53 insertions(+), 130 deletions(-) diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h index b994f9e9bd..387db2f2ca 100644 --- a/src/video/SDL_blit.h +++ b/src/video/SDL_blit.h @@ -501,6 +501,7 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface); dst = a << 24 | r << 16 | g << 8 | b; \ } while (0) /* Blend a single color channel or alpha value */ +/* dC = ((sC * sA) + (dC * (255 - sA))) / 255 */ #define ALPHA_BLEND_CHANNEL(sC, dC, sA) \ do { \ Uint16 x; \ @@ -510,6 +511,7 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface); dC = x >> 8; \ } while (0) /* Perform a division by 255 after a multiplication of two 8-bit color channels */ +/* out = (sC * dC) / 255 */ #define MULT_DIV_255(sC, dC, out) \ do { \ Uint16 x = sC * dC; \ @@ -524,11 +526,11 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface); ALPHA_BLEND_CHANNEL(sG, dG, A); \ ALPHA_BLEND_CHANNEL(sB, dB, A); \ } while (0) -/* Blend the ARGB values of two 32-bit pixels */ -#define ALPHA_BLEND_ARGB_PIXELS(src, dst) \ +/* Blend two 32-bit pixels with the same format */ +#define ALPHA_BLEND_RGBA_4(src, dst, ashift) \ do { \ - Uint32 srcA = src >> 24; \ - src |= 0xFF000000; \ + Uint32 srcA = (src >> ashift) & 0xFF; \ + src |= ((Uint32)0xFF) << ashift; \ \ Uint32 srcRB = src & 0x00FF00FF; \ Uint32 dstRB = dst & 0x00FF00FF; \ diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 3b97440ac1..4dd67e341b 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -1171,7 +1171,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) { PIXEL_TO_ARGB_PIXEL(*(Uint32 *) src, srcfmt, Pixel); Uint32 blended = *(Uint32 *) dst; - ALPHA_BLEND_ARGB_PIXELS(Pixel, blended); + ALPHA_BLEND_RGBA_4(Pixel, blended); *(Uint32*)dst = blended; src += srcbpp; dst += dstbpp; diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c index 04b5851c2d..d6828e2bee 100644 --- a/src/video/SDL_blit_A_avx2.c +++ b/src/video/SDL_blit_A_avx2.c @@ -7,63 +7,14 @@ #define SDL_blit_A_avx2_c #include "SDL_blit.h" -#include "SDL_blit_A_sse4_1.h" - -__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaSplatMask_AVX2(const SDL_PixelFormat* dstfmt) { - Uint8 index = dstfmt->Ashift / 8; - return _mm256_set_epi8( - index + 28, index + 28, index + 28, index + 28, index + 24, index + 24, index + 24, index + 24, - index + 20, index + 20, index + 20, index + 20, index + 16, index + 16, index + 16, index + 16, - index + 12, index + 12, index + 12, index + 12, index + 8, index + 8, index + 8, index + 8, - index + 4, index + 4, index + 4, index + 4, index, index, index, index); -} - -__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaSaturateMask_AVX2(const SDL_PixelFormat* dstfmt) { - const Uint8 bin = dstfmt->Ashift / 8; - return _mm256_set_epi8( - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0, - bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0); -} - -__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatShuffleMask_AVX2(const SDL_PixelFormat* srcfmt, - const SDL_PixelFormat* dstfmt) { - /* Calculate shuffle indices based on the source and destination SDL_PixelFormat */ - Uint8 shuffleIndices[32]; - Uint8 dstAshift = dstfmt->Ashift / 8; - Uint8 dstRshift = dstfmt->Rshift / 8; - Uint8 dstGshift = dstfmt->Gshift / 8; - Uint8 dstBshift = dstfmt->Bshift / 8; - for (int i = 0; i < 8; ++i) { - shuffleIndices[dstAshift + i * 4] = srcfmt->Ashift / 8 + i * 4; - shuffleIndices[dstRshift + i * 4] = srcfmt->Rshift / 8 + i * 4; - shuffleIndices[dstGshift + i * 4] = srcfmt->Gshift / 8 + i * 4; - shuffleIndices[dstBshift + i * 4] = srcfmt->Bshift / 8 + i * 4; - } - - /* Create shuffle mask based on the calculated indices */ - return _mm256_set_epi8( - shuffleIndices[31], shuffleIndices[30], shuffleIndices[29], shuffleIndices[28], - shuffleIndices[27], shuffleIndices[26], shuffleIndices[25], shuffleIndices[24], - shuffleIndices[23], shuffleIndices[22], shuffleIndices[21], shuffleIndices[20], - shuffleIndices[19], shuffleIndices[18], shuffleIndices[17], shuffleIndices[16], - shuffleIndices[15], shuffleIndices[14], shuffleIndices[13], shuffleIndices[12], - shuffleIndices[11], shuffleIndices[10], shuffleIndices[9], shuffleIndices[8], - shuffleIndices[7], shuffleIndices[6], shuffleIndices[5], shuffleIndices[4], - shuffleIndices[3], shuffleIndices[2], shuffleIndices[1], shuffleIndices[0] - ); -} /** * Using the AVX2 instruction set, blit sixteen pixels into eight with alpha blending */ -__m256i SDL_TARGETING("avx2") MixRGBA_AVX2(__m256i src, __m256i dst, const __m256i alpha_shuffle, - const __m256i alpha_saturate) { +SDL_FORCE_INLINE __m256i SDL_TARGETING("avx2") MixRGBA_AVX2( + __m256i src, __m256i dst, + const __m256i alpha_shuffle, const __m256i alpha_saturate) +{ // SIMD implementation of blend_mul2. // dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA)) // dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA)) @@ -112,87 +63,57 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) SDL_PixelFormat *srcfmt = info->src_fmt; SDL_PixelFormat *dstfmt = info->dst_fmt; - int chunks = width / 8; - SDL_bool free_format = SDL_FALSE; - /* Handle case when passed invalid format, assume ARGB destination */ - if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) { - dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888); - free_format = SDL_TRUE; - } - const __m256i shift_mask = GetSDL_PixelFormatShuffleMask_AVX2(srcfmt, dstfmt); - const __m256i splat_mask = GetSDL_PixelFormatAlphaSplatMask_AVX2(dstfmt); - const __m256i saturate_mask = GetSDL_PixelFormatAlphaSaturateMask_AVX2(dstfmt); - const __m128i sse4_1_shift_mask = GetSDL_PixelFormatShuffleMask_SSE4_1(srcfmt, dstfmt); - const __m128i sse4_1_splat_mask = GetSDL_PixelFormatAlphaSplatMask_SSE4_1(dstfmt); - const __m128i sse4_1_saturate_mask = GetSDL_PixelFormatAlphaSaturateMask_SSE4_1(dstfmt); + const __m256i mask_offsets = _mm256_set_epi8( + 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + + const __m256i shift_mask = _mm256_add_epi32( + _mm256_set1_epi32( + ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | + ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | + ((srcfmt->Bshift >> 3) << dstfmt->Bshift) | + ((srcfmt->Ashift >> 3) << dstfmt->Ashift)), + mask_offsets); + + const __m256i splat_mask = _mm256_add_epi8(_mm256_set1_epi8(dstfmt->Ashift >> 3), mask_offsets); + const __m256i saturate_mask = _mm256_set1_epi32((int)dstfmt->Amask); while (height--) { - /* Process 8-wide chunks of source color data that may be in wrong format */ - for (int i = 0; i < chunks; i += 1) { - __m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *) (src + i * 32)), shift_mask); - /* Alpha-blend in 8-wide chunk from src into destination */ - __m256i c_dst = _mm256_loadu_si256((__m256i*) (dst + i * 32)); - __m256i c_mix = MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask); - _mm256_storeu_si256((__m256i*) (dst + i * 32), c_mix); + int i = 0; + + for (; i + 8 <= width; i += 8) { + // Load 8 src pixels and shuffle into the dst format + __m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *)src), shift_mask); + + // Load 8 dst pixels + __m256i c_dst = _mm256_loadu_si256((__m256i *)dst); + + // Blend the pixels together and save the result + _mm256_storeu_si256((__m256i *)dst, MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask)); + + src += 32; + dst += 32; } - /* Handle remaining pixels when width is not a multiple of 4 */ - if (width % 8 != 0) { - int remaining_pixels = width % 8; - int offset = width - remaining_pixels; - if (remaining_pixels >= 4) { - Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); - Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); - __m128i c_src = _mm_loadu_si128((__m128i*)src_ptr); - c_src = _mm_shuffle_epi8(c_src, sse4_1_shift_mask); - __m128i c_dst = _mm_loadu_si128((__m128i*)dst_ptr); - __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1_splat_mask, sse4_1_saturate_mask); - _mm_storeu_si128((__m128i*)dst_ptr, c_mix); - remaining_pixels -= 4; - offset += 4; - } - if (remaining_pixels >= 2) { - Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); - Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); - __m128i c_src = _mm_loadu_si64(src_ptr); - c_src = _mm_shuffle_epi8(c_src, sse4_1_shift_mask); - __m128i c_dst = _mm_loadu_si64(dst_ptr); - __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1_splat_mask, sse4_1_saturate_mask); - _mm_storeu_si64(dst_ptr, c_mix); - remaining_pixels -= 2; - offset += 2; - } - if (remaining_pixels == 1) { - Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); - Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); - Uint32 pixel = AlignPixelToSDL_PixelFormat(*src_ptr, srcfmt, dstfmt); - /* Old GCC has bad or no _mm_loadu_si32 */ - #if defined(__GNUC__) && (__GNUC__ < 11) - __m128i c_src = _mm_set_epi32(0, 0, 0, pixel); - __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr); - #else - __m128i c_src = _mm_loadu_si32(&pixel); - __m128i c_dst = _mm_loadu_si32(dst_ptr); - #endif - __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1_splat_mask, sse4_1_saturate_mask); - /* Old GCC has bad or no _mm_storeu_si32 */ - #if defined(__GNUC__) && (__GNUC__ < 11) - *dst_ptr = _mm_extract_epi32(mixed_pixel, 0); - #else - _mm_storeu_si32(dst_ptr, mixed_pixel); - #endif - } - } + for (; i < width; ++i) { + Uint32 src32 = *(Uint32 *)src; + Uint32 dst32 = *(Uint32 *)dst; - src += 4 * width; - dst += 4 * width; + src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | + (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | + (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | + (((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift); + + ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift); + + *(Uint32 *)dst = dst32; + + src += 4; + dst += 4; + } src += srcskip; dst += dstskip; } - if (free_format) { - SDL_DestroyPixelFormat(dstfmt); - } } #endif