From 87958f44d090bc719df849882d241bc217de5822 Mon Sep 17 00:00:00 2001 From: Isaac Aronson Date: Thu, 7 Sep 2023 21:14:00 -0500 Subject: [PATCH] Support arbitrary destination formats; remove buffer from AVX2 --- src/video/SDL_blit_A_avx2.c | 46 ++++++++++++---------- src/video/SDL_blit_A_sse4_1.c | 73 ++++++++++++++++++++++++----------- src/video/SDL_blit_A_sse4_1.h | 6 ++- 3 files changed, 79 insertions(+), 46 deletions(-) diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c index 4f8d01b313..66c9d01c16 100644 --- a/src/video/SDL_blit_A_avx2.c +++ b/src/video/SDL_blit_A_avx2.c @@ -9,21 +9,29 @@ #include "SDL_blit.h" #include "SDL_blit_A_sse4_1.h" +__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaMask_AVX2(SDL_PixelFormat* dstfmt) { + Uint8 index = dstfmt->Ashift / 4; + /* Handle case where bad input sent */ + if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) { + index = 6; + } + return _mm256_set_epi8( + -1, index + 24, -1, index + 24, -1, index + 24, -1, index + 24, + -1, index + 16, -1, index + 16, -1, index + 16, -1, index + 16, + -1, index + 8, -1, index + 8, -1, index + 8, -1, index + 8, + -1, index, -1, index, -1, index, -1, index); +} + /** * Using the AVX2 instruction set, blit eight pixels with alpha blending * @param src A pointer to four 32-bit pixels of ARGB format to blit into dst * @param dst A pointer to four 32-bit pixels of ARGB format to retain visual data for while alpha blending * @return A 128-bit wide vector of four alpha-blended pixels in ARGB format */ -__m128i SDL_TARGETING("avx2") MixRGBA_AVX2(__m128i src, __m128i dst) { +__m128i SDL_TARGETING("avx2") MixRGBA_AVX2(__m128i src, __m128i dst, __m256i alphaMask) { __m256i src_color = _mm256_cvtepu8_epi16(src); __m256i dst_color = _mm256_cvtepu8_epi16(dst); - const __m256i SHUFFLE_ALPHA = _mm256_set_epi8( - -1, 30, -1, 30, -1, 30, -1, 30, - -1, 22, -1, 22, -1, 22, -1, 22, - -1, 14, -1, 14, -1, 14, -1, 14, - -1, 6, -1, 6, -1, 6, -1, 6); - __m256i alpha = _mm256_shuffle_epi8(src_color, SHUFFLE_ALPHA); + __m256i alpha = _mm256_shuffle_epi8(src_color, alphaMask); __m256i sub = _mm256_sub_epi16(src_color, dst_color); __m256i mul = _mm256_mullo_epi16(sub, alpha); /** @@ -51,22 +59,20 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) Uint8 *dst = info->dst; int dstskip = info->dst_skip; SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; int chunks = width / 4; - Uint8 *buf = SDL_malloc(sizeof(Uint8) * chunks * 16); + const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt); + const __m256i alphaMask = GetSDL_PixelFormatAlphaMask_AVX2(dstfmt); + const __m128i sse4_1AlphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt); while (height--) { /* Process 4-wide chunks of source color data that may be in wrong format */ for (int i = 0; i < chunks; i += 1) { - __m128i c_src = AlignPixelToSDL_PixelFormat_x4(_mm_loadu_si128((__m128i *) (src + i * 16)), srcfmt); - _mm_store_si128((__m128i*)(buf + i * 16), c_src); - } - - /* Alpha-blend in 4-wide chunk from src into destination */ - for (int i = 0; i < chunks; i += 1) { - __m128i c_src = _mm_loadu_si128((__m128i*) (buf + i * 16)); + __m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *) (src + i * 16)), colorShiftMask); + /* Alpha-blend in 4-wide chunk from src into destination */ __m128i c_dst = _mm_loadu_si128((__m128i*) (dst + i * 16)); - __m128i c_mix = MixRGBA_AVX2(c_src, c_dst); + __m128i c_mix = MixRGBA_AVX2(c_src, c_dst, alphaMask); _mm_storeu_si128((__m128i*) (dst + i * 16), c_mix); } @@ -78,9 +84,9 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); __m128i c_src = _mm_loadu_si64(src_ptr); - c_src = AlignPixelToSDL_PixelFormat_x4(c_src, srcfmt); + c_src = _mm_shuffle_epi8(c_src, colorShiftMask); __m128i c_dst = _mm_loadu_si64(dst_ptr); - __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst); + __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask); _mm_storeu_si64(dst_ptr, c_mix); remaining_pixels -= 2; offset += 2; @@ -97,7 +103,7 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) __m128i c_src = _mm_loadu_si32(&pixel); __m128i c_dst = _mm_loadu_si32(dst_ptr); #endif - __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst); + __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask); /* Old GCC has bad or no _mm_storeu_si32 */ #if defined(__GNUC__) && (__GNUC__ < 11) *dst_ptr = _mm_extract_epi32(mixed_pixel, 0); @@ -113,8 +119,6 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) src += srcskip; dst += dstskip; } - SDL_free(buf); - } #endif diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c index b8bf6f4902..5a779b01c2 100644 --- a/src/video/SDL_blit_A_sse4_1.c +++ b/src/video/SDL_blit_A_sse4_1.c @@ -9,23 +9,34 @@ #include "SDL_blit.h" #include "SDL_blit_A_sse4_1.h" +/** + * A helper function to create an alpha mask for use with MixRGBA_SSE4_1 based on pixel format + */ +__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatAlphaMask_SSE4_1(SDL_PixelFormat* dstfmt) { + Uint8 index = dstfmt->Ashift / 8; + /* Handle case where bad input sent */ + if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) { + index = 3; + } + return _mm_set_epi8( + -1, index + 4, -1, index + 4, -1, index + 4, -1, index + 4, + -1, index, -1, index, -1, index, -1, index); +} + /** * Using the SSE4.1 instruction set, blit four pixels with alpha blending * @param src A pointer to two 32-bit pixels of ARGB format to blit into dst * @param dst A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending * @return A 128-bit wide vector of two alpha-blended pixels in ARGB format */ -__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst) { +__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst, __m128i alphaMask) { __m128i src_color = _mm_cvtepu8_epi16(src); __m128i dst_color = _mm_cvtepu8_epi16(dst); /** * Combines a shuffle and an _mm_cvtepu8_epi16 operation into one operation by moving the lower 8 bits of the alpha * channel around to create 16-bit integers. */ - const __m128i SHUFFLE_ALPHA = _mm_set_epi8( - -1, 7, -1, 7, -1, 7, -1, 7, - -1, 3, -1, 3, -1, 3, -1, 3); - __m128i alpha = _mm_shuffle_epi8(src, SHUFFLE_ALPHA); + __m128i alpha = _mm_shuffle_epi8(src, alphaMask); __m128i sub = _mm_sub_epi16(src_color, dst_color); __m128i mul = _mm_mullo_epi16(sub, alpha); const __m128i SHUFFLE_REDUCE = _mm_set_epi8( @@ -46,21 +57,36 @@ Uint32 AlignPixelToSDL_PixelFormat(Uint32 color, const SDL_PixelFormat* srcForma } /* - * This helper function converts arbitrary pixel format data into ARGB form with a 4 pixel-wide shuffle + * This helper function converts arbitrary pixel formats into a shuffle mask for _mm_shuffle_epi8 */ -__m128i SDL_TARGETING("sse4.1") AlignPixelToSDL_PixelFormat_x4(__m128i colors, const SDL_PixelFormat* srcFormat) { - // Create shuffle masks based on the source SDL_PixelFormat to ARGB - __m128i srcShuffleMask = _mm_set_epi8( - srcFormat->Ashift / 8 + 12, srcFormat->Rshift / 8 + 12, srcFormat->Gshift / 8 + 12, srcFormat->Bshift / 8 + 12, - srcFormat->Ashift / 8 + 8, srcFormat->Rshift / 8 + 8, srcFormat->Gshift / 8 + 8, srcFormat->Bshift / 8 + 8, - srcFormat->Ashift / 8 + 4, srcFormat->Rshift / 8 + 4, srcFormat->Gshift / 8 + 4, srcFormat->Bshift / 8 + 4, - srcFormat->Ashift / 8, srcFormat->Rshift / 8, srcFormat->Gshift / 8, srcFormat->Bshift / 8 - ); +__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatShuffleMask(const SDL_PixelFormat* srcFormat, const SDL_PixelFormat* dstFormat) { + /* Calculate shuffle indices based on the source and destination SDL_PixelFormat */ + Uint8 shuffleIndices[16]; + Uint8 dstAshift = dstFormat->Ashift / 8; + Uint8 dstRshift = dstFormat->Rshift / 8; + Uint8 dstGshift = dstFormat->Gshift / 8; + Uint8 dstBshift = dstFormat->Bshift / 8; + /* Handle case where bad input sent */ + if (dstAshift == dstBshift && dstAshift == 0) { + dstAshift = 3; + } + for (int i = 0; i < 4; ++i) { + shuffleIndices[dstAshift + i * 4] = srcFormat->Ashift / 8 + i * 4; + shuffleIndices[dstRshift + i * 4] = srcFormat->Rshift / 8 + i * 4; + shuffleIndices[dstGshift + i * 4] = srcFormat->Gshift / 8 + i * 4; + shuffleIndices[dstBshift + i * 4] = srcFormat->Bshift / 8 + i * 4; + } - // Shuffle the colors - return _mm_shuffle_epi8(colors, srcShuffleMask); + /* Create shuffle mask based on the calculated indices */ + return _mm_set_epi8( + shuffleIndices[15], shuffleIndices[14], shuffleIndices[13], shuffleIndices[12], + shuffleIndices[11], shuffleIndices[10], shuffleIndices[9], shuffleIndices[8], + shuffleIndices[7], shuffleIndices[6], shuffleIndices[5], shuffleIndices[4], + shuffleIndices[3], shuffleIndices[2], shuffleIndices[1], shuffleIndices[0] + ); } + void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { int width = info->dst_w; int height = info->dst_h; @@ -69,22 +95,25 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { Uint8 *dst = info->dst; int dstskip = info->dst_skip; SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; int chunks = width / 4; Uint8 *buffer = (Uint8*)SDL_malloc(chunks * 16 * sizeof(Uint8)); + const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt); + const __m128i alphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt); while (height--) { /* Process 4-wide chunks of source color data that may be in wrong format into buffer */ for (int i = 0; i < chunks; i += 1) { __m128i colors = _mm_loadu_si128((__m128i*)(src + i * 16)); - _mm_storeu_si128((__m128i*)(buffer + i * 16), AlignPixelToSDL_PixelFormat_x4(colors, srcfmt)); + _mm_storeu_si128((__m128i*)(buffer + i * 16), _mm_shuffle_epi8(colors, colorShiftMask)); } /* Alpha-blend in 2-wide chunks from buffer into destination */ for (int i = 0; i < chunks * 2; i += 1) { __m128i c_src = _mm_loadu_si64((buffer + (i * 8))); __m128i c_dst = _mm_loadu_si64((dst + i * 8)); - __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst); + __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask); _mm_storeu_si64(dst + i * 8, c_mix); } @@ -96,9 +125,9 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); __m128i c_src = _mm_loadu_si64(src_ptr); - c_src = AlignPixelToSDL_PixelFormat_x4(c_src, srcfmt); + c_src = _mm_shuffle_epi8(c_src, colorShiftMask); __m128i c_dst = _mm_loadu_si64(dst_ptr); - __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst); + __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask); _mm_storeu_si64(dst_ptr, c_mix); remaining_pixels -= 2; offset += 2; @@ -115,7 +144,7 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { __m128i c_src = _mm_loadu_si32(&pixel); __m128i c_dst = _mm_loadu_si32(dst_ptr); #endif - __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst); + __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, alphaMask); /* Old GCC has bad or no _mm_storeu_si32 */ #if defined(__GNUC__) && (__GNUC__ < 11) *dst_ptr = _mm_extract_epi32(mixed_pixel, 0); @@ -131,8 +160,6 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { src += srcskip; dst += dstskip; } - - SDL_free(buffer); } #endif diff --git a/src/video/SDL_blit_A_sse4_1.h b/src/video/SDL_blit_A_sse4_1.h index 2850290fd5..ec7f8b7ae4 100644 --- a/src/video/SDL_blit_A_sse4_1.h +++ b/src/video/SDL_blit_A_sse4_1.h @@ -4,9 +4,11 @@ #ifdef SDL_SSE4_1_INTRINSICS Uint32 AlignPixelToSDL_PixelFormat(Uint32 color, const SDL_PixelFormat* srcFormat); -__m128i SDL_TARGETING("sse4.1") AlignPixelToSDL_PixelFormat_x4(__m128i colors, const SDL_PixelFormat* srcFormat); +__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatAlphaMask_SSE4_1(SDL_PixelFormat* dstfmt); -__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst); +__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatShuffleMask(const SDL_PixelFormat* srcFormat, const SDL_PixelFormat* dstFormat); + +__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst, __m128i alphaMask); void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info);