From c6c1e673c0a861ae0524c15106ea7be3967dd8b6 Mon Sep 17 00:00:00 2001
From: Brick <6098371+0x1F9F1@users.noreply.github.com>
Date: Sun, 13 Aug 2023 14:36:48 +0100
Subject: [PATCH] Optimized SDL_Convert_*_to_*_Scalar

They are now all branch-less and avoid float multiplication/conversion where possible
---
 src/audio/SDL_audiotypecvt.c | 183 ++++++++++++++++++++++++++++-------
 1 file changed, 146 insertions(+), 37 deletions(-)

diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c
index 74377a2008..c70c298c93 100644
--- a/src/audio/SDL_audiotypecvt.c
+++ b/src/audio/SDL_audiotypecvt.c
@@ -39,50 +39,155 @@
 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
 #endif
 
-#define DIVBY128     0.0078125f
-#define DIVBY32768   0.000030517578125f
-#define DIVBY8388607 0.00000011920930376163766f
-
 #if NEED_SCALAR_CONVERTER_FALLBACKS
 
-/* these all convert backwards because (currently) float32 is >= to the size of anything it converts to, so it lets us safely convert in-place. */
-#define AUDIOCVT_TOFLOAT_SCALAR(from, fromtype, equation) \
-    static void SDL_Convert_##from##_to_F32_Scalar(float *dst, const fromtype *src, int num_samples) { \
-        int i; \
-        LOG_DEBUG_AUDIO_CONVERT(#from, "F32"); \
-        for (i = num_samples - 1; i >= 0; --i) { \
-            dst[i] = equation; \
-        } \
+/* This code requires that floats are in the IEEE-754 binary32 format */
+SDL_COMPILE_TIME_ASSERT(float_bits, sizeof(float) == sizeof(Uint32));
+
+union float_bits {
+    Uint32 u32;
+    float f32;
+};
+
+static void SDL_Convert_S8_to_F32_Scalar(float *dst, const Sint8 *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("S8", "F32");
+
+    for (i = num_samples - 1; i >= 0; --i) {
+        /* 1) Construct a float in the range [65536.0, 65538.0)
+         * 2) Shift the float range to [-1.0, 1.0) */
+        union float_bits x = { .u32 = (Uint8)src[i] ^ 0x47800080u };
+        dst[i] = x.f32 - 65537.0f;
     }
+}
 
-AUDIOCVT_TOFLOAT_SCALAR(S8, Sint8, ((float)src[i]) * DIVBY128)
-AUDIOCVT_TOFLOAT_SCALAR(U8, Uint8, (((float)src[i]) * DIVBY128) - 1.0f)
-AUDIOCVT_TOFLOAT_SCALAR(S16, Sint16, ((float)src[i]) * DIVBY32768)
-AUDIOCVT_TOFLOAT_SCALAR(S32, Sint32, ((float)(src[i] >> 8)) * DIVBY8388607)
-#undef AUDIOCVT_FROMFLOAT_SCALAR
+static void SDL_Convert_U8_to_F32_Scalar(float *dst, const Uint8 *src, int num_samples)
+{
+    int i;
 
-/* these all convert forwards because (currently) float32 is >= to the size of anything it converts from, so it lets us safely convert in-place. */
-#define AUDIOCVT_FROMFLOAT_SCALAR(to, totype, clampmin, clampmax, equation) \
-    static void SDL_Convert_F32_to_##to##_Scalar(totype *dst, const float *src, int num_samples) { \
-        int i; \
-        LOG_DEBUG_AUDIO_CONVERT("F32", #to); \
-        for (i = 0; i < num_samples; i++) { \
-            const float sample = src[i]; \
-            if (sample >= 1.0f) { \
-                dst[i] = (totype) (clampmax); \
-            } else if (sample <= -1.0f) { \
-                dst[i] = (totype) (clampmin); \
-            } else { \
-                dst[i] = (totype) (equation); \
-            } \
-        } \
+    LOG_DEBUG_AUDIO_CONVERT("U8", "F32");
+
+    for (i = num_samples - 1; i >= 0; --i) {
+        /* 1) Construct a float in the range [65536.0, 65538.0)
+         * 2) Shift the float range to [-1.0, 1.0) */
+        union float_bits x = { .u32 = (Uint8)src[i] ^ 0x47800000u };
+        dst[i] = x.f32 - 65537.0f;
     }
+}
 
-AUDIOCVT_FROMFLOAT_SCALAR(S8, Sint8, -128, 127, sample * 127.0f);
-AUDIOCVT_FROMFLOAT_SCALAR(U8, Uint8, 0, 255, (sample + 1.0f) * 127.0f);
-AUDIOCVT_FROMFLOAT_SCALAR(S16, Sint16, -32768, 32767, sample * 32767.0f);
-AUDIOCVT_FROMFLOAT_SCALAR(S32, Sint32, -2147483648LL, 2147483647, ((Sint32)(sample * 8388607.0f)) << 8);
-#undef AUDIOCVT_FROMFLOAT_SCALAR
+static void SDL_Convert_S16_to_F32_Scalar(float *dst, const Sint16 *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("S16", "F32");
+
+    for (i = num_samples - 1; i >= 0; --i) {
+        /* 1) Construct a float in the range [256.0, 258.0)
+         * 2) Shift the float range to [-1.0, 1.0) */
+        union float_bits x = { .u32 = (Uint16)src[i] ^ 0x43808000u };
+        dst[i] = x.f32 - 257.0f;
+    }
+}
+
+static void SDL_Convert_S32_to_F32_Scalar(float *dst, const Sint32 *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("S32", "F32");
+
+    for (i = num_samples - 1; i >= 0; --i) {
+        dst[i] = (float)src[i] * 0x1p-31f;
+    }
+}
+
+/* Create a bit-mask based on the sign-bit. Should optimize to a single arithmetic-shift-right */
+#define SIGNMASK(x) (Uint32)(0u - ((Uint32)(x) >> 31))
+
+static void SDL_Convert_F32_to_S8_Scalar(Sint8 *dst, const float *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("F32", "S8");
+
+    for (i = 0; i < num_samples; ++i) {
+        /* 1) Shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
+         * 2) Shift the integer range from [0x47BFFF80, 0x47C00080] to [-128, 128]
+         * 3) Clamp the value to [-128, 127] */
+        union float_bits x = { .f32 = src[i] + 98304.0f };
+
+        Uint32 y = x.u32 - 0x47C00000u;
+        Uint32 z = 0x7Fu - (y ^ SIGNMASK(y));
+        y = y ^ (z & SIGNMASK(z));
+
+        dst[i] = (Sint8)(y & 0xFF);
+    }
+}
+
+static void SDL_Convert_F32_to_U8_Scalar(Uint8 *dst, const float *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("F32", "U8");
+
+    for (i = 0; i < num_samples; ++i) {
+        union float_bits x = { .f32 = src[i] + 98304.0f };
+
+        /* 1) Shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
+         * 2) Shift the integer range from [0x47BFFF80, 0x47C00080] to [-128, 128]
+         * 3) Clamp the value to [-128, 127]
+         * 4) Shift the integer range from [-128, 127] to [0, 255] */
+        Uint32 y = x.u32 - 0x47C00000u;
+        Uint32 z = 0x7Fu - (y ^ SIGNMASK(y));
+        y = (y ^ 0x80u) ^ (z & SIGNMASK(z));
+
+        dst[i] = (Uint8)(y & 0xFF);
+    }
+}
+
+static void SDL_Convert_F32_to_S16_Scalar(Sint16 *dst, const float *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("F32", "S16");
+
+    for (i = 0; i < num_samples; ++i) {
+        /* 1) Shift the float range from [-1.0, 1.0] to [383.0, 385.0]
+         * 2) Shift the integer range from [0x43BF8000, 0x43C08000] to [-32768, 32768]
+         * 3) Clamp values outside the [-32768, 32767] range */
+        union float_bits x = { .f32 = src[i] + 384.0f };
+
+        Uint32 y = x.u32 - 0x43C00000u;
+        Uint32 z = 0x7FFFu - (y ^ SIGNMASK(y));
+        y = y ^ (z & SIGNMASK(z));
+
+        dst[i] = (Sint16)(y & 0xFFFF);
+    }
+}
+
+static void SDL_Convert_F32_to_S32_Scalar(Sint32 *dst, const float *src, int num_samples)
+{
+    int i;
+
+    LOG_DEBUG_AUDIO_CONVERT("F32", "S32");
+
+    for (i = 0; i < num_samples; ++i) {
+        union float_bits x = { .f32 = src[i] };
+
+        /* 1) Shift the float range from [-1.0, 1.0] to [-2147483648.0, 2147483648.0]
+         * 2) Set values outside the [-2147483648.0, 2147483647.0] range to -2147483648.0
+         * 3) Convert the float to an integer, and fixup values outside the valid range */
+        Uint32 y = x.u32 + 0x0F800000u;
+        Uint32 z = y - 0xCF000000u;
+        z &= SIGNMASK(y ^ z);
+        x.u32 = y - z;
+
+        dst[i] = (Sint32)((Uint32)(Sint32)x.f32 ^ SIGNMASK(z));
+    }
+}
+
+#undef SIGNMASK
 
 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
 
@@ -426,6 +531,10 @@ static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(Sint32 *dst, const
 #endif
 
 #ifdef SDL_NEON_INTRINSICS
+#define DIVBY128        0x1p-7f
+#define DIVBY32768      0x1p-15f
+#define DIVBY8388607    0x1.000002p-23f
+
 static void SDL_Convert_S8_to_F32_NEON(float *dst, const Sint8 *src, int num_samples)
 {
     int i;