1
0
Fork 0
mirror of https://github.com/Detanup01/gbe_fork.git synced 2025-09-10 12:03:06 +02:00

Update deps in libs folder

This commit is contained in:
universal963 2025-07-16 00:31:30 +08:00
parent 2d19ede535
commit 8687787e6f
8 changed files with 2187 additions and 1266 deletions

View file

@ -2,7 +2,7 @@
https://github.com/microsoft/Detours
VERSION: https://github.com/microsoft/Detours/tree/4b8c659f549b0ab21cf649377c7a84eb708f5e68
VERSION: https://github.com/microsoft/Detours/tree/9764cebcb1a75940e68fa83d6730ffaf0f669401
#### LICENSE

View file

@ -156,6 +156,8 @@ inline PBYTE detour_gen_brk(PBYTE pbCode, PBYTE pbLimit)
inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
{
PBYTE pbCodeOriginal;
if (pbCode == NULL) {
return NULL;
}
@ -179,6 +181,7 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
PBYTE pbNew = pbCode + 2 + *(CHAR *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over short jump.\n", pbCode, pbNew));
pbCode = pbNew;
pbCodeOriginal = pbCode;
// First, skip over the import vector if there is one.
if (pbCode[0] == 0xff && pbCode[1] == 0x25) { // jmp [imm32]
@ -195,6 +198,23 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
pbNew = pbCode + 5 + *(UNALIGNED INT32 *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over long jump.\n", pbCode, pbNew));
pbCode = pbNew;
// Patches applied by the OS will jump through an HPAT page to get
// the target function in the patch image. The jump is always performed
// to the target function found at the current instruction pointer +
// PAGE_SIZE - 6 (size of jump).
// If this is an OS patch, we want to detour at the point of the target function
// padding in the base image. Ideally, we would detour at the target function, but
// since it's patched it begins with a short jump (to padding) which isn't long
// enough to hold the detour code bytes.
if (pbCode[0] == 0xff &&
pbCode[1] == 0x25 &&
*(UNALIGNED INT32 *)&pbCode[2] == (UNALIGNED INT32)(pbCode + 0x1000)) { // jmp [eip+PAGE_SIZE-6]
DETOUR_TRACE(("%p->%p: OS patch encountered, reset back to long jump 5 bytes prior to target function.\n", pbCode, pbCodeOriginal));
pbCode = pbCodeOriginal;
}
}
}
return pbCode;
@ -369,6 +389,8 @@ inline PBYTE detour_gen_brk(PBYTE pbCode, PBYTE pbLimit)
inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
{
PBYTE pbCodeOriginal;
if (pbCode == NULL) {
return NULL;
}
@ -392,6 +414,7 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
PBYTE pbNew = pbCode + 2 + *(CHAR *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over short jump.\n", pbCode, pbNew));
pbCode = pbNew;
pbCodeOriginal = pbCode;
// First, skip over the import vector if there is one.
if (pbCode[0] == 0xff && pbCode[1] == 0x25) { // jmp [+imm32]
@ -408,6 +431,21 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
pbNew = pbCode + 5 + *(UNALIGNED INT32 *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over long jump.\n", pbCode, pbNew));
pbCode = pbNew;
// Patches applied by the OS will jump through an HPAT page to get
// the target function in the patch image. The jump is always performed
// to the target function found at the current instruction pointer +
// PAGE_SIZE - 6 (size of jump).
// If this is an OS patch, we want to detour at the point of the target function
// in the base image. Since we need 5 bytes to perform the jump, detour at the
// point of the long jump instead of the short jump at the start of the target.
if (pbCode[0] == 0xff &&
pbCode[1] == 0x25 &&
*(UNALIGNED INT32 *)&pbCode[2] == 0xFFA) { // jmp [rip+PAGE_SIZE-6]
DETOUR_TRACE(("%p->%p: OS patch encountered, reset back to long jump 5 bytes prior to target function.\n", pbCode, pbCodeOriginal));
pbCode = pbCodeOriginal;
}
}
}
return pbCode;
@ -1151,10 +1189,46 @@ inline void detour_find_jmp_bounds(PBYTE pbCode,
*ppUpper = (PDETOUR_TRAMPOLINE)hi;
}
// Detect whether pbCode (ARM64) points at the forward branch that the OS
// hot-patching machinery plants at a patched function's entry. Such an entry
// branches into an HPAT page whose two instructions load the real target
// address from PC+PAGE_SIZE and jump through it. Returns TRUE only when the
// exact HPAT instruction pair is found at the branch target.
inline BOOL detour_is_code_os_patched(PBYTE pbCode)
{
// Identify whether the provided code pointer is a OS patch jump.
// We can do this by checking if a branch (b <imm26>) is present, and if so,
// it must be jumping to an HPAT page containing ldr <reg> [PC+PAGE_SIZE-4], br <reg>.
ULONG Opcode = fetch_opcode(pbCode);
// A64 unconditional branch: top 6 bits 000101 -> 0x14000000.
if ((Opcode & 0xfc000000) != 0x14000000) {
return FALSE;
}
// The branch must be jumping forward if it's going into the HPAT.
// Check that the sign bit is cleared.
// Bit 25 is the sign (top) bit of the b instruction's imm26 field.
if ((Opcode & 0x2000000) != 0) {
return FALSE;
}
// Forward displacement in bytes: low 25 bits of imm26, scaled by the
// 4-byte A64 instruction size.
ULONG Delta = (ULONG)((Opcode & 0x1FFFFFF) * 4);
PBYTE BranchTarget = pbCode + Delta;
// Now inspect the opcodes of the code we jumped to in order to determine if it's HPAT.
ULONG HpatOpcode1 = fetch_opcode(BranchTarget);
ULONG HpatOpcode2 = fetch_opcode(BranchTarget + 4);
// NOTE(review): 0x58008010 is an exact-match literal load (per the A64
// LDR-literal encoding this is ldr x16, [pc, #0x1000]); the comment block
// above says PAGE_SIZE-4 while this says PAGE_SIZE — confirm which PC
// reference point upstream intended.
if (HpatOpcode1 != 0x58008010) { // ldr <reg> [PC+PAGE_SIZE]
return FALSE;
}
if (HpatOpcode2 != 0xd61f0200) { // br <reg>
return FALSE;
}
return TRUE;
}
inline BOOL detour_does_code_end_function(PBYTE pbCode)
{
ULONG Opcode = fetch_opcode(pbCode);
if ((Opcode & 0xfffffc1f) == 0xd65f0000 || // br <reg>
// When the OS has patched a function entry point, it will incorrectly
// appear as though the function is just a single branch instruction.
if (detour_is_code_os_patched(pbCode)) {
return FALSE;
}
if ((Opcode & 0xffbffc1f) == 0xd61f0000 || // ret/br <reg>
(Opcode & 0xfc000000) == 0x14000000) { // b <imm26>
return TRUE;
}
@ -1837,41 +1911,46 @@ LONG WINAPI DetourTransactionCommitEx(_Out_opt_ PVOID **pppFailedPointer)
}
}
// Update any suspended threads.
for (t = s_pPendingThreads; t != NULL; t = t->pNext) {
CONTEXT cxt;
cxt.ContextFlags = CONTEXT_CONTROL;
#undef DETOURS_EIP
#undef DETOURS_CONTEXT_FLAGS
#ifdef DETOURS_X86
#define DETOURS_EIP Eip
#define DETOURS_CONTEXT_FLAGS CONTEXT_CONTROL
#endif // DETOURS_X86
#ifdef DETOURS_X64
#define DETOURS_EIP Rip
#define DETOURS_CONTEXT_FLAGS (CONTEXT_CONTROL | CONTEXT_INTEGER)
#endif // DETOURS_X64
#ifdef DETOURS_IA64
#define DETOURS_EIP StIIP
#define DETOURS_CONTEXT_FLAGS CONTEXT_CONTROL
#endif // DETOURS_IA64
#ifdef DETOURS_ARM
#define DETOURS_EIP Pc
#define DETOURS_CONTEXT_FLAGS CONTEXT_CONTROL
#endif // DETOURS_ARM
#ifdef DETOURS_ARM64
#define DETOURS_EIP Pc
#define DETOURS_CONTEXT_FLAGS (CONTEXT_CONTROL | CONTEXT_INTEGER)
#endif // DETOURS_ARM64
typedef ULONG_PTR DETOURS_EIP_TYPE;
// Update any suspended threads.
for (t = s_pPendingThreads; t != NULL; t = t->pNext) {
CONTEXT cxt;
cxt.ContextFlags = DETOURS_CONTEXT_FLAGS;
if (GetThreadContext(t->hThread, &cxt)) {
for (o = s_pPendingOperations; o != NULL; o = o->pNext) {
if (o->fIsRemove) {
if (cxt.DETOURS_EIP >= (DETOURS_EIP_TYPE)(ULONG_PTR)o->pTrampoline &&
cxt.DETOURS_EIP < (DETOURS_EIP_TYPE)((ULONG_PTR)o->pTrampoline
+ sizeof(o->pTrampoline))
+ sizeof(*o->pTrampoline))
) {
cxt.DETOURS_EIP = (DETOURS_EIP_TYPE)
@ -2064,6 +2143,15 @@ LONG WINAPI DetourAttachEx(_Inout_ PVOID *ppPointer,
DETOUR_TRACE((" ppldTarget=%p, code=%p [gp=%p]\n",
ppldTarget, pbTarget, pTargetGlobals));
#else // DETOURS_IA64
#if defined(_M_ARM64EC)
if (RtlIsEcCode(reinterpret_cast<DWORD64>(*ppPointer))) {
DETOUR_TRACE(("*ppPointer is an Arm64EC address (ppPointer=%p). "
"An Arm64EC address cannot be legitimately detoured with an x64 jmp. "
"Mark the target function with __declspec(hybrid_patchable) to make it detour-able. "
"We still allow an Arm64EC function to be detoured with an x64 jmp to make it easy (crash) to debug.\n", ppPointer));
DETOUR_BREAK();
}
#endif
pbTarget = (PBYTE)DetourCodeFromPointer(pbTarget, NULL);
pDetour = DetourCodeFromPointer(pDetour, NULL);
#endif // !DETOURS_IA64

View file

@ -83,11 +83,15 @@
#undef DETOURS_32BIT
#undef DETOURS_64BIT
#ifndef DECLSPEC_HYBRID_PATCHABLE
#define DECLSPEC_HYBRID_PATCHABLE DECLSPEC_CHPE_PATCHABLE
#endif
#if defined(_X86_)
#define DETOURS_X86
#define DETOURS_OPTION_BITS 64
#elif defined(_AMD64_)
#elif defined(_AMD64_) || defined(_ARM64EC_)
#define DETOURS_X64
#define DETOURS_OPTION_BITS 32
@ -102,7 +106,7 @@
#define DETOURS_ARM64
#else
#error Unknown architecture (x86, amd64, ia64, arm, arm64)
#error Unknown architecture (x86, amd64, ia64, arm, arm64, arm64ec)
#endif
#ifdef _WIN64

View file

@ -645,6 +645,7 @@ BOOL WINAPI DetourEnumerateImportsEx(_In_opt_ HMODULE hModule,
struct _DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT
{
PVOID pContext;
PF_DETOUR_IMPORT_FILE_CALLBACK pfImportFile;
PF_DETOUR_IMPORT_FUNC_CALLBACK pfImportFunc;
};
@ -664,6 +665,19 @@ DetourEnumerateImportsThunk(_In_ PVOID VoidContext,
return pContext->pfImportFunc(pContext->pContext, nOrdinal, pszFunc, ppvFunc ? *ppvFunc : NULL);
}
// Trampoline used by DetourEnumerateImports: unwraps the thunk context that
// DetourEnumerateImports passed through DetourEnumerateImportsEx, then
// forwards the per-module file notification to the caller's original file
// callback along with the caller's original context pointer.
static
BOOL
CALLBACK
DetourEnumerateImportsFile(_In_ PVOID VoidContext,
                           _In_opt_ HMODULE hModule,
                           _In_opt_ LPCSTR pszFile)
{
    _DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT const *ctx =
        (_DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT *)VoidContext;

    return ctx->pfImportFile(ctx->pContext, hModule, pszFile);
}
BOOL WINAPI DetourEnumerateImports(_In_opt_ HMODULE hModule,
_In_opt_ PVOID pContext,
_In_opt_ PF_DETOUR_IMPORT_FILE_CALLBACK pfImportFile,
@ -674,11 +688,10 @@ BOOL WINAPI DetourEnumerateImports(_In_opt_ HMODULE hModule,
return FALSE;
}
_DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT const context = { pContext, pfImportFunc };
_DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT const context = { pContext, pfImportFile, pfImportFunc };
return DetourEnumerateImportsEx(hModule,
(PVOID)&context,
pfImportFile,
&DetourEnumerateImportsFile,
&DetourEnumerateImportsThunk);
}

View file

@ -2,7 +2,7 @@
https://github.com/nlohmann/json
VERSION: https://github.com/nlohmann/json/releases/tag/v3.11.3
VERSION: https://github.com/nlohmann/json/releases/tag/v3.12.0
#### LICENSE

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
https://github.com/nothings/stb
VERSION: https://github.com/nothings/stb/tree/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31
VERSION: https://github.com/nothings/stb/tree/f58f558c120e9b32c217290b80bad1a0729fbb2c
#### LICENSE

View file

@ -1,4 +1,4 @@
/* stb_image_resize2 - v2.10 - public domain image resizing
/* stb_image_resize2 - v2.14 - public domain image resizing
by Jeff Roberts (v2) and Jorge L Rodriguez
http://github.com/nothings/stb
@ -11,35 +11,6 @@
#define STB_IMAGE_RESIZE_IMPLEMENTATION
before the #include. That will create the implementation in that file.
PORTING FROM VERSION 1
The API has changed. You can continue to use the old version of stb_image_resize.h,
which is available in the "deprecated/" directory.
If you're using the old simple-to-use API, porting is straightforward.
(For more advanced APIs, read the documentation.)
stbir_resize_uint8():
- call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_float():
- call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_uint8_srgb():
- function name is unchanged
- cast channel count to `stbir_pixel_layout`
- above is sufficient unless your image has alpha and it's not RGBA/BGRA
- in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
stbir_resize_uint8_srgb_edgemode()
- switch to the "medium complexity" API
- stbir_resize(), very similar API but a few more parameters:
- pixel_layout: cast channel count to `stbir_pixel_layout`
- data_type: STBIR_TYPE_UINT8_SRGB
- edge: unchanged (STBIR_EDGE_WRAP, etc.)
- filter: STBIR_FILTER_DEFAULT
- which channel is alpha is specified in stbir_pixel_layout, see enum for details
EASY API CALLS:
Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
@ -283,7 +254,7 @@
using the stbir_set_filter_callbacks function.
PROGRESS
For interactive use with slow resize operations, you can use the the
For interactive use with slow resize operations, you can use the
scanline callbacks in the extended API. It would have to be a *very* large
image resample to need progress though - we're very fast.
@ -296,6 +267,34 @@
ASSERT
Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
PORTING FROM VERSION 1
The API has changed. You can continue to use the old version of stb_image_resize.h,
which is available in the "deprecated/" directory.
If you're using the old simple-to-use API, porting is straightforward.
(For more advanced APIs, read the documentation.)
stbir_resize_uint8():
- call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_float():
- call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_uint8_srgb():
- function name is unchanged
- cast channel count to `stbir_pixel_layout`
- above is sufficient unless your image has alpha and it's not RGBA/BGRA
- in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
stbir_resize_uint8_srgb_edgemode()
- switch to the "medium complexity" API
- stbir_resize(), very similar API but a few more parameters:
- pixel_layout: cast channel count to `stbir_pixel_layout`
- data_type: STBIR_TYPE_UINT8_SRGB
- edge: unchanged (STBIR_EDGE_WRAP, etc.)
- filter: STBIR_FILTER_DEFAULT
- which channel is alpha is specified in stbir_pixel_layout, see enum for details
FUTURE TODOS
* For polyphase integral filters, we just memcpy the coeffs to dupe
them, but we should indirect and use the same coeff memory.
@ -308,6 +307,8 @@
some pixel reconversion, but probably dwarfed by things falling out
of cache. Probably also something possible with alternating between
scattering and gathering at high resize scales?
* Should we have a multiple MIPs at the same time function (could keep
more memory in cache during multiple resizes)?
* Rewrite the coefficient generator to do many at once.
* AVX-512 vertical kernels - worried about downclocking here.
* Convert the reincludes to macros when we know they aren't changing.
@ -328,6 +329,16 @@
Nathan Reed: warning fixes for 1.0
REVISIONS
2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and
scatter with vertical first.
2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for
tiny-c, fixed some variables that should have been static,
fixes a bug when calculating temp memory with resizes that
exceed 2GB of temp memory (very large resizes).
2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
with AVX-2, fix some weird scaling edge conditions with
point sample mode.
2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
fix MSVC 32-bit arm half float routines.
2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
@ -335,11 +346,11 @@
2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
to Ryan Salsbury), fix for sub-rect resizes, use the
pragmas to control unrolling when they are available.
2.07 (2024-05-24) fix for slow final split during threaded conversions of very
wide scanlines when downsampling (caused by extra input
converting), fix for wide scanline resamples with many
2.07 (2024-05-24) fix for slow final split during threaded conversions of very
wide scanlines when downsampling (caused by extra input
converting), fix for wide scanline resamples with many
splits (int overflow), fix GCC warning.
2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
undersampling a single row on rare resize ratios (about 1%).
2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
fix for output callback (thanks Julien Koenen).
@ -379,62 +390,6 @@ typedef uint32_t stbir_uint32;
typedef uint64_t stbir_uint64;
#endif
#ifdef _M_IX86_FP
#if ( _M_IX86_FP >= 1 )
#ifndef STBIR_SSE
#define STBIR_SSE
#endif
#endif
#endif
#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
#ifndef STBIR_SSE2
#define STBIR_SSE2
#endif
#if defined(__AVX__) || defined(STBIR_AVX2)
#ifndef STBIR_AVX
#ifndef STBIR_NO_AVX
#define STBIR_AVX
#endif
#endif
#endif
#if defined(__AVX2__) || defined(STBIR_AVX2)
#ifndef STBIR_NO_AVX2
#ifndef STBIR_AVX2
#define STBIR_AVX2
#endif
#if defined( _MSC_VER ) && !defined(__clang__)
#ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
#define STBIR_FP16C
#endif
#endif
#endif
#endif
#ifdef __F16C__
#ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc)
#define STBIR_FP16C
#endif
#endif
#endif
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
#ifndef STBIR_NEON
#define STBIR_NEON
#endif
#endif
#if defined(_M_ARM) || defined(__arm__)
#ifdef STBIR_USE_FMA
#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
#endif
#endif
#if defined(__wasm__) && defined(__wasm_simd128__)
#ifndef STBIR_WASM
#define STBIR_WASM
#endif
#endif
#ifndef STBIRDEF
#ifdef STB_IMAGE_RESIZE_STATIC
#define STBIRDEF static
@ -1033,7 +988,7 @@ typedef struct
char no_cache_straddle[64];
} stbir__per_split_info;
typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
@ -1096,8 +1051,8 @@ struct stbir__info
#define stbir__max_uint8_as_float 255.0f
#define stbir__max_uint16_as_float 65535.0f
#define stbir__max_uint8_as_float_inverted (1.0f/255.0f)
#define stbir__max_uint16_as_float_inverted (1.0f/65535.0f)
#define stbir__max_uint8_as_float_inverted 3.9215689e-03f // (1.0f/255.0f)
#define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f)
#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
// min/max friendly
@ -1202,23 +1157,86 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
#endif
#define STBIR_INPUT_CALLBACK_PADDING 3
#ifdef _M_IX86_FP
#if ( _M_IX86_FP >= 1 )
#ifndef STBIR_SSE
#define STBIR_SSE
#endif
#endif
#endif
#ifdef __TINYC__
// tiny c has no intrinsics yet - this can become a version check if they add them
#define STBIR_NO_SIMD
#endif
#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
#ifndef STBIR_SSE2
#define STBIR_SSE2
#endif
#if defined(__AVX__) || defined(STBIR_AVX2)
#ifndef STBIR_AVX
#ifndef STBIR_NO_AVX
#define STBIR_AVX
#endif
#endif
#endif
#if defined(__AVX2__) || defined(STBIR_AVX2)
#ifndef STBIR_NO_AVX2
#ifndef STBIR_AVX2
#define STBIR_AVX2
#endif
#if defined( _MSC_VER ) && !defined(__clang__)
#ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
#define STBIR_FP16C
#endif
#endif
#endif
#endif
#ifdef __F16C__
#ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc)
#define STBIR_FP16C
#endif
#endif
#endif
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
#ifndef STBIR_NEON
#define STBIR_NEON
#endif
#endif
#if defined(_M_ARM) || defined(__arm__)
#ifdef STBIR_USE_FMA
#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
#endif
#endif
#if defined(__wasm__) && defined(__wasm_simd128__)
#ifndef STBIR_WASM
#define STBIR_WASM
#endif
#endif
// restrict pointers for the output pointers, other loop and unroll control
#if defined( _MSC_VER ) && !defined(__clang__)
#define STBIR_STREAMOUT_PTR( star ) star __restrict
#define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
#if _MSC_VER >= 1900
#define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
#define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
#else
#define STBIR_NO_UNROLL_LOOP_START
#define STBIR_NO_UNROLL_LOOP_START
#endif
#elif defined( __clang__ )
#define STBIR_STREAMOUT_PTR( star ) star __restrict__
#define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
#define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
#if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
#define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
#else
#define STBIR_NO_UNROLL_LOOP_START
#endif
#endif
#elif defined( __GNUC__ )
#define STBIR_STREAMOUT_PTR( star ) star __restrict__
#define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
@ -1448,8 +1466,8 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
#include <smmintrin.h>
#define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
#else
STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
static STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
#define stbir__simdf_pack_to_8words(out,reg0,reg1) \
{ \
@ -3214,10 +3232,9 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline
newspan->n0 = -left_margin;
newspan->n1 = ( max_left - min_left ) - left_margin;
scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin
return;
}
// if we can't merge the min_left range, add it as a second range
else
if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
{
stbir__span * newspan = scanline_extents->spans + 1;
@ -3232,7 +3249,14 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline
newspan->n0 = scanline_extents->spans[1].n1 + 1;
newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin
return;
}
// sort the spans into write output order
if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) )
{
stbir__span tspan = scanline_extents->spans[0];
scanline_extents->spans[0] = scanline_extents->spans[1];
scanline_extents->spans[1] = tspan;
}
}
@ -3247,6 +3271,7 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
if ( edge == STBIR_EDGE_WRAP )
{
@ -3282,6 +3307,11 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
// make sure we never generate a range larger than our precalculated coeff width
// this only happens in point sample mode, but it's a good safe thing to do anyway
if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
in_last_pixel = in_first_pixel + coefficient_width - 1;
last_non_zero = -1;
for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
{
@ -3317,19 +3347,22 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
}
}
static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff )
static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
{
if ( new_pixel <= contribs->n1 ) // before the end
{
if ( new_pixel < contribs->n0 ) // before the front?
{
int j, o = contribs->n0 - new_pixel;
for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
coeffs[ j + o ] = coeffs[ j ];
for ( j = 1 ; j < o ; j-- )
coeffs[ j ] = coeffs[ 0 ];
coeffs[ 0 ] = new_coeff;
contribs->n0 = new_pixel;
if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
{
int j, o = contribs->n0 - new_pixel;
for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
coeffs[ j + o ] = coeffs[ j ];
for ( j = 1 ; j < o ; j-- )
coeffs[ j ] = coeffs[ 0 ];
coeffs[ 0 ] = new_coeff;
contribs->n0 = new_pixel;
}
}
else
{
@ -3338,12 +3371,15 @@ static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs,
}
else
{
int j, e = new_pixel - contribs->n0;
for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
coeffs[j] = 0;
if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
{
int j, e = new_pixel - contribs->n0;
for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
coeffs[j] = 0;
coeffs[ e ] = new_coeff;
contribs->n1 = new_pixel;
coeffs[ e ] = new_coeff;
contribs->n1 = new_pixel;
}
}
}
@ -3522,6 +3558,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
coeffs = coefficient_group;
contribs = contributors;
for (n = 0; n < num_contributors; n++)
{
int i;
@ -3561,7 +3598,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
int endi = contribs->n1;
contribs->n1 = input_last_n1;
for( i = input_size; i <= endi; i++ )
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] );
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
}
// now check left hand edge
@ -3573,7 +3610,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
// reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
for( i = -1 ; i > contribs->n0 ; i-- )
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- );
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
save_n0 = contribs->n0;
save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
@ -3583,7 +3620,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
coeffs[i] = coeffs[i-save_n0];
// now that we have shrunk down the contribs, we insert the first one safely
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff );
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
}
}
@ -3592,6 +3629,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
int diff = contribs->n1 - contribs->n0 + 1;
while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
--diff;
contribs->n1 = contribs->n0 + diff - 1;
if ( contribs->n0 <= contribs->n1 )
@ -3617,9 +3655,9 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
filter_info->widest = widest;
}
#undef STBIR_RENORM_TYPE
#undef STBIR_RENORM_TYPE
static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 )
static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 )
{
#define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
#define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
@ -3940,7 +3978,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
for (k = gn0 ; k <= gn1 ; k++ )
{
float gc = *g_coeffs++;
// skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
// (which happens when pivoting from horizontal, which might have dummy zeros)
if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
@ -3964,7 +4002,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
}
else
{
stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
}
STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
}
@ -4441,7 +4479,7 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
#ifdef STBIR_SIMD
#ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
end_decode -= 12;
end_decode -= 12;
STBIR_NO_UNROLL_LOOP_START
while( decode <= end_decode )
{
@ -4452,13 +4490,13 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
stbir__simdf_load( b, decode+4 );
stbir__simdf_load( c, decode+8 );
na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );
na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );
stbir__simdf_store( decode, na );
stbir__simdf_store( decode+4, nb );
stbir__simdf_store( decode+4, nb );
stbir__simdf_store( decode+8, c );
decode += 12;
}
@ -4480,18 +4518,18 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
stbir__simdf_load( f, decode+15 );
stbir__simdf_load( g, decode+18 );
a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
g = stbir__simdf_swiz( g, 2, 1, 0, 3 );
a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
g = stbir__simdf_swiz( g, 2, 1, 0, 3 );
// stores overlap, need to be in order,
// stores overlap, need to be in order,
stbir__simdf_store( decode, a );
i21 = decode[21];
stbir__simdf_store( decode+3, b );
stbir__simdf_store( decode+3, b );
i23 = decode[23];
stbir__simdf_store( decode+6, c );
stbir__simdf_store( decode+9, d );
@ -4543,7 +4581,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
stbir__span const * spans = stbir_info->scanline_extents.spans;
float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
float * last_decoded = 0;
// if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
@ -4571,12 +4610,12 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
if ( stbir_info->in_pixels_cb )
{
// call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself
input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING, input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
}
STBIR_PROFILE_START( decode );
// convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
STBIR_PROFILE_END( decode );
if (stbir_info->alpha_weight)
@ -4611,9 +4650,19 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
float * marg = full_decode_buffer + x * effective_channels;
float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
if ( e == 1 ) last_decoded = marg + margin * effective_channels;
}
}
}
// some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in
// (we can't pre-zero it, because the input callback can use that area as padding)
last_decoded[0] = 0.0f;
// we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width
// when this happens, we do read that pixel from the input, so it too could be Nan, so just zero an extra one.
// this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING)
last_decoded[1] = 0.0f;
}
@ -4810,12 +4859,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
#define stbir__1_coeff_remnant( ofs ) \
{ stbir__simdf t; \
{ stbir__simdf t,d; \
stbir__simdf_load1z( t, hc + (ofs) ); \
stbir__simdf_load2( d, decode + (ofs) * 2 ); \
stbir__simdf_0123to0011( t, t ); \
stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \
stbir__simdf_mult( t, t, d ); \
stbir__simdf8_add4( tot0, tot0, t ); }
#define stbir__2_coeff_remnant( ofs ) \
{ stbir__simdf t; \
stbir__simdf_load2( t, hc + (ofs) ); \
@ -6191,6 +6241,8 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi
if ( vertical_first )
{
// Now resample the gathered vertical data in the horizontal axis into the encode buffer
decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
decode_buffer[ width_times_channels+1 ] = 0.0f;
stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
}
@ -6362,6 +6414,8 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
void * scanline_scatter_buffer;
void * scanline_scatter_buffer_end;
int on_first_input_y, last_input_y;
int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
int width_times_channels = stbir_info->effective_channels * width;
STBIR_ASSERT( !stbir_info->vertical.is_gather );
@ -6396,7 +6450,12 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
// mark all the buffers as empty to start
for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
{
float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y );
decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
decode_buffer[ width_times_channels+1 ] = 0.0f;
decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
}
// do the loop in input space
on_first_input_y = 1; last_input_y = start_input_y;
@ -6519,11 +6578,11 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
// filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
// In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the
// filter will extend before or after the scanline beyond just one extra entire copy of the
// scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
// width to 3x the total of input pixel (once for the scanline, once for the left side
// overhang, and once for the right side). We only do this for edge mode, since the other
// In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the
// filter will extend before or after the scanline beyond just one extra entire copy of the
// scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
// width to 3x the total of input pixel (once for the scanline, once for the left side
// overhang, and once for the right side). We only do this for edge mode, since the other
// modes can just re-edge clamp back in again.
if ( edge == STBIR_EDGE_WRAP )
if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
@ -6532,11 +6591,11 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
// This is how much to expand buffers to account for filters seeking outside
// the image boundaries.
samp->filter_pixel_margin = samp->filter_pixel_width / 2;
// filter_pixel_margin is the amount that this filter can overhang on just one side of either
// end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
// worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
// this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
// filter_pixel_margin is the amount that this filter can overhang on just one side of either
// end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
// worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
// this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
if ( edge == STBIR_EDGE_WRAP )
if ( samp->filter_pixel_margin > scale_info->input_full_size )
samp->filter_pixel_margin = scale_info->input_full_size;
@ -6544,7 +6603,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding
samp->gather_prescatter_contributors = 0;
samp->gather_prescatter_coefficients = 0;
@ -6714,7 +6773,7 @@ static void stbir__free_internal_mem( stbir__info *info )
STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
STBIR__FREE_AND_CLEAR( info->alloced_mem );
STBIR__FREE_AND_CLEAR( info );
STBIR_FREE( info, info->user_data );
#endif
}
@ -6909,7 +6968,8 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
void * alloced = 0;
size_t alloced_total = 0;
int vertical_first;
int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size;
int alloc_ring_buffer_num_entries;
int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
@ -6954,14 +7014,16 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
// sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
// we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without
// the conversion routines overwriting the callback input data.
decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger
#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
if ( effective_channels == 3 )
decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
#endif
ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding
// if we do vertical first, the ring buffer holds a whole decoded line
if ( vertical_first )
@ -6976,13 +7038,13 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
alloc_ring_buffer_num_entries = conservative_split_output_size;
ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes;
ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes;
// The vertical buffer is used differently, depending on whether we are scattering
// the vertical scanlines, or gathering them.
// If scattering, it's used at the temp buffer to accumulate each output.
// If gathering, it's just the output buffer.
vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding
// we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
for(;;)
@ -7018,9 +7080,9 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
info->offset_x = new_x;
info->offset_y = new_y;
info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries;
info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries;
info->ring_buffer_num_entries = 0;
info->ring_buffer_length_bytes = ring_buffer_length_bytes;
info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes;
info->splits = splits;
info->vertical_first = vertical_first;
@ -7101,19 +7163,24 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
// alloc memory for to-be-pivoted coeffs (if necessary)
if ( vertical->is_gather == 0 )
{
int both;
int temp_mem_amt;
size_t both;
size_t temp_mem_amt;
// when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
// that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
// is too small, we just allocate extra memory to use as this temp.
both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size;
both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size;
#ifdef STBIR__SEPARATE_ALLOCATIONS
temp_mem_amt = decode_buffer_size;
#ifdef STBIR_SIMD8
if ( effective_channels == 3 )
--temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer
#endif
#else
temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits;
temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits;
#endif
if ( temp_mem_amt >= both )
{
@ -7208,33 +7275,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
info->ring_buffer_num_entries = conservative_split_output_size;
STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
// a few of the horizontal gather functions read past the end of the decode (but mask it out),
// so put in normal values so no snans or denormals accidentally sneak in (also, in the ring
// buffer for vertical first)
for( i = 0 ; i < splits ; i++ )
{
int t, ofs, start;
ofs = decode_buffer_size / 4;
start = ofs - 4;
if ( start < 0 ) start = 0;
for( t = start ; t < ofs; t++ )
info->split_info[i].decode_buffer[ t ] = 9999.0f;
if ( vertical_first )
{
int j;
for( j = 0; j < info->ring_buffer_num_entries ; j++ )
{
for( t = start ; t < ofs; t++ )
stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f;
}
}
}
}
#undef STBIR__NEXT_PTR
@ -8197,7 +8238,7 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB
#define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip
#endif
static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -8257,7 +8298,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
decode = decode_end; // backup and do last couple
input = end_input_m16;
}
return;
return decode_end + 16;
}
#endif
@ -8295,6 +8336,8 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
@ -8414,7 +8457,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
#endif
}
static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -8468,7 +8511,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
decode = decode_end; // backup and do last couple
input = end_input_m16;
}
return;
return decode_end + 16;
}
#endif
@ -8506,6 +8549,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
@ -8607,10 +8651,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
#endif
}
static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float const * decode_end = (float*) decode + width_times_channels;
float * decode_end = (float*) decode + width_times_channels;
unsigned char const * input = (unsigned char const *)inputp;
// try to do blocks of 4 when you can
@ -8645,6 +8689,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
#define stbir__min_max_shift20( i, f ) \
@ -8797,11 +8842,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float const * decode_end = (float*) decode + width_times_channels;
float * decode_end = (float*) decode + width_times_channels;
unsigned char const * input = (unsigned char const *)inputp;
do {
decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
@ -8810,6 +8856,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * de
input += 4;
decode += 4;
} while( decode < decode_end );
return decode_end;
}
@ -8882,11 +8929,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float const * decode_end = (float*) decode + width_times_channels;
float * decode_end = (float*) decode + width_times_channels;
unsigned char const * input = (unsigned char const *)inputp;
decode += 4;
while( decode <= decode_end )
{
@ -8903,6 +8951,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * de
decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ];
decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
}
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
@ -8968,7 +9017,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
#endif
static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -9016,7 +9065,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
decode = decode_end; // backup and do last couple
input = end_input_m8;
}
return;
return decode_end + 8;
}
#endif
@ -9054,6 +9103,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
@ -9173,7 +9223,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
#endif
}
static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -9218,7 +9268,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
decode = decode_end; // backup and do last couple
input = end_input_m8;
}
return;
return decode_end + 8;
}
#endif
@ -9256,6 +9306,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
@ -9356,7 +9407,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
#endif
}
static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -9402,7 +9453,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
decode = decode_end; // backup and do last couple
input = end_input_m8;
}
return;
return decode_end + 8;
}
#endif
@ -9440,6 +9491,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
@ -9526,7 +9578,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
#endif
}
static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
{
#ifdef stbir__decode_swizzle
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
@ -9580,7 +9632,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
decode = decode_end; // backup and do last couple
input = end_input_m16;
}
return;
return decode_end + 16;
}
#endif
@ -9618,12 +9670,15 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
input += stbir__coder_min_num;
}
#endif
return decode_end;
#else
if ( (void*)decodep != inputp )
STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
return decodep + width_times_channels;
#endif
}