1
0
Fork 0
mirror of https://github.com/Detanup01/gbe_fork.git synced 2025-09-10 12:03:06 +02:00

Update deps in libs folder

This commit is contained in:
universal963 2025-07-16 00:31:30 +08:00
parent 2d19ede535
commit 8687787e6f
8 changed files with 2187 additions and 1266 deletions

View file

@ -2,7 +2,7 @@
https://github.com/microsoft/Detours
VERSION: https://github.com/microsoft/Detours/tree/4b8c659f549b0ab21cf649377c7a84eb708f5e68
VERSION: https://github.com/microsoft/Detours/tree/9764cebcb1a75940e68fa83d6730ffaf0f669401
#### LICENSE

View file

@ -156,6 +156,8 @@ inline PBYTE detour_gen_brk(PBYTE pbCode, PBYTE pbLimit)
inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
{
PBYTE pbCodeOriginal;
if (pbCode == NULL) {
return NULL;
}
@ -179,6 +181,7 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
PBYTE pbNew = pbCode + 2 + *(CHAR *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over short jump.\n", pbCode, pbNew));
pbCode = pbNew;
pbCodeOriginal = pbCode;
// First, skip over the import vector if there is one.
if (pbCode[0] == 0xff && pbCode[1] == 0x25) { // jmp [imm32]
@ -195,6 +198,23 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
pbNew = pbCode + 5 + *(UNALIGNED INT32 *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over long jump.\n", pbCode, pbNew));
pbCode = pbNew;
// Patches applied by the OS will jump through an HPAT page to get
// the target function in the patch image. The jump is always performed
// to the target function found at the current instruction pointer +
// PAGE_SIZE - 6 (size of jump).
// If this is an OS patch, we want to detour at the point of the target function
// padding in the base image. Ideally, we would detour at the target function, but
// since it's patched it begins with a short jump (to padding) which isn't long
// enough to hold the detour code bytes.
if (pbCode[0] == 0xff &&
pbCode[1] == 0x25 &&
*(UNALIGNED INT32 *)&pbCode[2] == (UNALIGNED INT32)(pbCode + 0x1000)) { // jmp [eip+PAGE_SIZE-6]
DETOUR_TRACE(("%p->%p: OS patch encountered, reset back to long jump 5 bytes prior to target function.\n", pbCode, pbCodeOriginal));
pbCode = pbCodeOriginal;
}
}
}
return pbCode;
@ -369,6 +389,8 @@ inline PBYTE detour_gen_brk(PBYTE pbCode, PBYTE pbLimit)
inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
{
PBYTE pbCodeOriginal;
if (pbCode == NULL) {
return NULL;
}
@ -392,6 +414,7 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
PBYTE pbNew = pbCode + 2 + *(CHAR *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over short jump.\n", pbCode, pbNew));
pbCode = pbNew;
pbCodeOriginal = pbCode;
// First, skip over the import vector if there is one.
if (pbCode[0] == 0xff && pbCode[1] == 0x25) { // jmp [+imm32]
@ -408,6 +431,21 @@ inline PBYTE detour_skip_jmp(PBYTE pbCode, PVOID *ppGlobals)
pbNew = pbCode + 5 + *(UNALIGNED INT32 *)&pbCode[1];
DETOUR_TRACE(("%p->%p: skipped over long jump.\n", pbCode, pbNew));
pbCode = pbNew;
// Patches applied by the OS will jump through an HPAT page to get
// the target function in the patch image. The jump is always performed
// to the target function found at the current instruction pointer +
// PAGE_SIZE - 6 (size of jump).
// If this is an OS patch, we want to detour at the point of the target function
// in the base image. Since we need 5 bytes to perform the jump, detour at the
// point of the long jump instead of the short jump at the start of the target.
if (pbCode[0] == 0xff &&
pbCode[1] == 0x25 &&
*(UNALIGNED INT32 *)&pbCode[2] == 0xFFA) { // jmp [rip+PAGE_SIZE-6]
DETOUR_TRACE(("%p->%p: OS patch encountered, reset back to long jump 5 bytes prior to target function.\n", pbCode, pbCodeOriginal));
pbCode = pbCodeOriginal;
}
}
}
return pbCode;
@ -1151,10 +1189,46 @@ inline void detour_find_jmp_bounds(PBYTE pbCode,
*ppUpper = (PDETOUR_TRAMPOLINE)hi;
}
// Detect whether pbCode (ARM64) points at the forward branch that the OS
// hot-patching machinery plants at a patched function's entry. Such an entry
// branches into an HPAT page whose two instructions load the real target
// address from PC+PAGE_SIZE and jump through it. Returns TRUE only when the
// exact HPAT instruction pair is found at the branch target.
inline BOOL detour_is_code_os_patched(PBYTE pbCode)
{
// Identify whether the provided code pointer is a OS patch jump.
// We can do this by checking if a branch (b <imm26>) is present, and if so,
// it must be jumping to an HPAT page containing ldr <reg> [PC+PAGE_SIZE-4], br <reg>.
ULONG Opcode = fetch_opcode(pbCode);
// A64 unconditional branch: top 6 bits 000101 -> 0x14000000.
if ((Opcode & 0xfc000000) != 0x14000000) {
return FALSE;
}
// The branch must be jumping forward if it's going into the HPAT.
// Check that the sign bit is cleared.
// Bit 25 is the sign (top) bit of the b instruction's imm26 field.
if ((Opcode & 0x2000000) != 0) {
return FALSE;
}
// Forward displacement in bytes: low 25 bits of imm26, scaled by the
// 4-byte A64 instruction size.
ULONG Delta = (ULONG)((Opcode & 0x1FFFFFF) * 4);
PBYTE BranchTarget = pbCode + Delta;
// Now inspect the opcodes of the code we jumped to in order to determine if it's HPAT.
ULONG HpatOpcode1 = fetch_opcode(BranchTarget);
ULONG HpatOpcode2 = fetch_opcode(BranchTarget + 4);
// NOTE(review): 0x58008010 is an exact-match literal load (per the A64
// LDR-literal encoding this is ldr x16, [pc, #0x1000]); the comment block
// above says PAGE_SIZE-4 while this says PAGE_SIZE — confirm which PC
// reference point upstream intended.
if (HpatOpcode1 != 0x58008010) { // ldr <reg> [PC+PAGE_SIZE]
return FALSE;
}
if (HpatOpcode2 != 0xd61f0200) { // br <reg>
return FALSE;
}
return TRUE;
}
inline BOOL detour_does_code_end_function(PBYTE pbCode)
{
ULONG Opcode = fetch_opcode(pbCode);
if ((Opcode & 0xfffffc1f) == 0xd65f0000 || // br <reg>
// When the OS has patched a function entry point, it will incorrectly
// appear as though the function is just a single branch instruction.
if (detour_is_code_os_patched(pbCode)) {
return FALSE;
}
if ((Opcode & 0xffbffc1f) == 0xd61f0000 || // ret/br <reg>
(Opcode & 0xfc000000) == 0x14000000) { // b <imm26>
return TRUE;
}
@ -1837,41 +1911,46 @@ LONG WINAPI DetourTransactionCommitEx(_Out_opt_ PVOID **pppFailedPointer)
}
}
// Update any suspended threads.
for (t = s_pPendingThreads; t != NULL; t = t->pNext) {
CONTEXT cxt;
cxt.ContextFlags = CONTEXT_CONTROL;
#undef DETOURS_EIP
#undef DETOURS_CONTEXT_FLAGS
#ifdef DETOURS_X86
#define DETOURS_EIP Eip
#define DETOURS_CONTEXT_FLAGS CONTEXT_CONTROL
#endif // DETOURS_X86
#ifdef DETOURS_X64
#define DETOURS_EIP Rip
#define DETOURS_CONTEXT_FLAGS (CONTEXT_CONTROL | CONTEXT_INTEGER)
#endif // DETOURS_X64
#ifdef DETOURS_IA64
#define DETOURS_EIP StIIP
#define DETOURS_CONTEXT_FLAGS CONTEXT_CONTROL
#endif // DETOURS_IA64
#ifdef DETOURS_ARM
#define DETOURS_EIP Pc
#define DETOURS_CONTEXT_FLAGS CONTEXT_CONTROL
#endif // DETOURS_ARM
#ifdef DETOURS_ARM64
#define DETOURS_EIP Pc
#define DETOURS_CONTEXT_FLAGS (CONTEXT_CONTROL | CONTEXT_INTEGER)
#endif // DETOURS_ARM64
typedef ULONG_PTR DETOURS_EIP_TYPE;
// Update any suspended threads.
for (t = s_pPendingThreads; t != NULL; t = t->pNext) {
CONTEXT cxt;
cxt.ContextFlags = DETOURS_CONTEXT_FLAGS;
if (GetThreadContext(t->hThread, &cxt)) {
for (o = s_pPendingOperations; o != NULL; o = o->pNext) {
if (o->fIsRemove) {
if (cxt.DETOURS_EIP >= (DETOURS_EIP_TYPE)(ULONG_PTR)o->pTrampoline &&
cxt.DETOURS_EIP < (DETOURS_EIP_TYPE)((ULONG_PTR)o->pTrampoline
+ sizeof(o->pTrampoline))
+ sizeof(*o->pTrampoline))
) {
cxt.DETOURS_EIP = (DETOURS_EIP_TYPE)
@ -2064,6 +2143,15 @@ LONG WINAPI DetourAttachEx(_Inout_ PVOID *ppPointer,
DETOUR_TRACE((" ppldTarget=%p, code=%p [gp=%p]\n",
ppldTarget, pbTarget, pTargetGlobals));
#else // DETOURS_IA64
#if defined(_M_ARM64EC)
if (RtlIsEcCode(reinterpret_cast<DWORD64>(*ppPointer))) {
DETOUR_TRACE(("*ppPointer is an Arm64EC address (ppPointer=%p). "
"An Arm64EC address cannot be legitimately detoured with an x64 jmp. "
"Mark the target function with __declspec(hybrid_patchable) to make it detour-able. "
"We still allow an Arm64EC function to be detoured with an x64 jmp to make it easy (crash) to debug.\n", ppPointer));
DETOUR_BREAK();
}
#endif
pbTarget = (PBYTE)DetourCodeFromPointer(pbTarget, NULL);
pDetour = DetourCodeFromPointer(pDetour, NULL);
#endif // !DETOURS_IA64

View file

@ -83,11 +83,15 @@
#undef DETOURS_32BIT
#undef DETOURS_64BIT
#ifndef DECLSPEC_HYBRID_PATCHABLE
#define DECLSPEC_HYBRID_PATCHABLE DECLSPEC_CHPE_PATCHABLE
#endif
#if defined(_X86_)
#define DETOURS_X86
#define DETOURS_OPTION_BITS 64
#elif defined(_AMD64_)
#elif defined(_AMD64_) || defined(_ARM64EC_)
#define DETOURS_X64
#define DETOURS_OPTION_BITS 32
@ -102,7 +106,7 @@
#define DETOURS_ARM64
#else
#error Unknown architecture (x86, amd64, ia64, arm, arm64)
#error Unknown architecture (x86, amd64, ia64, arm, arm64, arm64ec)
#endif
#ifdef _WIN64

View file

@ -645,6 +645,7 @@ BOOL WINAPI DetourEnumerateImportsEx(_In_opt_ HMODULE hModule,
struct _DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT
{
PVOID pContext;
PF_DETOUR_IMPORT_FILE_CALLBACK pfImportFile;
PF_DETOUR_IMPORT_FUNC_CALLBACK pfImportFunc;
};
@ -664,6 +665,19 @@ DetourEnumerateImportsThunk(_In_ PVOID VoidContext,
return pContext->pfImportFunc(pContext->pContext, nOrdinal, pszFunc, ppvFunc ? *ppvFunc : NULL);
}
// Trampoline used by DetourEnumerateImports: unwraps the thunk context that
// DetourEnumerateImports passed through DetourEnumerateImportsEx, then
// forwards the per-module file notification to the caller's original file
// callback along with the caller's original context pointer.
static
BOOL
CALLBACK
DetourEnumerateImportsFile(_In_ PVOID VoidContext,
                           _In_opt_ HMODULE hModule,
                           _In_opt_ LPCSTR pszFile)
{
    _DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT const *ctx =
        (_DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT *)VoidContext;

    return ctx->pfImportFile(ctx->pContext, hModule, pszFile);
}
BOOL WINAPI DetourEnumerateImports(_In_opt_ HMODULE hModule,
_In_opt_ PVOID pContext,
_In_opt_ PF_DETOUR_IMPORT_FILE_CALLBACK pfImportFile,
@ -674,11 +688,10 @@ BOOL WINAPI DetourEnumerateImports(_In_opt_ HMODULE hModule,
return FALSE;
}
_DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT const context = { pContext, pfImportFunc };
_DETOUR_ENUMERATE_IMPORTS_THUNK_CONTEXT const context = { pContext, pfImportFile, pfImportFunc };
return DetourEnumerateImportsEx(hModule,
(PVOID)&context,
pfImportFile,
&DetourEnumerateImportsFile,
&DetourEnumerateImportsThunk);
}

View file

@ -2,7 +2,7 @@
https://github.com/nlohmann/json
VERSION: https://github.com/nlohmann/json/releases/tag/v3.11.3
VERSION: https://github.com/nlohmann/json/releases/tag/v3.12.0
#### LICENSE

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
https://github.com/nothings/stb
VERSION: https://github.com/nothings/stb/tree/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31
VERSION: https://github.com/nothings/stb/tree/f58f558c120e9b32c217290b80bad1a0729fbb2c
#### LICENSE

View file

@ -1,4 +1,4 @@
/* stb_image_resize2 - v2.10 - public domain image resizing
/* stb_image_resize2 - v2.14 - public domain image resizing
by Jeff Roberts (v2) and Jorge L Rodriguez
http://github.com/nothings/stb
@ -11,35 +11,6 @@
#define STB_IMAGE_RESIZE_IMPLEMENTATION
before the #include. That will create the implementation in that file.
PORTING FROM VERSION 1
The API has changed. You can continue to use the old version of stb_image_resize.h,
which is available in the "deprecated/" directory.
If you're using the old simple-to-use API, porting is straightforward.
(For more advanced APIs, read the documentation.)
stbir_resize_uint8():
- call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_float():
- call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_uint8_srgb():
- function name is unchanged
- cast channel count to `stbir_pixel_layout`
- above is sufficient unless your image has alpha and it's not RGBA/BGRA
- in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
stbir_resize_uint8_srgb_edgemode()
- switch to the "medium complexity" API
- stbir_resize(), very similar API but a few more parameters:
- pixel_layout: cast channel count to `stbir_pixel_layout`
- data_type: STBIR_TYPE_UINT8_SRGB
- edge: unchanged (STBIR_EDGE_WRAP, etc.)
- filter: STBIR_FILTER_DEFAULT
- which channel is alpha is specified in stbir_pixel_layout, see enum for details
EASY API CALLS:
Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
@ -283,7 +254,7 @@
using the stbir_set_filter_callbacks function.
PROGRESS
For interactive use with slow resize operations, you can use the the
For interactive use with slow resize operations, you can use the
scanline callbacks in the extended API. It would have to be a *very* large
image resample to need progress though - we're very fast.
@ -296,6 +267,34 @@
ASSERT
Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
PORTING FROM VERSION 1
The API has changed. You can continue to use the old version of stb_image_resize.h,
which is available in the "deprecated/" directory.
If you're using the old simple-to-use API, porting is straightforward.
(For more advanced APIs, read the documentation.)
stbir_resize_uint8():
- call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_float():
- call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
stbir_resize_uint8_srgb():
- function name is unchanged
- cast channel count to `stbir_pixel_layout`
- above is sufficient unless your image has alpha and it's not RGBA/BGRA
- in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
stbir_resize_uint8_srgb_edgemode()
- switch to the "medium complexity" API
- stbir_resize(), very similar API but a few more parameters:
- pixel_layout: cast channel count to `stbir_pixel_layout`
- data_type: STBIR_TYPE_UINT8_SRGB
- edge: unchanged (STBIR_EDGE_WRAP, etc.)
- filter: STBIR_FILTER_DEFAULT
- which channel is alpha is specified in stbir_pixel_layout, see enum for details
FUTURE TODOS
* For polyphase integral filters, we just memcpy the coeffs to dupe
them, but we should indirect and use the same coeff memory.
@ -308,6 +307,8 @@
some pixel reconversion, but probably dwarfed by things falling out
of cache. Probably also something possible with alternating between
scattering and gathering at high resize scales?
* Should we have a multiple MIPs at the same time function (could keep
more memory in cache during multiple resizes)?
* Rewrite the coefficient generator to do many at once.
* AVX-512 vertical kernels - worried about downclocking here.
* Convert the reincludes to macros when we know they aren't changing.
@ -328,6 +329,16 @@
Nathan Reed: warning fixes for 1.0
REVISIONS
2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and
scatter with vertical first.
2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for
tiny-c, fixed some variables that should have been static,
fixes a bug when calculating temp memory with resizes that
exceed 2GB of temp memory (very large resizes).
2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
with AVX-2, fix some weird scaling edge conditions with
point sample mode.
2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
fix MSVC 32-bit arm half float routines.
2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
@ -335,11 +346,11 @@
2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
to Ryan Salsbury), fix for sub-rect resizes, use the
pragmas to control unrolling when they are available.
2.07 (2024-05-24) fix for slow final split during threaded conversions of very
wide scanlines when downsampling (caused by extra input
converting), fix for wide scanline resamples with many
2.07 (2024-05-24) fix for slow final split during threaded conversions of very
wide scanlines when downsampling (caused by extra input
converting), fix for wide scanline resamples with many
splits (int overflow), fix GCC warning.
2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
undersampling a single row on rare resize ratios (about 1%).
2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
fix for output callback (thanks Julien Koenen).
@ -379,62 +390,6 @@ typedef uint32_t stbir_uint32;
typedef uint64_t stbir_uint64;
#endif
#ifdef _M_IX86_FP
#if ( _M_IX86_FP >= 1 )
#ifndef STBIR_SSE
#define STBIR_SSE
#endif
#endif
#endif
#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
#ifndef STBIR_SSE2
#define STBIR_SSE2
#endif
#if defined(__AVX__) || defined(STBIR_AVX2)
#ifndef STBIR_AVX
#ifndef STBIR_NO_AVX
#define STBIR_AVX
#endif
#endif
#endif
#if defined(__AVX2__) || defined(STBIR_AVX2)
#ifndef STBIR_NO_AVX2
#ifndef STBIR_AVX2
#define STBIR_AVX2
#endif
#if defined( _MSC_VER ) && !defined(__clang__)
#ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
#define STBIR_FP16C
#endif
#endif
#endif
#endif
#ifdef __F16C__
#ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc)
#define STBIR_FP16C
#endif
#endif
#endif
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
#ifndef STBIR_NEON
#define STBIR_NEON
#endif
#endif
#if defined(_M_ARM) || defined(__arm__)
#ifdef STBIR_USE_FMA
#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
#endif
#endif
#if defined(__wasm__) && defined(__wasm_simd128__)
#ifndef STBIR_WASM
#define STBIR_WASM
#endif
#endif
#ifndef STBIRDEF
#ifdef STB_IMAGE_RESIZE_STATIC
#define STBIRDEF static
@ -1033,7 +988,7 @@ typedef struct
char no_cache_straddle[64];
} stbir__per_split_info;
typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
@ -1096,8 +1051,8 @@ struct stbir__info
#define stbir__max_uint8_as_float 255.0f
#define stbir__max_uint16_as_float 65535.0f
#define stbir__max_uint8_as_float_inverted (1.0f/255.0f)
#define stbir__max_uint16_as_float_inverted (1.0f/65535.0f)
#define stbir__max_uint8_as_float_inverted 3.9215689e-03f // (1.0f/255.0f)
#define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f)
#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
// min/max friendly
@ -1202,23 +1157,86 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
#endif
#define STBIR_INPUT_CALLBACK_PADDING 3
#ifdef _M_IX86_FP
#if ( _M_IX86_FP >= 1 )
#ifndef STBIR_SSE
#define STBIR_SSE
#endif
#endif
#endif
#ifdef __TINYC__
// tiny c has no intrinsics yet - this can become a version check if they add them
#define STBIR_NO_SIMD
#endif
#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
#ifndef STBIR_SSE2
#define STBIR_SSE2
#endif
#if defined(__AVX__) || defined(STBIR_AVX2)
#ifndef STBIR_AVX
#ifndef STBIR_NO_AVX
#define STBIR_AVX
#endif
#endif
#endif
#if defined(__AVX2__) || defined(STBIR_AVX2)
#ifndef STBIR_NO_AVX2
#ifndef STBIR_AVX2
#define STBIR_AVX2
#endif
#if defined( _MSC_VER ) && !defined(__clang__)
#ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
#define STBIR_FP16C
#endif
#endif
#endif
#endif
#ifdef __F16C__
#ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc)
#define STBIR_FP16C
#endif
#endif
#endif
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
#ifndef STBIR_NEON
#define STBIR_NEON
#endif
#endif
#if defined(_M_ARM) || defined(__arm__)
#ifdef STBIR_USE_FMA
#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
#endif
#endif
#if defined(__wasm__) && defined(__wasm_simd128__)
#ifndef STBIR_WASM
#define STBIR_WASM
#endif
#endif
// restrict pointers for the output pointers, other loop and unroll control
#if defined( _MSC_VER ) && !defined(__clang__)
#define STBIR_STREAMOUT_PTR( star ) star __restrict
#define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
#if _MSC_VER >= 1900
#define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
#define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
#else
#define STBIR_NO_UNROLL_LOOP_START
#define STBIR_NO_UNROLL_LOOP_START
#endif
#elif defined( __clang__ )
#define STBIR_STREAMOUT_PTR( star ) star __restrict__
#define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
#define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
#if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
#define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
#else
#define STBIR_NO_UNROLL_LOOP_START
#endif
#endif
#elif defined( __GNUC__ )
#define STBIR_STREAMOUT_PTR( star ) star __restrict__
#define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
@ -1448,8 +1466,8 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
#include <smmintrin.h>
#define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
#else
STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
static STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
#define stbir__simdf_pack_to_8words(out,reg0,reg1) \
{ \
@ -3214,10 +3232,9 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline
newspan->n0 = -left_margin;
newspan->n1 = ( max_left - min_left ) - left_margin;
scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin
return;
}
// if we can't merge the min_left range, add it as a second range
else
if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
{
stbir__span * newspan = scanline_extents->spans + 1;
@ -3232,7 +3249,14 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline
newspan->n0 = scanline_extents->spans[1].n1 + 1;
newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin
return;
}
// sort the spans into write output order
if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) )
{
stbir__span tspan = scanline_extents->spans[0];
scanline_extents->spans[0] = scanline_extents->spans[1];
scanline_extents->spans[1] = tspan;
}
}
@ -3247,6 +3271,7 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
if ( edge == STBIR_EDGE_WRAP )
{
@ -3282,6 +3307,11 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
// make sure we never generate a range larger than our precalculated coeff width
// this only happens in point sample mode, but it's a good safe thing to do anyway
if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
in_last_pixel = in_first_pixel + coefficient_width - 1;
last_non_zero = -1;
for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
{
@ -3317,19 +3347,22 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
}
}
static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff )
static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
{
if ( new_pixel <= contribs->n1 ) // before the end
{
if ( new_pixel < contribs->n0 ) // before the front?
{
int j, o = contribs->n0 - new_pixel;
for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
coeffs[ j + o ] = coeffs[ j ];
for ( j = 1 ; j < o ; j-- )
coeffs[ j ] = coeffs[ 0 ];
coeffs[ 0 ] = new_coeff;
contribs->n0 = new_pixel;
if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
{
int j, o = contribs->n0 - new_pixel;
for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
coeffs[ j + o ] = coeffs[ j ];
for ( j = 1 ; j < o ; j-- )
coeffs[ j ] = coeffs[ 0 ];
coeffs[ 0 ] = new_coeff;
contribs->n0 = new_pixel;
}
}
else
{
@ -3338,12 +3371,15 @@ static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs,
}
else
{
int j, e = new_pixel - contribs->n0;
for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
coeffs[j] = 0;
if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
{
int j, e = new_pixel - contribs->n0;
for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
coeffs[j] = 0;
coeffs[ e ] = new_coeff;
contribs->n1 = new_pixel;
coeffs[ e ] = new_coeff;
contribs->n1 = new_pixel;
}
}
}
@ -3522,6 +3558,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
coeffs = coefficient_group;
contribs = contributors;
for (n = 0; n < num_contributors; n++)
{
int i;
@ -3561,7 +3598,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
int endi = contribs->n1;
contribs->n1 = input_last_n1;
for( i = input_size; i <= endi; i++ )
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] );
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
}
// now check left hand edge
@ -3573,7 +3610,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
// reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
for( i = -1 ; i > contribs->n0 ; i-- )
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- );
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
save_n0 = contribs->n0;
save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
@ -3583,7 +3620,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
coeffs[i] = coeffs[i-save_n0];
// now that we have shrunk down the contribs, we insert the first one safely
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff );
stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
}
}
@ -3592,6 +3629,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
int diff = contribs->n1 - contribs->n0 + 1;
while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
--diff;
contribs->n1 = contribs->n0 + diff - 1;
if ( contribs->n0 <= contribs->n1 )
@ -3617,9 +3655,9 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
filter_info->widest = widest;
}
#undef STBIR_RENORM_TYPE
#undef STBIR_RENORM_TYPE
static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 )
static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 )
{
#define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
#define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
@ -3940,7 +3978,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
for (k = gn0 ; k <= gn1 ; k++ )
{
float gc = *g_coeffs++;
// skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
// (which happens when pivoting from horizontal, which might have dummy zeros)
if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
@ -3964,7 +4002,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
}
else
{
stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
}
STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
}
@ -4441,7 +4479,7 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
#ifdef STBIR_SIMD
#ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
end_decode -= 12;
end_decode -= 12;
STBIR_NO_UNROLL_LOOP_START
while( decode <= end_decode )
{
@ -4452,13 +4490,13 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
stbir__simdf_load( b, decode+4 );
stbir__simdf_load( c, decode+8 );
na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );
na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );
stbir__simdf_store( decode, na );
stbir__simdf_store( decode+4, nb );
stbir__simdf_store( decode+4, nb );
stbir__simdf_store( decode+8, c );
decode += 12;
}
@ -4480,18 +4518,18 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
stbir__simdf_load( f, decode+15 );
stbir__simdf_load( g, decode+18 );
a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
g = stbir__simdf_swiz( g, 2, 1, 0, 3 );
a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
g = stbir__simdf_swiz( g, 2, 1, 0, 3 );
// stores overlap, need to be in order,
// stores overlap, need to be in order,
stbir__simdf_store( decode, a );
i21 = decode[21];
stbir__simdf_store( decode+3, b );
stbir__simdf_store( decode+3, b );
i23 = decode[23];
stbir__simdf_store( decode+6, c );
stbir__simdf_store( decode+9, d );
@ -4543,7 +4581,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
stbir__span const * spans = stbir_info->scanline_extents.spans;
float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
float * last_decoded = 0;
// if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
@ -4571,12 +4610,12 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
if ( stbir_info->in_pixels_cb )
{
// call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself
input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING, input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
}
STBIR_PROFILE_START( decode );
// convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
STBIR_PROFILE_END( decode );
if (stbir_info->alpha_weight)
@ -4611,9 +4650,19 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
float * marg = full_decode_buffer + x * effective_channels;
float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
if ( e == 1 ) last_decoded = marg + margin * effective_channels;
}
}
}
// some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in
// (we can't pre-zero it, because the input callback can use that area as padding)
last_decoded[0] = 0.0f;
// we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width
// when this happens, we do read that pixel from the input, so it too could be Nan, so just zero an extra one.
// this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING)
last_decoded[1] = 0.0f;
}
@ -4810,12 +4859,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
#define stbir__1_coeff_remnant( ofs ) \
{ stbir__simdf t; \
{ stbir__simdf t,d; \
stbir__simdf_load1z( t, hc + (ofs) ); \
stbir__simdf_load2( d, decode + (ofs) * 2 ); \
stbir__simdf_0123to0011( t, t ); \
stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \
stbir__simdf_mult( t, t, d ); \
stbir__simdf8_add4( tot0, tot0, t ); }
#define stbir__2_coeff_remnant( ofs ) \
{ stbir__simdf t; \
stbir__simdf_load2( t, hc + (ofs) ); \
@ -6191,6 +6241,8 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi
if ( vertical_first )
{
// Now resample the gathered vertical data in the horizontal axis into the encode buffer
decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
decode_buffer[ width_times_channels+1 ] = 0.0f;
stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
}
@ -6362,6 +6414,8 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
void * scanline_scatter_buffer;
void * scanline_scatter_buffer_end;
int on_first_input_y, last_input_y;
int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
int width_times_channels = stbir_info->effective_channels * width;
STBIR_ASSERT( !stbir_info->vertical.is_gather );
@ -6396,7 +6450,12 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
// mark all the buffers as empty to start
for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
{
float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y );
decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
decode_buffer[ width_times_channels+1 ] = 0.0f;
decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
}
// do the loop in input space
on_first_input_y = 1; last_input_y = start_input_y;
@ -6519,11 +6578,11 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
// filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
// In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the
// filter will extend before or after the scanline beyond just one extra entire copy of the
// scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
// width to 3x the total of input pixel (once for the scanline, once for the left side
// overhang, and once for the right side). We only do this for edge mode, since the other
// In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the
// filter will extend before or after the scanline beyond just one extra entire copy of the
// scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
// width to 3x the total of input pixel (once for the scanline, once for the left side
// overhang, and once for the right side). We only do this for edge mode, since the other
// modes can just re-edge clamp back in again.
if ( edge == STBIR_EDGE_WRAP )
if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
@ -6532,11 +6591,11 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
// This is how much to expand buffers to account for filters seeking outside
// the image boundaries.
samp->filter_pixel_margin = samp->filter_pixel_width / 2;
// filter_pixel_margin is the amount that this filter can overhang on just one side of either
// end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
// worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
// this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
// filter_pixel_margin is the amount that this filter can overhang on just one side of either
// end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
// worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
// this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
if ( edge == STBIR_EDGE_WRAP )
if ( samp->filter_pixel_margin > scale_info->input_full_size )
samp->filter_pixel_margin = scale_info->input_full_size;
@ -6544,7 +6603,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding
samp->gather_prescatter_contributors = 0;
samp->gather_prescatter_coefficients = 0;
@ -6714,7 +6773,7 @@ static void stbir__free_internal_mem( stbir__info *info )
STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
STBIR__FREE_AND_CLEAR( info->alloced_mem );
STBIR__FREE_AND_CLEAR( info );
STBIR_FREE( info, info->user_data );
#endif
}
@ -6909,7 +6968,8 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
void * alloced = 0;
size_t alloced_total = 0;
int vertical_first;
int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size;
int alloc_ring_buffer_num_entries;
int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
@ -6954,14 +7014,16 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
// sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
// we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without
// the conversion routines overwriting the callback input data.
decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger
#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
if ( effective_channels == 3 )
decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
#endif
ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding
// if we do vertical first, the ring buffer holds a whole decoded line
if ( vertical_first )
@ -6976,13 +7038,13 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
alloc_ring_buffer_num_entries = conservative_split_output_size;
ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes;
ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes;
// The vertical buffer is used differently, depending on whether we are scattering
// the vertical scanlines, or gathering them.
// If scattering, it's used at the temp buffer to accumulate each output.
// If gathering, it's just the output buffer.
vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding
// we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
for(;;)
@ -7018,9 +7080,9 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
info->offset_x = new_x;
info->offset_y = new_y;
info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries;
info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries;
info->ring_buffer_num_entries = 0;
info->ring_buffer_length_bytes = ring_buffer_length_bytes;
info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes;
info->splits = splits;
info->vertical_first = vertical_first;
@ -7101,19 +7163,24 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
// alloc memory for to-be-pivoted coeffs (if necessary)
if ( vertical->is_gather == 0 )
{
int both;
int temp_mem_amt;
size_t both;
size_t temp_mem_amt;
// when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
// that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
// is too small, we just allocate extra memory to use as this temp.
both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size;
both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size;
#ifdef STBIR__SEPARATE_ALLOCATIONS
temp_mem_amt = decode_buffer_size;
#ifdef STBIR_SIMD8
if ( effective_channels == 3 )
--temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer
#endif
#else
temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits;
temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits;
#endif
if ( temp_mem_amt >= both )
{
@ -7208,33 +7275,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
info->ring_buffer_num_entries = conservative_split_output_size;
STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
// a few of the horizontal gather functions read past the end of the decode (but mask it out),
// so put in normal values so no snans or denormals accidentally sneak in (also, in the ring
// buffer for vertical first)
for( i = 0 ; i < splits ; i++ )
{
int t, ofs, start;
ofs = decode_buffer_size / 4;
start = ofs - 4;
if ( start < 0 ) start = 0;
for( t = start ; t < ofs; t++ )
info->split_info[i].decode_buffer[ t ] = 9999.0f;
if ( vertical_first )
{
int j;
for( j = 0; j < info->ring_buffer_num_entries ; j++ )
{
for( t = start ; t < ofs; t++ )
stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f;
}
}
}
}
#undef STBIR__NEXT_PTR
@ -8197,7 +8238,7 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB
#define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip
#endif
static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -8257,7 +8298,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
decode = decode_end; // backup and do last couple
input = end_input_m16;
}
return;
return decode_end + 16;
}
#endif
@ -8295,6 +8336,8 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
@ -8414,7 +8457,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
#endif
}
static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -8468,7 +8511,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
decode = decode_end; // backup and do last couple
input = end_input_m16;
}
return;
return decode_end + 16;
}
#endif
@ -8506,6 +8549,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
@ -8607,10 +8651,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
#endif
}
static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float const * decode_end = (float*) decode + width_times_channels;
float * decode_end = (float*) decode + width_times_channels;
unsigned char const * input = (unsigned char const *)inputp;
// try to do blocks of 4 when you can
@ -8645,6 +8689,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
#define stbir__min_max_shift20( i, f ) \
@ -8797,11 +8842,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float const * decode_end = (float*) decode + width_times_channels;
float * decode_end = (float*) decode + width_times_channels;
unsigned char const * input = (unsigned char const *)inputp;
do {
decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
@ -8810,6 +8856,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * de
input += 4;
decode += 4;
} while( decode < decode_end );
return decode_end;
}
@ -8882,11 +8929,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float const * decode_end = (float*) decode + width_times_channels;
float * decode_end = (float*) decode + width_times_channels;
unsigned char const * input = (unsigned char const *)inputp;
decode += 4;
while( decode <= decode_end )
{
@ -8903,6 +8951,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * de
decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ];
decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
}
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
@ -8968,7 +9017,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
#endif
static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -9016,7 +9065,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
decode = decode_end; // backup and do last couple
input = end_input_m8;
}
return;
return decode_end + 8;
}
#endif
@ -9054,6 +9103,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
@ -9173,7 +9223,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
#endif
}
static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -9218,7 +9268,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
decode = decode_end; // backup and do last couple
input = end_input_m8;
}
return;
return decode_end + 8;
}
#endif
@ -9256,6 +9306,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
@ -9356,7 +9407,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
#endif
}
static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
{
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
float * decode_end = (float*) decode + width_times_channels;
@ -9402,7 +9453,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
decode = decode_end; // backup and do last couple
input = end_input_m8;
}
return;
return decode_end + 8;
}
#endif
@ -9440,6 +9491,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
input += stbir__coder_min_num;
}
#endif
return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
@ -9526,7 +9578,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
#endif
}
static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
{
#ifdef stbir__decode_swizzle
float STBIR_STREAMOUT_PTR( * ) decode = decodep;
@ -9580,7 +9632,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
decode = decode_end; // backup and do last couple
input = end_input_m16;
}
return;
return decode_end + 16;
}
#endif
@ -9618,12 +9670,15 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
input += stbir__coder_min_num;
}
#endif
return decode_end;
#else
if ( (void*)decodep != inputp )
STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
return decodep + width_times_channels;
#endif
}