diff --git a/external/stb/stb/README.md b/external/stb/stb/README.md index 90f6d383..73abd456 100644 --- a/external/stb/stb/README.md +++ b/external/stb/stb/README.md @@ -24,10 +24,10 @@ library | lastest version | category | LoC | description --------------------- | ---- | -------- | --- | -------------------------------- **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer -**[stb_image.h](stb_image.h)** | 2.29 | graphics | 7985 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC -**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts +**[stb_image.h](stb_image.h)** | 2.30 | graphics | 7988 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC +**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5079 | parse, decode, and rasterize characters from truetype fonts **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP -**[stb_image_resize2.h](stb_image_resize2.h)** | 2.04 | graphics | 10325 | resize images larger/smaller with good quality +**[stb_image_resize2.h](stb_image_resize2.h)** | 2.09 | graphics | 10561 | resize images larger/smaller with good quality **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality **[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++ @@ -38,14 +38,14 @@ library | lastest version | category | LoC | description **[stb_easy_font.h](stb_easy_font.h)** | 1.1 | 3D graphics | 305 | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc 
**[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.42 | game dev | 4187 | embeddable tilemap editor **[stb_herringbone_wa...](stb_herringbone_wang_tile.h)** | 0.7 | game dev | 1221 | herringbone Wang tile map generator -**[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 940 | simplify writing parsers for C-like languages +**[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 941 | simplify writing parsers for C-like languages **[stb_divide.h](stb_divide.h)** | 0.94 | math | 433 | more useful 32-bit modulus e.g. "euclidean divide" **[stb_connected_comp...](stb_connected_components.h)** | 0.96 | misc | 1049 | incrementally compute reachability on grids **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL Total libraries: 21 -Total lines of C code: 50806 +Total lines of C code: 51048 FAQ diff --git a/external/stb/stb/stb_c_lexer.h b/external/stb/stb/stb_c_lexer.h index bf89dca3..fd42f1c3 100644 --- a/external/stb/stb/stb_c_lexer.h +++ b/external/stb/stb/stb_c_lexer.h @@ -38,6 +38,7 @@ // Contributors: // Arpad Goretity (bugfix) // Alan Hickman (hex floats) +// github:mundusnine (bugfix) // // LICENSE // @@ -562,7 +563,6 @@ int stb_c_lexer_get_token(stb_lexer *lexer) { int n = 0; lexer->string = lexer->string_storage; - lexer->string_len = n; do { if (n+1 >= lexer->string_storage_len) return stb__clex_token(lexer, CLEX_parse_error, p, p+n); @@ -576,6 +576,7 @@ int stb_c_lexer_get_token(stb_lexer *lexer) STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' ) ); lexer->string[n] = 0; + lexer->string_len = n; return stb__clex_token(lexer, CLEX_id, p, p+n-1); } diff --git a/external/stb/stb/stb_image.h b/external/stb/stb/stb_image.h index a632d543..9eedabed 100644 --- a/external/stb/stb/stb_image.h +++ b/external/stb/stb/stb_image.h @@ -1,4 +1,4 @@ -/* stb_image - v2.29 - public domain image loader - 
http://nothings.org/stb +/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb no warranty implied; use at your own risk Do this: @@ -48,6 +48,7 @@ LICENSE RECENT REVISION HISTORY: + 2.30 (2024-05-31) avoid erroneous gcc warning 2.29 (2023-05-xx) optimizations 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes @@ -5159,9 +5160,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } if (z->depth == 16) { - for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is } else { - for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + for (k = 0; k < s->img_n && k < 3; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger } } break; diff --git a/external/stb/stb/stb_image_resize2.h b/external/stb/stb/stb_image_resize2.h index faf1b089..86e66d28 100644 --- a/external/stb/stb/stb_image_resize2.h +++ b/external/stb/stb/stb_image_resize2.h @@ -1,4 +1,4 @@ -/* stb_image_resize2 - v2.04 - public domain image resizing +/* stb_image_resize2 - v2.09 - public domain image resizing by Jeff Roberts (v2) and Jorge L Rodriguez http://github.com/nothings/stb @@ -320,19 +320,32 @@ CONTRIBUTORS Jeff Roberts: 2.0 implementation, optimizations, SIMD - Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer. 
+ Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer Fabian Giesen: half float and srgb converters Sean Barrett: API design, optimizations Jorge L Rodriguez: Original 1.0 implementation - Aras Pranckevicius: bugfixes for 1.0 + Aras Pranckevicius: bugfixes Nathan Reed: warning fixes for 1.0 REVISIONS - 2.04 (2023-11-17) Fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic). + 2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting + hardware half floats). + 2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks + to Ryan Salsbury), fix for sub-rect resizes, use the + pragmas to control unrolling when they are available. + 2.07 (2024-05-24) fix for slow final split during threaded conversions of very + wide scanlines when downsampling (caused by extra input + converting), fix for wide scanline resamples with many + splits (int overflow), fix GCC warning. + 2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling + undersampling a single row on rare resize ratios (about 1%). + 2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras), + fix for output callback (thanks Julien Koenen). + 2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic). 2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks. 2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc - (2x-5x faster without simd, 4x-12x faster with simd) - (in some cases, 20x to 40x faster - resizing to very small for example) + 2x-5x faster without simd, 4x-12x faster with simd, + in some cases, 20x to 40x faster esp resizing large to very small. 
0.96 (2019-03-04) fixed warnings 0.95 (2017-07-23) fixed warnings 0.94 (2017-03-18) fixed warnings @@ -402,13 +415,13 @@ typedef uint64_t stbir_uint64; #endif #endif -#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(_M_ARM) || (__ARM_NEON_FP & 4) != 0 && __ARM_FP16_FORMAT_IEEE != 0 +#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__) #ifndef STBIR_NEON #define STBIR_NEON #endif #endif -#if defined(_M_ARM) +#if defined(_M_ARM) || defined(__arm__) #ifdef STBIR_USE_FMA #undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC #endif @@ -1064,7 +1077,7 @@ struct stbir__info stbir__alpha_unweight_func * alpha_unweight; stbir__encode_pixels_func * encode_pixels; - int alloced_total; + int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated int splits; // count of splits stbir_internal_pixel_layout input_pixel_layout_internal; @@ -1075,7 +1088,7 @@ struct stbir__info int vertical_first; int channels; int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3) - int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated + size_t alloced_total; }; @@ -1086,10 +1099,11 @@ struct stbir__info #define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20)) // min/max friendly -#define STBIR_CLAMP(x, xmin, xmax) do { \ +#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \ if ( (x) < (xmin) ) (x) = (xmin); \ if ( (x) > (xmax) ) (x) = (xmax); \ -} while (0) + break; \ +} static stbir__inline int stbir__min(int a, int b) { @@ -1186,19 +1200,35 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split? 
#endif -// restrict pointers for the output pointers +// restrict pointers for the output pointers, other loop and unroll control #if defined( _MSC_VER ) && !defined(__clang__) #define STBIR_STREAMOUT_PTR( star ) star __restrict #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop -#elif defined( __clang__ ) - #define STBIR_STREAMOUT_PTR( star ) star __restrict__ - #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) -#elif defined( __GNUC__ ) + #if _MSC_VER >= 1900 + #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif +#elif defined( __clang__ ) + #define STBIR_STREAMOUT_PTR( star ) star __restrict__ + #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) + #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) ) + #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)") + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif +#elif defined( __GNUC__ ) #define STBIR_STREAMOUT_PTR( star ) star __restrict__ #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) + #if __GNUC__ >= 14 + #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector") + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif #else #define STBIR_STREAMOUT_PTR( star ) star #define STBIR_NO_UNROLL( ptr ) + #define STBIR_NO_UNROLL_LOOP_START #endif #ifdef STBIR_NO_SIMD // force simd off for whatever reason @@ -1750,11 +1780,19 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \ vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \ ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) ) + + static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t 
regb) + { + uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) }; + return r; + } #else #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3} + #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}} #endif #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) ) + #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) ) #define stbir__simdi_16madd( out, reg0, reg1 ) \ { \ @@ -2138,7 +2176,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #endif -#if defined(STBIR_NEON) && !defined(_M_ARM) +#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__) #if defined( _MSC_VER ) && !defined(__clang__) typedef __int16 stbir__FP16; @@ -2155,7 +2193,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #endif -#if !defined(STBIR_NEON) && !defined(STBIR_FP16C) || defined(STBIR_NEON) && defined(_M_ARM) +#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__)) // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668 @@ -2382,7 +2420,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) stbir__simdi_store( output,final ); } -#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM)) // WASM or 32-bit ARM on MSVC/clang +#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) { @@ -2428,7 +2466,7 @@ static stbir__inline stbir_uint8 
stbir__linear_to_srgb_uchar(float in) return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0]; } -#elif defined(STBIR_NEON) // 64-bit ARM +#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) { @@ -2506,11 +2544,12 @@ static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000); // Adding this switch saves about 5K on clang which is Captain Unroll the 3rd. #define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star ) #define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START #ifdef STBIR_MEMCPY #undef STBIR_MEMCPY -#define STBIR_MEMCPY stbir_simd_memcpy #endif +#define STBIR_MEMCPY stbir_simd_memcpy // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies) static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) @@ -2528,6 +2567,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) { if ( bytes ) { + STBIR_SIMD_NO_UNROLL_LOOP_START do { STBIR_SIMD_NO_UNROLL(d); @@ -2542,8 +2582,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) // do one unaligned to get us aligned for the stream out below stbir__simdf_load( x, ( d + ofs_to_src ) ); stbir__simdf_store( d, x ); - d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 ); + d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 ); + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { STBIR_SIMD_NO_UNROLL(d); @@ -2574,8 +2615,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 ); stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 ); stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 ); - d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) ); + d = 
(char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) ); + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { STBIR_SIMD_NO_UNROLL(d); @@ -2612,6 +2654,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away? { char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15); + STBIR_SIMD_NO_UNROLL_LOOP_START do { stbir__simdf x; @@ -2638,6 +2681,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte // when in scalar mode, we let unrolling happen, so this macro just does the __restrict #define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star ) #define STBIR_SIMD_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL_LOOP_START #endif // SSE2 @@ -2749,7 +2793,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #ifndef STBIR_SIMD -// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be +// memcpy that is specifically intentionally overlapping (src is smaller than dest, so can be // a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to // the diff between dest and src) static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) @@ -2761,6 +2805,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away? 
{ char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7); + STBIR_NO_UNROLL_LOOP_START do { STBIR_NO_UNROLL(sd); @@ -2772,6 +2817,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte return; } + STBIR_NO_UNROLL_LOOP_START do { STBIR_NO_UNROLL(sd); @@ -2876,13 +2922,6 @@ static float stbir__filter_mitchell(float x, float s, void * user_data) return (0.0f); } -static float stbir__support_zero(float s, void * user_data) -{ - STBIR__UNUSED(s); - STBIR__UNUSED(user_data); - return 0; -} - static float stbir__support_zeropoint5(float s, void * user_data) { STBIR__UNUSED(s); @@ -3200,8 +3239,8 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel if ( edge == STBIR_EDGE_WRAP ) { - if ( first <= -input_size ) - first = -(input_size-1); + if ( first < -input_size ) + first = -input_size; if ( last >= (input_size*2)) last = (input_size*2) - 1; } @@ -3392,6 +3431,12 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int } } +#ifdef STBIR_RENORMALIZE_IN_FLOAT +#define STBIR_RENORM_TYPE float +#else +#define STBIR_RENORM_TYPE double +#endif + static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width ) { int input_size = scale_info->input_full_size; @@ -3413,14 +3458,14 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter for (n = 0; n < end; n++) { int i; - float filter_scale, total_filter = 0; + STBIR_RENORM_TYPE filter_scale, total_filter = 0; int e; // add all contribs e = contribs->n1 - contribs->n0; for( i = 0 ; i <= e ; i++ ) { - total_filter += coeffs[i]; + total_filter += (STBIR_RENORM_TYPE) coeffs[i]; STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f ) ); // check for wonky weights } @@ -3436,10 +3481,11 @@ static void 
stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter // if the total isn't 1.0, rescale everything if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) ) { - filter_scale = 1.0f / total_filter; + filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter; + // scale them all for (i = 0; i <= e; i++) - coeffs[i] *= filter_scale; + coeffs[i] = (float) ( coeffs[i] * filter_scale ); } } ++contribs; @@ -3560,7 +3606,9 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter filter_info->widest = widest; } -static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row_width ) +#undef STBIR_RENORM_TYPE + +static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) { #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; } #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; } @@ -3569,6 +3617,10 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* #else #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; } #endif + + int row_end = row1 + 1; + STBIR__UNUSED( row0 ); // only used in an assert + if ( coefficient_width != widest ) { float * pc = coefficents; @@ -3577,6 +3629,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* switch( widest ) { case 1: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_1( pc, coeffs ); ++pc; @@ -3584,6 +3637,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 2: + STBIR_NO_UNROLL_LOOP_START do { 
STBIR_MOVE_2( pc, coeffs ); pc += 2; @@ -3591,6 +3645,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 3: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_2( pc, coeffs ); STBIR_MOVE_1( pc+2, coeffs+2 ); @@ -3599,6 +3654,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 4: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); pc += 4; @@ -3606,6 +3662,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 5: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_1( pc+4, coeffs+4 ); @@ -3614,6 +3671,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 6: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_2( pc+4, coeffs+4 ); @@ -3622,6 +3680,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 7: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_2( pc+4, coeffs+4 ); @@ -3631,6 +3690,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 8: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3639,6 +3699,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 9: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3648,6 +3709,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 10: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3657,6 +3719,7 @@ static int stbir__pack_coefficients( int num_contributors, 
stbir__contributors* } while ( pc < pc_end ); break; case 11: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3667,6 +3730,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 12: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3676,6 +3740,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; default: + STBIR_NO_UNROLL_LOOP_START do { float * copy_end = pc + widest - 4; float * c = coeffs; @@ -3686,6 +3751,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* c += 4; } while ( pc <= copy_end ); copy_end += 4; + STBIR_NO_UNROLL_LOOP_START while ( pc < copy_end ) { STBIR_MOVE_1( pc, c ); @@ -3710,10 +3776,10 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* float * coeffs = coefficents + widest * ( num_contributors - 1 ); // go until no chance of clipping (this is usually less than 8 lops) - while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_width ) ) + while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) ) { // might we clip?? 
- if ( ( contribs->n0 + widest ) > row_width ) + if ( ( contribs->n0 + widest ) > row_end ) { int stop_range = widest; @@ -3732,15 +3798,15 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } // now see if we still clip with the refined range - if ( ( contribs->n0 + stop_range ) > row_width ) + if ( ( contribs->n0 + stop_range ) > row_end ) { - int new_n0 = row_width - stop_range; + int new_n0 = row_end - stop_range; int num = contribs->n1 - contribs->n0 + 1; int backup = contribs->n0 - new_n0; float * from_co = coeffs + num - 1; float * to_co = from_co + backup; - STBIR_ASSERT( ( new_n0 >= 0 ) && ( new_n0 < contribs->n0 ) ); + STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) ); // move the coeffs over while( num ) @@ -3863,26 +3929,33 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot for (k = gn0 ; k <= gn1 ; k++ ) { float gc = *g_coeffs++; - if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) ) + + // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width + // (which happens when pivoting from horizontal, which might have dummy zeros) + if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) ) { + if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) ) { - // if we are skipping over several contributors, we need to clear the skipped ones - stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1); - while ( clear_contributors < scatter_contributors ) { - clear_contributors->n0 = 0; - clear_contributors->n1 = -1; - ++clear_contributors; + // if we are skipping over several contributors, we need to clear the skipped ones + stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1); + while ( clear_contributors < scatter_contributors ) + { + clear_contributors->n0 = 0; + 
clear_contributors->n1 = -1; + ++clear_contributors; + } } + scatter_contributors->n0 = n; + scatter_contributors->n1 = n; + scatter_coeffs[0] = gc; + highest_set = k; } - scatter_contributors->n0 = n; - scatter_contributors->n1 = n; - scatter_coeffs[0] = gc; - highest_set = k; - } - else - { - stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc ); + else + { + stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc ); + } + STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width ); } ++scatter_contributors; scatter_coeffs += scatter_coefficient_width; @@ -3989,6 +4062,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c #ifdef STBIR_SIMD8 decode += 16; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdf8 d0,d1,a0,a1,p0,p1; @@ -4013,6 +4087,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c decode -= 16; #else decode += 8; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdf d0,a0,d1,a1,p0,p1; @@ -4035,12 +4110,14 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c // might be one last odd pixel #ifdef STBIR_SIMD8 + STBIR_NO_UNROLL_LOOP_START while ( decode < end_decode ) #else if ( decode < end_decode ) #endif { stbir__simdf d,a,p; + STBIR_NO_UNROLL(decode); stbir__simdf_load( d, decode ); stbir__simdf_0123to3333( a, d ); stbir__simdf_mult( p, a, d ); @@ -4082,6 +4159,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c decode += 8; if ( decode <= end_decode ) { + STBIR_NO_UNROLL_LOOP_START do { #ifdef STBIR_SIMD8 stbir__simdf8 d0,a0,p0; @@ -4125,6 +4203,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c decode -= 8; #endif + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode < end_decode ) { float x = decode[0], y = decode[1]; @@ -4145,6 +4224,7 @@ static void 
stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_ti // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm + STBIR_SIMD_NO_UNROLL_LOOP_START do { float alpha = input[3]; #ifdef STBIR_SIMD @@ -4212,6 +4292,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim #ifdef STBIR_SIMD { decode += 2 * stbir__simdfX_float_count; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdfX d0,a0,d1,a1; @@ -4230,6 +4311,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim // few last pixels remnants #ifdef STBIR_SIMD8 + STBIR_NO_UNROLL_LOOP_START while ( decode < end_decode ) #else if ( decode < end_decode ) @@ -4265,6 +4347,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim #ifdef STBIR_SIMD decode += 2 * stbir__simdfX_float_count; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdfX d0,a0,d1,a1; @@ -4282,6 +4365,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim decode -= 2 * stbir__simdfX_float_count; #endif + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode < end_decode ) { float alpha = decode[1]; @@ -4296,6 +4380,7 @@ static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_t float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; float const * end_output = encode_buffer + width_times_channels; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float alpha = encode[3]; @@ -4343,9 +4428,77 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann float STBIR_STREAMOUT_PTR(*) decode = decode_buffer; float const * end_decode = decode_buffer + width_times_channels; - decode += 12; +#ifdef STBIR_SIMD + #ifdef stbir__simdf_swiz2 // do we have two argument swizzles? 
+ end_decode -= 12; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // on arm64 8 instructions, no overlapping stores + stbir__simdf a,b,c,na,nb; + STBIR_SIMD_NO_UNROLL(decode); + stbir__simdf_load( a, decode ); + stbir__simdf_load( b, decode+4 ); + stbir__simdf_load( c, decode+8 ); + + na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 ); + b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 ); + nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 ); + c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 ); + + stbir__simdf_store( decode, na ); + stbir__simdf_store( decode+4, nb ); + stbir__simdf_store( decode+8, c ); + decode += 12; + } + end_decode += 12; + #else + end_decode -= 24; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // 26 instructions on x64 + stbir__simdf a,b,c,d,e,f,g; + float i21, i23; + STBIR_SIMD_NO_UNROLL(decode); + stbir__simdf_load( a, decode ); + stbir__simdf_load( b, decode+3 ); + stbir__simdf_load( c, decode+6 ); + stbir__simdf_load( d, decode+9 ); + stbir__simdf_load( e, decode+12 ); + stbir__simdf_load( f, decode+15 ); + stbir__simdf_load( g, decode+18 ); + + a = stbir__simdf_swiz( a, 2, 1, 0, 3 ); + b = stbir__simdf_swiz( b, 2, 1, 0, 3 ); + c = stbir__simdf_swiz( c, 2, 1, 0, 3 ); + d = stbir__simdf_swiz( d, 2, 1, 0, 3 ); + e = stbir__simdf_swiz( e, 2, 1, 0, 3 ); + f = stbir__simdf_swiz( f, 2, 1, 0, 3 ); + g = stbir__simdf_swiz( g, 2, 1, 0, 3 ); + + // stores overlap, need to be in order, + stbir__simdf_store( decode, a ); + i21 = decode[21]; + stbir__simdf_store( decode+3, b ); + i23 = decode[23]; + stbir__simdf_store( decode+6, c ); + stbir__simdf_store( decode+9, d ); + stbir__simdf_store( decode+12, e ); + stbir__simdf_store( decode+15, f ); + stbir__simdf_store( decode+18, g ); + decode[21] = i23; + decode[23] = i21; + decode += 24; + } + end_decode += 24; + #endif +#else + end_decode -= 12; + STBIR_NO_UNROLL_LOOP_START while( decode <= end_decode ) { + // 16 instructions float t0,t1,t2,t3; STBIR_NO_UNROLL(decode); t0 = decode[0]; t1 
= decode[3]; t2 = decode[6]; t3 = decode[9]; @@ -4353,8 +4506,10 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3; decode += 12; } - decode -= 12; + end_decode += 12; +#endif + STBIR_NO_UNROLL_LOOP_START while( decode < end_decode ) { float t = decode[0]; @@ -4375,7 +4530,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir_edge edge_horizontal = stbir_info->horizontal.edge; stbir_edge edge_vertical = stbir_info->vertical.edge; int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size); - const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes; + const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes; stbir__span const * spans = stbir_info->scanline_extents.spans; float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels; @@ -5958,7 +6113,7 @@ static void stbir__encode_scanline( stbir__info const * stbir_info, void *output // if we have an output callback, call it to send the data if ( stbir_info->out_pixels_cb ) - stbir_info->out_pixels_cb( output_buffer_data, num_pixels, row, stbir_info->user_data ); + stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data ); } @@ -6028,7 +6183,7 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); } - stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes), + stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes), encode_buffer, n 
STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); } @@ -6069,7 +6224,7 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__ // initialize the ring buffer for gathering split_info->ring_buffer_begin_index = 0; - split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest; + split_info->ring_buffer_first_scanline = vertical_contributors->n0; split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty" for (y = start_output_y; y < end_output_y; y++) @@ -6123,7 +6278,7 @@ static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_ float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index ); // dump the scanline out - stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); // mark it as empty ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER; @@ -6144,7 +6299,7 @@ static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(st stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); // dump the scanline out - stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) 
+ ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); // mark it as empty ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER; @@ -6352,15 +6507,31 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir // pre calculate stuff based on the above samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data); + // filter_pixel_width is the conservative size in pixels of input that affect an output pixel. + // In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the + // filter will extend before or after the scanline beyond just one extra entire copy of the + // scanline (we would hit the edge twice). We don't let you do that, so we clamp the total + // width to 3x the total of input pixel (once for the scanline, once for the left side + // overhang, and once for the right side). We only do this for edge mode, since the other + // modes can just re-edge clamp back in again. if ( edge == STBIR_EDGE_WRAP ) - if ( samp->filter_pixel_width > ( scale_info->input_full_size * 2 ) ) // this can only happen when shrinking to a single pixel - samp->filter_pixel_width = scale_info->input_full_size * 2; + if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) ) + samp->filter_pixel_width = scale_info->input_full_size * 3; // This is how much to expand buffers to account for filters seeking outside // the image boundaries. samp->filter_pixel_margin = samp->filter_pixel_width / 2; + + // filter_pixel_margin is the amount that this filter can overhang on just one side of either + // end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's + // worth of pixels, we clamp this one side of overhang to the input scanline size. 
Again, + // this clamping only happens in rare cases with the default filters (2 pix to 1 pix). + if ( edge == STBIR_EDGE_WRAP ) + if ( samp->filter_pixel_margin > scale_info->input_full_size ) + samp->filter_pixel_margin = scale_info->input_full_size; samp->num_contributors = stbir__get_contributors(samp, samp->is_gather); + samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors); samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding @@ -6725,7 +6896,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample stbir__info * info = 0; void * alloced = 0; - int alloced_total = 0; + size_t alloced_total = 0; int vertical_first; int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries; @@ -6996,7 +7167,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample stbir__get_extents( horizontal, &info->scanline_extents ); // pack the horizontal coeffs - horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 ); + horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 ); STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) ); @@ -7027,36 +7198,27 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample info->ring_buffer_num_entries = conservative_split_output_size; STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries ); - // a few of the horizontal gather functions read 
one dword past the end (but mask it out), so put in a normal value so no snans or denormals accidentally sneak in + // a few of the horizontal gather functions read past the end of the decode (but mask it out), + // so put in normal values so no snans or denormals accidentally sneak in (also, in the ring + // buffer for vertical first) for( i = 0 ; i < splits ; i++ ) { - int width, ofs; + int t, ofs, start; - // find the right most span - if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 ) - width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0; - else - width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0; + ofs = decode_buffer_size / 4; + start = ofs - 4; + if ( start < 0 ) start = 0; - // this calc finds the exact end of the decoded scanline for all filter modes. - // usually this is just the width * effective channels. But we have to account - // for the area to the left of the scanline for wrap filtering and alignment, this - // is stored as a negative value in info->scanline_extents.conservative.n0. Next, - // we need to skip the exact size of the right hand size filter area (again for - // wrap mode), this is in info->scanline_extents.edge_sizes[1]). 
- ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels; + for( t = start ; t < ofs; t++ ) + info->split_info[i].decode_buffer[ t ] = 9999.0f; - // place a known, but numerically valid value in the decode buffer - info->split_info[i].decode_buffer[ ofs ] = 9999.0f; - - // if vertical filtering first, place a known, but numerically valid value in the all - // of the ring buffer accumulators if ( vertical_first ) { int j; for( j = 0; j < info->ring_buffer_num_entries ; j++ ) { - stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f; + for( t = start ; t < ofs; t++ ) + stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f; } } } @@ -7068,7 +7230,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample // is this the first time through loop? if ( info == 0 ) { - alloced_total = (int) ( 15 + (size_t)advance_mem ); + alloced_total = ( 15 + (size_t)advance_mem ); alloced = STBIR_MALLOC( alloced_total, user_data ); if ( alloced == 0 ) return 0; @@ -7185,7 +7347,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type]; // calc offset - info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] ); + info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] ); info->in_pixels_cb = resize->input_cb; info->user_data = resize->user_data; @@ -7757,7 +7919,7 @@ static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * o if ( output_stride_in_bytes < pitch ) return 0; - size = output_stride_in_bytes * 
output_h; + size = (size_t)output_stride_in_bytes * (size_t)output_h; if ( size == 0 ) return 0; @@ -8035,6 +8197,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco if ( width_times_channels >= 16 ) { decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START for(;;) { #ifdef STBIR_SIMD8 @@ -8090,6 +8253,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -8105,6 +8269,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8131,6 +8296,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_NO_UNROLL_LOOP_START for(;;) { stbir__simdfX e0, e1; @@ -8162,6 +8328,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e0; @@ -8180,6 +8347,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { stbir__simdf e0; @@ -8216,6 +8384,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -8245,6 +8414,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * 
decodep, int if ( width_times_channels >= 16 ) { decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START for(;;) { #ifdef STBIR_SIMD8 @@ -8294,6 +8464,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -8309,6 +8480,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8335,6 +8507,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdfX e0, e1; @@ -8366,6 +8539,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e0; @@ -8404,6 +8578,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -8444,6 +8619,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8529,12 +8705,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; #ifdef STBIR_SIMD - stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8; 
if ( width_times_channels >= 16 ) { float const * end_encode_m16 = encode + width_times_channels - 16; end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdf f0, f1, f2, f3; @@ -8548,7 +8724,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w stbir__min_max_shift20( i2, f2 ); stbir__min_max_shift20( i3, f3 ); - stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb ); + stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); stbir__linear_to_srgb_finish( i0, f0 ); stbir__linear_to_srgb_finish( i1, f1 ); @@ -8573,6 +8749,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while ( output <= end_output ) { STBIR_SIMD_NO_UNROLL(encode); @@ -8590,6 +8767,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { STBIR_NO_UNROLL(encode); @@ -8630,12 +8808,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; #ifdef STBIR_SIMD - stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8; if ( width_times_channels >= 16 ) { float const * end_encode_m16 = encode + width_times_channels - 16; end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdf f0, f1, f2, f3; @@ -8649,7 +8827,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o stbir__min_max_shift20( i2, f2 ); stbir__scale_and_convert( i3, f3 ); - stbir__simdi_table_lookup3( i0, i1, i2, to_srgb ); + stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); stbir__linear_to_srgb_finish( i0, f0 ); stbir__linear_to_srgb_finish( i1, f1 ); @@ -8671,6 
+8849,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o } #endif + STBIR_SIMD_NO_UNROLL_LOOP_START do { float f; STBIR_SIMD_NO_UNROLL(encode); @@ -8721,12 +8900,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; #ifdef STBIR_SIMD - stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8; if ( width_times_channels >= 16 ) { float const * end_encode_m16 = encode + width_times_channels - 16; end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdf f0, f1, f2, f3; @@ -8740,7 +8919,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o stbir__min_max_shift20( i2, f2 ); stbir__scale_and_convert( i3, f3 ); - stbir__simdi_table_lookup2( i0, i2, to_srgb ); + stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); stbir__linear_to_srgb_finish( i0, f0 ); stbir__linear_to_srgb_finish( i2, f2 ); @@ -8760,6 +8939,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o } #endif + STBIR_SIMD_NO_UNROLL_LOOP_START do { float f; STBIR_SIMD_NO_UNROLL(encode); @@ -8788,6 +8968,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod if ( width_times_channels >= 8 ) { decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START for(;;) { #ifdef STBIR_SIMD8 @@ -8831,6 +9012,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -8846,6 +9028,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8874,6 +9057,7 @@ 
static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdfX e0, e1; @@ -8901,6 +9085,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e; @@ -8919,6 +9104,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { stbir__simdf e; @@ -8940,6 +9126,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { float f; @@ -8956,6 +9143,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -8985,6 +9173,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int if ( width_times_channels >= 8 ) { decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START for(;;) { #ifdef STBIR_SIMD8 @@ -9025,6 +9214,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -9040,6 +9230,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int // do the remnants #if stbir__coder_min_num < 4 + 
STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -9067,6 +9258,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdfX e0, e1; @@ -9094,6 +9286,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e; @@ -9115,6 +9308,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { float f; @@ -9133,6 +9327,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -9161,6 +9356,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, { stbir__FP16 const * end_input_m8 = input + width_times_channels - 8; decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START for(;;) { STBIR_NO_UNROLL(decode); @@ -9202,6 +9398,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -9217,6 +9414,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ 
-9243,6 +9441,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp { float const * end_encode_m8 = encode + width_times_channels - 8; end_output -= 8; + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { STBIR_SIMD_NO_UNROLL(encode); @@ -9283,6 +9482,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { STBIR_SIMD_NO_UNROLL(output); @@ -9298,6 +9498,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { STBIR_NO_UNROLL(output); @@ -9326,6 +9527,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int { float const * end_input_m16 = input + width_times_channels - 16; decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START for(;;) { STBIR_NO_UNROLL(decode); @@ -9374,6 +9576,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -9389,6 +9592,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -9448,6 +9652,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int { float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 ); end_output -= ( stbir__simdfX_float_count * 2 ); + STBIR_SIMD_NO_UNROLL_LOOP_START for(;;) { stbir__simdfX e0, e1; @@ -9481,6 +9686,7 @@ static void STBIR__CODER_NAME( 
stbir__encode_float_linear )( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e0; @@ -9505,6 +9711,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { float e; @@ -9524,6 +9731,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float e; @@ -9634,6 +9842,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); ) stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); ) stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); ) + STBIR_SIMD_NO_UNROLL_LOOP_START while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) { stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3; @@ -9688,6 +9897,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output input += (4*stbir__simdfX_float_count); stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); ) } + STBIR_SIMD_NO_UNROLL_LOOP_START while ( ( (char*)input_end - (char*) input ) >= 16 ) { stbir__simdf o0, r0; @@ -9720,6 +9930,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output } } #else + STBIR_NO_UNROLL_LOOP_START 
while ( ( (char*)input_end - (char*) input ) >= 16 ) { float r0, r1, r2, r3; @@ -9751,6 +9962,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; ) } #endif + STBIR_NO_UNROLL_LOOP_START while ( input < input_end ) { float r = input[0]; @@ -9814,6 +10026,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); ) stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); ) + STBIR_SIMD_NO_UNROLL_LOOP_START while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) { stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3; @@ -9858,6 +10071,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); ) } + STBIR_SIMD_NO_UNROLL_LOOP_START while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) { stbir__simdf o0, r0; @@ -9882,6 +10096,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, } } #else + STBIR_NO_UNROLL_LOOP_START while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) { float o0, o1, o2, o3; @@ -9903,6 +10118,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; ) } #endif + 
STBIR_NO_UNROLL_LOOP_START while ( input0 < input0_end ) { float o0; @@ -9995,6 +10211,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( floa { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10007,6 +10224,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10019,6 +10237,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10031,6 +10250,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10043,6 +10263,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * 
STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10056,6 +10277,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10070,6 +10292,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( flo float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10084,6 +10307,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10097,6 +10321,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float 
const * hc = horizontal_coefficients; @@ -10111,6 +10336,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( fl { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10126,6 +10352,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( fl float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10140,6 +10367,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( fl { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; @@ -10154,12 +10382,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += 
STBIR__horizontal_channels * 4; @@ -10174,12 +10404,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; @@ -10195,12 +10427,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; @@ -10218,12 +10452,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; diff --git 
a/external/stb/stb/stb_truetype.h b/external/stb/stb/stb_truetype.h index bbf2284b..90a5c2e2 100644 --- a/external/stb/stb/stb_truetype.h +++ b/external/stb/stb/stb_truetype.h @@ -54,7 +54,7 @@ // Hou Qiming Derek Vinyard // Rob Loach Cort Stratton // Kenney Phillis Jr. Brian Costabile -// Ken Voskuil (kaesve) +// Ken Voskuil (kaesve) Yakov Galka // // VERSION HISTORY // @@ -4604,6 +4604,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc scale_y = -scale_y; { + // distance from singular values (in the same units as the pixel grid) + const float eps = 1./1024, eps2 = eps*eps; int x,y,i,j; float *precompute; stbtt_vertex *verts; @@ -4616,15 +4618,15 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y; float x1 = verts[j].x*scale_x, y1 = verts[j].y*scale_y; float dist = (float) STBTT_sqrt((x1-x0)*(x1-x0) + (y1-y0)*(y1-y0)); - precompute[i] = (dist == 0) ? 0.0f : 1.0f / dist; + precompute[i] = (dist < eps) ? 
0.0f : 1.0f / dist; } else if (verts[i].type == STBTT_vcurve) { float x2 = verts[j].x *scale_x, y2 = verts[j].y *scale_y; float x1 = verts[i].cx*scale_x, y1 = verts[i].cy*scale_y; float x0 = verts[i].x *scale_x, y0 = verts[i].y *scale_y; float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2; float len2 = bx*bx + by*by; - if (len2 != 0.0f) - precompute[i] = 1.0f / (bx*bx + by*by); + if (len2 >= eps2) + precompute[i] = 1.0f / len2; else precompute[i] = 0.0f; } else @@ -4689,8 +4691,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc float a = 3*(ax*bx + ay*by); float b = 2*(ax*ax + ay*ay) + (mx*bx+my*by); float c = mx*ax+my*ay; - if (a == 0.0) { // if a is 0, it's linear - if (b != 0.0) { + if (STBTT_fabs(a) < eps2) { // if a is 0, it's linear + if (STBTT_fabs(b) >= eps2) { res[num++] = -c/b; } } else { diff --git a/external/stb/stb/tests/stb.dsp b/external/stb/stb/tests/stb.dsp index 849b95be..b7039c74 100644 --- a/external/stb/stb/tests/stb.dsp +++ b/external/stb/stb/tests/stb.dsp @@ -130,6 +130,10 @@ SOURCE=..\stb_image.h # End Source File # Begin Source File +SOURCE=..\stb_image_resize2.h +# End Source File +# Begin Source File + SOURCE=..\stb_image_write.h # End Source File # Begin Source File diff --git a/external/stb/stb/tests/test_vorbis.c b/external/stb/stb/tests/test_vorbis.c index d54ed231..fc795c8e 100644 --- a/external/stb/stb/tests/test_vorbis.c +++ b/external/stb/stb/tests/test_vorbis.c @@ -8,7 +8,7 @@ extern void stb_vorbis_dumpmem(void); int main(int argc, char **argv) { size_t memlen; - unsigned char *mem = stb_fileu("c:/x/sketch008.ogg", &memlen); + unsigned char *mem = stb_fileu("../../lib/vorbis/sample/sketch008.ogg", &memlen); int chan, samplerate; short *output; int samples = stb_vorbis_decode_memory(mem, memlen, &chan, &samplerate, &output);