diff --git a/external/stb/stb/README.md b/external/stb/stb/README.md
index 90f6d383..73abd456 100644
--- a/external/stb/stb/README.md
+++ b/external/stb/stb/README.md
@@ -24,10 +24,10 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
-**[stb_image.h](stb_image.h)** | 2.29 | graphics | 7985 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
-**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
+**[stb_image.h](stb_image.h)** | 2.30 | graphics | 7988 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5079 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
-**[stb_image_resize2.h](stb_image_resize2.h)** | 2.04 | graphics | 10325 | resize images larger/smaller with good quality
+**[stb_image_resize2.h](stb_image_resize2.h)** | 2.09 | graphics | 10561 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
 **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
@@ -38,14 +38,14 @@ library    | lastest version | category | LoC | description
 **[stb_easy_font.h](stb_easy_font.h)** | 1.1 | 3D&nbsp;graphics | 305 | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc
 **[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.42 | game&nbsp;dev | 4187 | embeddable tilemap editor
 **[stb_herringbone_wa...](stb_herringbone_wang_tile.h)** | 0.7 | game&nbsp;dev | 1221 | herringbone Wang tile map generator
-**[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 940 | simplify writing parsers for C-like languages
+**[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 941 | simplify writing parsers for C-like languages
 **[stb_divide.h](stb_divide.h)** | 0.94 | math | 433 | more useful 32-bit modulus e.g. "euclidean divide"
 **[stb_connected_comp...](stb_connected_components.h)** | 0.96 | misc | 1049 | incrementally compute reachability on grids
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 50806
+Total lines of C code: 51048
 
 
 FAQ
diff --git a/external/stb/stb/stb_c_lexer.h b/external/stb/stb/stb_c_lexer.h
index bf89dca3..fd42f1c3 100644
--- a/external/stb/stb/stb_c_lexer.h
+++ b/external/stb/stb/stb_c_lexer.h
@@ -38,6 +38,7 @@
 // Contributors:
 //   Arpad Goretity (bugfix)
 //   Alan Hickman (hex floats)
+//   github:mundusnine (bugfix)
 //
 // LICENSE
 //
@@ -562,7 +563,6 @@ int stb_c_lexer_get_token(stb_lexer *lexer)
          {
             int n = 0;
             lexer->string = lexer->string_storage;
-            lexer->string_len = n;
             do {
                if (n+1 >= lexer->string_storage_len)
                   return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
@@ -576,6 +576,7 @@ int stb_c_lexer_get_token(stb_lexer *lexer)
                 STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
             );
             lexer->string[n] = 0;
+            lexer->string_len = n;
             return stb__clex_token(lexer, CLEX_id, p, p+n-1);
          }
 
diff --git a/external/stb/stb/stb_image.h b/external/stb/stb/stb_image.h
index a632d543..9eedabed 100644
--- a/external/stb/stb/stb_image.h
+++ b/external/stb/stb/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.29 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -48,6 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.30  (2024-05-31) avoid erroneous gcc warning
       2.29  (2023-05-xx) optimizations
       2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
       2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
@@ -5159,9 +5160,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
                if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
                if (z->depth == 16) {
-                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+                  for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning
+                     tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
                } else {
-                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+                  for (k = 0; k < s->img_n && k < 3; ++k)
+                     tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
                }
             }
             break;
diff --git a/external/stb/stb/stb_image_resize2.h b/external/stb/stb/stb_image_resize2.h
index faf1b089..86e66d28 100644
--- a/external/stb/stb/stb_image_resize2.h
+++ b/external/stb/stb/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.04 - public domain image resizing
+/* stb_image_resize2 - v2.09 - public domain image resizing
 
    by Jeff Roberts (v2) and Jorge L Rodriguez
    http://github.com/nothings/stb
@@ -320,19 +320,32 @@
 
    CONTRIBUTORS
       Jeff Roberts: 2.0 implementation, optimizations, SIMD
-      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer.
+      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
       Fabian Giesen: half float and srgb converters
       Sean Barrett: API design, optimizations
       Jorge L Rodriguez: Original 1.0 implementation
-      Aras Pranckevicius: bugfixes for 1.0
+      Aras Pranckevicius: bugfixes
       Nathan Reed: warning fixes for 1.0
 
    REVISIONS
-      2.04 (2023-11-17) Fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
+      2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
+                          hardware half floats).
+      2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
+                          to Ryan Salsbury), fix for sub-rect resizes, use the
+                          pragmas to control unrolling when they are available.
+      2.07 (2024-05-24) fix for slow final split during threaded conversions of very 
+                          wide scanlines when downsampling (caused by extra input 
+                          converting), fix for wide scanline resamples with many 
+                          splits (int overflow), fix GCC warning.
+      2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling 
+                          undersampling a single row on rare resize ratios (about 1%).
+      2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
+                        fix for output callback (thanks Julien Koenen).
+      2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
       2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
       2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
-                          (2x-5x faster without simd, 4x-12x faster with simd)
-                          (in some cases, 20x to 40x faster - resizing to very small for example)
+                          2x-5x faster without simd, 4x-12x faster with simd,
+                          in some cases, 20x to 40x faster esp resizing large to very small.
       0.96 (2019-03-04) fixed warnings
       0.95 (2017-07-23) fixed warnings
       0.94 (2017-03-18) fixed warnings
@@ -402,13 +415,13 @@ typedef uint64_t stbir_uint64;
   #endif
 #endif
 
-#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(_M_ARM) || (__ARM_NEON_FP & 4) != 0 &&  __ARM_FP16_FORMAT_IEEE != 0
+#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
 #ifndef STBIR_NEON
 #define STBIR_NEON
 #endif
 #endif
 
-#if defined(_M_ARM)
+#if defined(_M_ARM) || defined(__arm__)
 #ifdef STBIR_USE_FMA
 #undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
 #endif
@@ -1064,7 +1077,7 @@ struct stbir__info
   stbir__alpha_unweight_func * alpha_unweight;
   stbir__encode_pixels_func * encode_pixels;
 
-  int alloced_total;
+  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
   int splits; // count of splits
 
   stbir_internal_pixel_layout input_pixel_layout_internal;
@@ -1075,7 +1088,7 @@ struct stbir__info
   int vertical_first;
   int channels;
   int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
-  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
+  size_t alloced_total;
 };
 
 
@@ -1086,10 +1099,11 @@ struct stbir__info
 #define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
 
 // min/max friendly
-#define STBIR_CLAMP(x, xmin, xmax) do { \
+#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
   if ( (x) < (xmin) ) (x) = (xmin);     \
   if ( (x) > (xmax) ) (x) = (xmax);     \
-} while (0)
+  break;                                \
+}
 
 static stbir__inline int stbir__min(int a, int b)
 {
@@ -1186,19 +1200,35 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
 #endif
 
-// restrict pointers for the output pointers
+// restrict pointers for the output pointers, other loop and unroll control
 #if defined( _MSC_VER ) && !defined(__clang__)
   #define STBIR_STREAMOUT_PTR( star ) star __restrict
   #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
-#elif defined(  __clang__ )
-  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
-  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
-#elif defined(  __GNUC__ )
+  #if _MSC_VER >= 1900
+    #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) 
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START 
+  #endif
+#elif defined( __clang__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) 
+  #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
+    #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START
+  #endif 
+#elif defined( __GNUC__ )
   #define STBIR_STREAMOUT_PTR( star ) star __restrict__
   #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
+  #if __GNUC__ >= 14
+    #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START
+  #endif
 #else
   #define STBIR_STREAMOUT_PTR( star ) star
   #define STBIR_NO_UNROLL( ptr )
+  #define STBIR_NO_UNROLL_LOOP_START
 #endif
 
 #ifdef STBIR_NO_SIMD // force simd off for whatever reason
@@ -1750,11 +1780,19 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
           ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
         vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
           ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
+
+      static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
+      {
+        uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
+        return r;
+      }
     #else
       #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
+      #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
     #endif
 
     #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
+    #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
 
     #define stbir__simdi_16madd( out, reg0, reg1 ) \
     { \
@@ -2138,7 +2176,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #endif
 
 
-#if defined(STBIR_NEON) && !defined(_M_ARM)
+#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
 
   #if defined( _MSC_VER ) && !defined(__clang__)
   typedef __int16 stbir__FP16;
@@ -2155,7 +2193,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
 #endif
 
-#if !defined(STBIR_NEON) && !defined(STBIR_FP16C) || defined(STBIR_NEON) && defined(_M_ARM)
+#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
 
   // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
 
@@ -2382,7 +2420,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     stbir__simdi_store( output,final );
   }
 
-#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM)) // WASM or 32-bit ARM on MSVC/clang
+#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
 
   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
   {
@@ -2428,7 +2466,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
   }
 
-#elif defined(STBIR_NEON) // 64-bit ARM
+#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
 
   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
   {
@@ -2506,11 +2544,12 @@ static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
 //   Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
 #define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
 #define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
+#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
 
 #ifdef STBIR_MEMCPY
 #undef STBIR_MEMCPY
-#define STBIR_MEMCPY stbir_simd_memcpy
 #endif
+#define STBIR_MEMCPY stbir_simd_memcpy
 
 // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
 static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
@@ -2528,6 +2567,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
     {
       if ( bytes )
       {
+        STBIR_SIMD_NO_UNROLL_LOOP_START
         do
         {
           STBIR_SIMD_NO_UNROLL(d);
@@ -2542,8 +2582,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
       // do one unaligned to get us aligned for the stream out below
       stbir__simdf_load( x, ( d + ofs_to_src ) );
       stbir__simdf_store( d, x );
-      d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 );
+      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
 
+      STBIR_SIMD_NO_UNROLL_LOOP_START
       for(;;)
       {
         STBIR_SIMD_NO_UNROLL(d);
@@ -2574,8 +2615,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
     stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
     stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
     stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
-    d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
+    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
 
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       STBIR_SIMD_NO_UNROLL(d);
@@ -2612,6 +2654,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
   if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
   {
     char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do
     {
       stbir__simdf x;
@@ -2638,6 +2681,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 // when in scalar mode, we let unrolling happen, so this macro just does the __restrict
 #define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
 #define STBIR_SIMD_NO_UNROLL(ptr)
+#define STBIR_SIMD_NO_UNROLL_LOOP_START
 
 #endif // SSE2
 
@@ -2749,7 +2793,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 #ifndef STBIR_SIMD
 
-// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
+// memcpy that is specifically intentionally overlapping (src is smaller then dest, so can be
 //   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
 //   the diff between dest and src)
 static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
@@ -2761,6 +2805,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
   if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
   {
     char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
+    STBIR_NO_UNROLL_LOOP_START
     do
     {
       STBIR_NO_UNROLL(sd);
@@ -2772,6 +2817,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
       return;
   }
 
+  STBIR_NO_UNROLL_LOOP_START
   do
   {
     STBIR_NO_UNROLL(sd);
@@ -2876,13 +2922,6 @@ static float stbir__filter_mitchell(float x, float s, void * user_data)
   return (0.0f);
 }
 
-static float stbir__support_zero(float s, void * user_data)
-{
-  STBIR__UNUSED(s);
-  STBIR__UNUSED(user_data);
-  return 0;
-}
-
 static float stbir__support_zeropoint5(float s, void * user_data)
 {
   STBIR__UNUSED(s);
@@ -3200,8 +3239,8 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
 
   if ( edge == STBIR_EDGE_WRAP )
   {
-    if ( first <= -input_size )
-      first = -(input_size-1);
+    if ( first < -input_size )
+      first = -input_size;
     if ( last >= (input_size*2))
       last = (input_size*2) - 1;
   }
@@ -3392,6 +3431,12 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
   }
 }
 
+#ifdef STBIR_RENORMALIZE_IN_FLOAT
+#define STBIR_RENORM_TYPE float
+#else
+#define STBIR_RENORM_TYPE double
+#endif
+
 static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
 {
   int input_size = scale_info->input_full_size;
@@ -3413,14 +3458,14 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
   for (n = 0; n < end; n++)
   {
     int i;
-    float filter_scale, total_filter = 0;
+    STBIR_RENORM_TYPE filter_scale, total_filter = 0;
     int e;
 
     // add all contribs
     e = contribs->n1 - contribs->n0;
     for( i = 0 ; i <= e ; i++ )
     {
-      total_filter += coeffs[i];
+      total_filter += (STBIR_RENORM_TYPE) coeffs[i];
       STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
     }
 
@@ -3436,10 +3481,11 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
       // if the total isn't 1.0, rescale everything
       if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
       {
-        filter_scale = 1.0f / total_filter;
+        filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
+
         // scale them all
         for (i = 0; i <= e; i++)
-          coeffs[i] *= filter_scale;
+          coeffs[i] = (float) ( coeffs[i] * filter_scale );
       }
     }
     ++contribs;
@@ -3560,7 +3606,9 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
   filter_info->widest = widest;
 }
 
-static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row_width )
+#undef STBIR_RENORM_TYPE 
+
+static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) 
 {
   #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
   #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
@@ -3569,6 +3617,10 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
   #else
   #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
   #endif
+
+  int row_end = row1 + 1;
+  STBIR__UNUSED( row0 ); // only used in an assert
+
   if ( coefficient_width != widest )
   {
     float * pc = coefficents;
@@ -3577,6 +3629,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
     switch( widest )
     {
       case 1:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_1( pc, coeffs );
           ++pc;
@@ -3584,6 +3637,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 2:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_2( pc, coeffs );
           pc += 2;
@@ -3591,6 +3645,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 3:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_2( pc, coeffs );
           STBIR_MOVE_1( pc+2, coeffs+2 );
@@ -3599,6 +3654,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 4:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           pc += 4;
@@ -3606,6 +3662,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 5:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_1( pc+4, coeffs+4 );
@@ -3614,6 +3671,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 6:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_2( pc+4, coeffs+4 );
@@ -3622,6 +3680,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 7:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_2( pc+4, coeffs+4 );
@@ -3631,6 +3690,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 8:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3639,6 +3699,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 9:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3648,6 +3709,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 10:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3657,6 +3719,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 11:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3667,6 +3730,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 12:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3676,6 +3740,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       default:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           float * copy_end = pc + widest - 4;
           float * c = coeffs;
@@ -3686,6 +3751,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
             c += 4;
           } while ( pc <= copy_end );
           copy_end += 4;
+          STBIR_NO_UNROLL_LOOP_START
           while ( pc < copy_end )
           {
             STBIR_MOVE_1( pc, c );
@@ -3710,10 +3776,10 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
     float * coeffs = coefficents + widest * ( num_contributors - 1 );
 
     // go until no chance of clipping (this is usually less than 8 lops)
-    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_width ) )
+    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
     {
       // might we clip??
-      if ( ( contribs->n0 + widest ) > row_width )
+      if ( ( contribs->n0 + widest ) > row_end )
       {
         int stop_range = widest;
 
@@ -3732,15 +3798,15 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         }
 
         // now see if we still clip with the refined range
-        if ( ( contribs->n0 + stop_range ) > row_width )
+        if ( ( contribs->n0 + stop_range ) > row_end )
         {
-          int new_n0 = row_width - stop_range;
+          int new_n0 = row_end - stop_range;
           int num = contribs->n1 - contribs->n0 + 1;
           int backup = contribs->n0 - new_n0;
           float * from_co = coeffs + num - 1;
           float * to_co = from_co + backup;
 
-          STBIR_ASSERT( ( new_n0 >= 0 ) && ( new_n0 < contribs->n0 ) );
+          STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
 
           // move the coeffs over
           while( num )
@@ -3863,26 +3929,33 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
           for (k = gn0 ; k <= gn1 ; k++ )
           {
             float gc = *g_coeffs++;
-            if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
+            
+            // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
+            //   (which happens when pivoting from horizontal, which might have dummy zeros)
+            if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
             {
+              if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
               {
-                // if we are skipping over several contributors, we need to clear the skipped ones
-                stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
-                while ( clear_contributors < scatter_contributors )
                 {
-                  clear_contributors->n0 = 0;
-                  clear_contributors->n1 = -1;
-                  ++clear_contributors;
+                  // if we are skipping over several contributors, we need to clear the skipped ones
+                  stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+                  while ( clear_contributors < scatter_contributors )
+                  {
+                    clear_contributors->n0 = 0;
+                    clear_contributors->n1 = -1;
+                    ++clear_contributors;
+                  }
                 }
+                scatter_contributors->n0 = n;
+                scatter_contributors->n1 = n;
+                scatter_coeffs[0]  = gc;
+                highest_set = k;
               }
-              scatter_contributors->n0 = n;
-              scatter_contributors->n1 = n;
-              scatter_coeffs[0]  = gc;
-              highest_set = k;
-            }
-            else
-            {
-              stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+              else
+              {
+                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+              }
+              STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
             }
             ++scatter_contributors;
             scatter_coeffs += scatter_coefficient_width;
@@ -3989,6 +4062,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
 
   #ifdef STBIR_SIMD8
   decode += 16;
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode <= end_decode )
   {
     stbir__simdf8 d0,d1,a0,a1,p0,p1;
@@ -4013,6 +4087,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
   decode -= 16;
   #else
   decode += 8;
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode <= end_decode )
   {
     stbir__simdf d0,a0,d1,a1,p0,p1;
@@ -4035,12 +4110,14 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
 
   // might be one last odd pixel
   #ifdef STBIR_SIMD8
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode < end_decode )
   #else
   if ( decode < end_decode )
   #endif
   {
     stbir__simdf d,a,p;
+    STBIR_NO_UNROLL(decode);
     stbir__simdf_load( d, decode );
     stbir__simdf_0123to3333( a, d );
     stbir__simdf_mult( p, a, d );
@@ -4082,6 +4159,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c
   decode += 8;
   if ( decode <= end_decode )
   {
+    STBIR_NO_UNROLL_LOOP_START
     do {
       #ifdef STBIR_SIMD8
       stbir__simdf8 d0,a0,p0;
@@ -4125,6 +4203,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c
   decode -= 8;
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode < end_decode )
   {
     float x = decode[0], y = decode[1];
@@ -4145,6 +4224,7 @@ static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_ti
 
   // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float alpha = input[3];
 #ifdef STBIR_SIMD
@@ -4212,6 +4292,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim
   #ifdef STBIR_SIMD
   {
     decode += 2 * stbir__simdfX_float_count;
+    STBIR_NO_UNROLL_LOOP_START
     while ( decode <= end_decode )
     {
       stbir__simdfX d0,a0,d1,a1;
@@ -4230,6 +4311,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim
 
     // few last pixels remnants
     #ifdef STBIR_SIMD8
+    STBIR_NO_UNROLL_LOOP_START
     while ( decode < end_decode )
     #else
     if ( decode < end_decode )
@@ -4265,6 +4347,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim
 
   #ifdef STBIR_SIMD
   decode += 2 * stbir__simdfX_float_count;
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode <= end_decode )
   {
     stbir__simdfX d0,a0,d1,a1;
@@ -4282,6 +4365,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim
   decode -= 2 * stbir__simdfX_float_count;
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode < end_decode )
   {
     float alpha = decode[1];
@@ -4296,6 +4380,7 @@ static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_t
   float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
   float const * end_output = encode_buffer + width_times_channels;
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float alpha = encode[3];
 
@@ -4343,9 +4428,77 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
   float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
   float const * end_decode = decode_buffer + width_times_channels;
 
-  decode += 12;
+#ifdef STBIR_SIMD
+    #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
+      end_decode -= 12; 
+      STBIR_NO_UNROLL_LOOP_START
+      while( decode <= end_decode )
+      {
+        // on arm64 8 instructions, no overlapping stores
+        stbir__simdf a,b,c,na,nb;
+        STBIR_SIMD_NO_UNROLL(decode);
+        stbir__simdf_load( a, decode );
+        stbir__simdf_load( b, decode+4 );
+        stbir__simdf_load( c, decode+8 );
+
+        na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );   
+        b  = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );   
+        nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );   
+        c  = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );   
+
+        stbir__simdf_store( decode, na );
+        stbir__simdf_store( decode+4, nb ); 
+        stbir__simdf_store( decode+8, c );
+        decode += 12;
+      }
+      end_decode += 12;
+    #else
+      end_decode -= 24;
+      STBIR_NO_UNROLL_LOOP_START
+      while( decode <= end_decode )
+      {
+        // 26 instructions on x64
+        stbir__simdf a,b,c,d,e,f,g;
+        float i21, i23;
+        STBIR_SIMD_NO_UNROLL(decode);
+        stbir__simdf_load( a, decode );
+        stbir__simdf_load( b, decode+3 );
+        stbir__simdf_load( c, decode+6 );
+        stbir__simdf_load( d, decode+9 );
+        stbir__simdf_load( e, decode+12 );
+        stbir__simdf_load( f, decode+15 );
+        stbir__simdf_load( g, decode+18 );
+
+        a = stbir__simdf_swiz( a, 2, 1, 0, 3 );   
+        b = stbir__simdf_swiz( b, 2, 1, 0, 3 );   
+        c = stbir__simdf_swiz( c, 2, 1, 0, 3 );   
+        d = stbir__simdf_swiz( d, 2, 1, 0, 3 );   
+        e = stbir__simdf_swiz( e, 2, 1, 0, 3 );   
+        f = stbir__simdf_swiz( f, 2, 1, 0, 3 );   
+        g = stbir__simdf_swiz( g, 2, 1, 0, 3 );   
+
+        // stores overlap, need to be in order, 
+        stbir__simdf_store( decode,    a );
+        i21 = decode[21];
+        stbir__simdf_store( decode+3,  b ); 
+        i23 = decode[23];
+        stbir__simdf_store( decode+6,  c );
+        stbir__simdf_store( decode+9,  d );
+        stbir__simdf_store( decode+12, e );
+        stbir__simdf_store( decode+15, f );
+        stbir__simdf_store( decode+18, g );
+        decode[21] = i23;
+        decode[23] = i21;
+        decode += 24;
+      }
+      end_decode += 24;
+    #endif
+#else
+  end_decode -= 12;
+  STBIR_NO_UNROLL_LOOP_START
   while( decode <= end_decode )
   {
+    // 16 instructions
     float t0,t1,t2,t3;
     STBIR_NO_UNROLL(decode);
     t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
@@ -4353,8 +4506,10 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
     decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
     decode += 12;
   }
-  decode -= 12;
+  end_decode += 12;
+#endif
 
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < end_decode )
   {
     float t = decode[0];
@@ -4375,7 +4530,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
   stbir_edge edge_horizontal = stbir_info->horizontal.edge;
   stbir_edge edge_vertical = stbir_info->vertical.edge;
   int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
-  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes;
+  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
   stbir__span const * spans = stbir_info->scanline_extents.spans;
   float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
 
@@ -5958,7 +6113,7 @@ static void stbir__encode_scanline( stbir__info const * stbir_info, void *output
 
   // if we have an output callback, call it to send the data
   if ( stbir_info->out_pixels_cb )
-    stbir_info->out_pixels_cb( output_buffer_data, num_pixels, row, stbir_info->user_data );
+    stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
 }
 
 
@@ -6028,7 +6183,7 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi
     stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
   }
 
-  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes),
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
                           encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 }
 
@@ -6069,7 +6224,7 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__
 
   // initialize the ring buffer for gathering
   split_info->ring_buffer_begin_index = 0;
-  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;
+  split_info->ring_buffer_first_scanline = vertical_contributors->n0;
   split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
 
   for (y = start_output_y; y < end_output_y; y++)
@@ -6123,7 +6278,7 @@ static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_
   float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
 
   // dump the scanline out
-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
@@ -6144,7 +6299,7 @@ static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(st
   stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // dump the scanline out
-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
@@ -6352,15 +6507,31 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
   // pre calculate stuff based on the above
   samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
 
+  // filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
+  //   In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the 
+  //   filter will extend before or after the scanline beyond just one extra entire copy of the 
+  //   scanline (we would hit the edge twice). We don't let you do that, so we clamp the total 
+  //   width to 3x the total of input pixel (once for the scanline, once for the left side 
+  //   overhang, and once for the right side). We only do this for edge mode, since the other 
+  //   modes can just re-edge clamp back in again.
   if ( edge == STBIR_EDGE_WRAP )
-    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 2 ) )  // this can only happen when shrinking to a single pixel
-      samp->filter_pixel_width = scale_info->input_full_size * 2;
+    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
+      samp->filter_pixel_width = scale_info->input_full_size * 3;
 
   // This is how much to expand buffers to account for filters seeking outside
   // the image boundaries.
   samp->filter_pixel_margin = samp->filter_pixel_width / 2;
+  
+  // filter_pixel_margin is the amount that this filter can overhang on just one side of either 
+  //   end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's 
+  //   worth of pixels, we clamp this one side of overhang to the input scanline size. Again, 
+  //   this clamping only happens in rare cases with the default filters (2 pix to 1 pix). 
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_margin > scale_info->input_full_size )
+      samp->filter_pixel_margin = scale_info->input_full_size;
 
   samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
+
   samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
   samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
 
@@ -6725,7 +6896,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 
   stbir__info * info = 0;
   void * alloced = 0;
-  int alloced_total = 0;
+  size_t alloced_total = 0;
   int vertical_first;
   int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
 
@@ -6996,7 +7167,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
       stbir__get_extents( horizontal, &info->scanline_extents );
 
       // pack the horizontal coeffs
-      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 );
+      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 );
 
       STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
 
@@ -7027,36 +7198,27 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
         info->ring_buffer_num_entries = conservative_split_output_size;
       STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
 
-      // a few of the horizontal gather functions read one dword past the end (but mask it out), so put in a normal value so no snans or denormals accidentally sneak in
+      // a few of the horizontal gather functions read past the end of the decode (but mask it out), 
+      //   so put in normal values so no snans or denormals accidentally sneak in (also, in the ring 
+      //   buffer for vertical first)
       for( i = 0 ; i < splits ; i++ )
       {
-        int width, ofs;
+        int t, ofs, start;
 
-        // find the right most span
-        if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 )
-          width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0;
-        else
-          width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0;
+        ofs = decode_buffer_size / 4;
+        start = ofs - 4;
+        if ( start < 0 ) start = 0;
 
-        // this calc finds the exact end of the decoded scanline for all filter modes.
-        //   usually this is just the width * effective channels.  But we have to account
-        //   for the area to the left of the scanline for wrap filtering and alignment, this
-        //   is stored as a negative value in info->scanline_extents.conservative.n0. Next,
-        //   we need to skip the exact size of the right hand size filter area (again for
-        //   wrap mode), this is in info->scanline_extents.edge_sizes[1]).
-        ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels;
+        for( t = start ; t < ofs; t++ )
+          info->split_info[i].decode_buffer[ t ] = 9999.0f;
 
-        // place a known, but numerically valid value in the decode buffer
-        info->split_info[i].decode_buffer[ ofs ] = 9999.0f;
-
-        // if vertical filtering first, place a known, but numerically valid value in the all
-        //   of the ring buffer accumulators
         if ( vertical_first )
         {
           int j;
           for( j = 0; j < info->ring_buffer_num_entries ; j++ )
           {
-            stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f;
+            for( t = start ; t < ofs; t++ )
+              stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f;
           }
         }
       }
@@ -7068,7 +7230,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
     // is this the first time through loop?
     if ( info == 0 )
     {
-      alloced_total = (int) ( 15 + (size_t)advance_mem );
+      alloced_total = ( 15 + (size_t)advance_mem );
       alloced = STBIR_MALLOC( alloced_total, user_data );
       if ( alloced == 0 )
         return 0;
@@ -7185,7 +7347,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
     info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
 
   // calc offset
-  info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
+  info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
 
   info->in_pixels_cb = resize->input_cb;
   info->user_data = resize->user_data;
@@ -7757,7 +7919,7 @@ static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * o
   if ( output_stride_in_bytes < pitch )
     return 0;
 
-  size = output_stride_in_bytes * output_h;
+  size = (size_t)output_stride_in_bytes * (size_t)output_h;
   if ( size == 0 )
     return 0;
 
@@ -8035,6 +8197,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
   if ( width_times_channels >= 16 )
   {
     decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -8090,6 +8253,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -8105,6 +8269,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8131,6 +8296,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
   {
     float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
     end_output -= stbir__simdfX_float_count*2;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       stbir__simdfX e0, e1;
@@ -8162,6 +8328,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e0;
@@ -8180,6 +8347,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     stbir__simdf e0;
@@ -8216,6 +8384,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -8245,6 +8414,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
   if ( width_times_channels >= 16 )
   {
     decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -8294,6 +8464,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -8309,6 +8480,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8335,6 +8507,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
   {
     float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
     end_output -= stbir__simdfX_float_count*2;
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       stbir__simdfX e0, e1;
@@ -8366,6 +8539,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e0;
@@ -8404,6 +8578,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -8444,6 +8619,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8529,12 +8705,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
     float const * end_encode_m16 = encode + width_times_channels - 16;
     end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
@@ -8548,7 +8724,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
       stbir__min_max_shift20( i2, f2 );
       stbir__min_max_shift20( i3, f3 );
 
-      stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb );
+      stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
@@ -8573,6 +8749,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while ( output <= end_output )
   {
     STBIR_SIMD_NO_UNROLL(encode);
@@ -8590,6 +8767,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     STBIR_NO_UNROLL(encode);
@@ -8630,12 +8808,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
     float const * end_encode_m16 = encode + width_times_channels - 16;
     end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
@@ -8649,7 +8827,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
 
-      stbir__simdi_table_lookup3( i0, i1, i2, to_srgb );
+      stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
@@ -8671,6 +8849,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
   }
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float f;
     STBIR_SIMD_NO_UNROLL(encode);
@@ -8721,12 +8900,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
     float const * end_encode_m16 = encode + width_times_channels - 16;
     end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
@@ -8740,7 +8919,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
 
-      stbir__simdi_table_lookup2( i0, i2, to_srgb );
+      stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i2, f2 );
@@ -8760,6 +8939,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
   }
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float f;
     STBIR_SIMD_NO_UNROLL(encode);
@@ -8788,6 +8968,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
   if ( width_times_channels >= 8 )
   {
     decode_end -= 8;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -8831,6 +9012,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -8846,6 +9028,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8874,6 +9057,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
     {
       float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
       end_output -= stbir__simdfX_float_count*2;
+      STBIR_SIMD_NO_UNROLL_LOOP_START
       for(;;)
       {
         stbir__simdfX e0, e1;
@@ -8901,6 +9085,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e;
@@ -8919,6 +9104,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     stbir__simdf e;
@@ -8940,6 +9126,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     float f;
@@ -8956,6 +9143,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -8985,6 +9173,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
   if ( width_times_channels >= 8 )
   {
     decode_end -= 8;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -9025,6 +9214,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -9040,6 +9230,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -9067,6 +9258,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
     {
       float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
       end_output -= stbir__simdfX_float_count*2;
+      STBIR_SIMD_NO_UNROLL_LOOP_START
       for(;;)
       {
         stbir__simdfX e0, e1;
@@ -9094,6 +9286,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e;
@@ -9115,6 +9308,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
   // try to do blocks of 4 when you can
   #if  stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     float f;
@@ -9133,6 +9327,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -9161,6 +9356,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
   {
     stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
     decode_end -= 8;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       STBIR_NO_UNROLL(decode);
@@ -9202,6 +9398,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -9217,6 +9414,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -9243,6 +9441,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
   {
     float const * end_encode_m8 = encode + width_times_channels - 8;
     end_output -= 8;
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       STBIR_SIMD_NO_UNROLL(encode);
@@ -9283,6 +9482,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     STBIR_SIMD_NO_UNROLL(output);
@@ -9298,6 +9498,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     STBIR_NO_UNROLL(output);
@@ -9326,6 +9527,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
   {
     float const * end_input_m16 = input + width_times_channels - 16;
     decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START
     for(;;)
     {
       STBIR_NO_UNROLL(decode);
@@ -9374,6 +9576,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -9389,6 +9592,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -9448,6 +9652,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
   {
     float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
     end_output -= ( stbir__simdfX_float_count * 2 );
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     for(;;)
     {
       stbir__simdfX e0, e1;
@@ -9481,6 +9686,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e0;
@@ -9505,6 +9711,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     float e;
@@ -9524,6 +9731,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float e;
@@ -9634,6 +9842,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
     stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
     stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
     {
       stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
@@ -9688,6 +9897,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
       input += (4*stbir__simdfX_float_count);
       stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
     }
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input_end - (char*) input ) >= 16 )
     {
       stbir__simdf o0, r0;
@@ -9720,6 +9930,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     }
   }
   #else
+  STBIR_NO_UNROLL_LOOP_START
   while ( ( (char*)input_end - (char*) input ) >= 16 )
   {
     float r0, r1, r2, r3;
@@ -9751,6 +9962,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
   }
   #endif
+  STBIR_NO_UNROLL_LOOP_START
   while ( input < input_end )
   {
     float r = input[0];
@@ -9814,6 +10026,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
     stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
 
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
     {
       stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
@@ -9858,6 +10071,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
       stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
     }
 
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
     {
       stbir__simdf o0, r0;
@@ -9882,6 +10096,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     }
   }
   #else
+  STBIR_NO_UNROLL_LOOP_START
   while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
   {
     float o0, o1, o2, o3;
@@ -9903,6 +10118,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
   }
   #endif
+  STBIR_NO_UNROLL_LOOP_START
   while ( input0 < input0_end )
   {
     float o0;
@@ -9995,6 +10211,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( floa
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10007,6 +10224,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10019,6 +10237,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10031,6 +10250,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10043,6 +10263,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10056,6 +10277,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10070,6 +10292,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10084,6 +10307,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10097,6 +10321,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10111,6 +10336,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( fl
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10126,6 +10352,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( fl
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10140,6 +10367,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( fl
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10154,12 +10382,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
@@ -10174,12 +10404,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
@@ -10195,12 +10427,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
@@ -10218,12 +10452,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
diff --git a/external/stb/stb/stb_truetype.h b/external/stb/stb/stb_truetype.h
index bbf2284b..90a5c2e2 100644
--- a/external/stb/stb/stb_truetype.h
+++ b/external/stb/stb/stb_truetype.h
@@ -54,7 +54,7 @@
 //       Hou Qiming                 Derek Vinyard
 //       Rob Loach                  Cort Stratton
 //       Kenney Phillis Jr.         Brian Costabile
-//       Ken Voskuil (kaesve)
+//       Ken Voskuil (kaesve)       Yakov Galka
 //
 // VERSION HISTORY
 //
@@ -4604,6 +4604,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
    scale_y = -scale_y;
 
    {
+      // distance from singular values (in the same units as the pixel grid)
+      const float eps = 1./1024, eps2 = eps*eps;
       int x,y,i,j;
       float *precompute;
       stbtt_vertex *verts;
@@ -4616,15 +4618,15 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
             float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
             float x1 = verts[j].x*scale_x, y1 = verts[j].y*scale_y;
             float dist = (float) STBTT_sqrt((x1-x0)*(x1-x0) + (y1-y0)*(y1-y0));
-            precompute[i] = (dist == 0) ? 0.0f : 1.0f / dist;
+            precompute[i] = (dist < eps) ? 0.0f : 1.0f / dist;
          } else if (verts[i].type == STBTT_vcurve) {
             float x2 = verts[j].x *scale_x, y2 = verts[j].y *scale_y;
             float x1 = verts[i].cx*scale_x, y1 = verts[i].cy*scale_y;
             float x0 = verts[i].x *scale_x, y0 = verts[i].y *scale_y;
             float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
             float len2 = bx*bx + by*by;
-            if (len2 != 0.0f)
-               precompute[i] = 1.0f / (bx*bx + by*by);
+            if (len2 >= eps2)
+               precompute[i] = 1.0f / len2;
             else
                precompute[i] = 0.0f;
          } else
@@ -4689,8 +4691,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                         float a = 3*(ax*bx + ay*by);
                         float b = 2*(ax*ax + ay*ay) + (mx*bx+my*by);
                         float c = mx*ax+my*ay;
-                        if (a == 0.0) { // if a is 0, it's linear
-                           if (b != 0.0) {
+                        if (STBTT_fabs(a) < eps2) { // if a is 0, it's linear
+                           if (STBTT_fabs(b) >= eps2) {
                               res[num++] = -c/b;
                            }
                         } else {
diff --git a/external/stb/stb/tests/stb.dsp b/external/stb/stb/tests/stb.dsp
index 849b95be..b7039c74 100644
--- a/external/stb/stb/tests/stb.dsp
+++ b/external/stb/stb/tests/stb.dsp
@@ -130,6 +130,10 @@ SOURCE=..\stb_image.h
 # End Source File
 # Begin Source File
 
+SOURCE=..\stb_image_resize2.h
+# End Source File
+# Begin Source File
+
 SOURCE=..\stb_image_write.h
 # End Source File
 # Begin Source File
diff --git a/external/stb/stb/tests/test_vorbis.c b/external/stb/stb/tests/test_vorbis.c
index d54ed231..fc795c8e 100644
--- a/external/stb/stb/tests/test_vorbis.c
+++ b/external/stb/stb/tests/test_vorbis.c
@@ -8,7 +8,7 @@ extern void stb_vorbis_dumpmem(void);
 int main(int argc, char **argv)
 {
    size_t memlen;
-   unsigned char *mem = stb_fileu("c:/x/sketch008.ogg", &memlen);
+   unsigned char *mem = stb_fileu("../../lib/vorbis/sample/sketch008.ogg", &memlen);
    int chan, samplerate;
    short *output;
    int samples = stb_vorbis_decode_memory(mem, memlen, &chan, &samplerate, &output);