commit 1c189bfd9c34436956d439355e3ecab85ad22afa
Author: Green Sky <Green-Sky@users.noreply.github.com>
Date:   Mon May 12 20:44:00 2025 +0200

    Squashed 'external/libqoirdo/' content from commit 59f81203c9
    
    git-subtree-dir: external/libqoirdo
    git-subtree-split: 59f81203c99b2bd6edda0c84b98ba66a38f0e2c4

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..70406e4
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(libqoirdo)
+
+if(NOT CMAKE_BUILD_TYPE)
+	set(CMAKE_BUILD_TYPE Debug) # Fall back to a Debug build when the caller did not choose a build type.
+endif()
+
+message( ${PROJECT_NAME} " build type: " ${CMAKE_BUILD_TYPE} )
+
+add_library(qoirdo
+	./qoirdo.hpp
+	./qoirdo.cpp
+)
+
+target_compile_features(qoirdo PUBLIC cxx_std_17) # basisu.min.hpp uses std::clamp, which needs C++17.
+
+#if (NOT MSVC)
+#   target_link_libraries(rdopng m pthread)
+#endif()
+
+########################################
+
+add_executable(qoirdo_tool
+	tool.cpp
+)
+
+target_link_libraries(qoirdo_tool qoirdo)
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4679575
--- /dev/null
+++ b/README.md
@@ -0,0 +1,120 @@
+# rdopng
+Rate-Distortion Optimized Lossy PNG, QOI, and LZ4 image (LZ4I) Encoding Tool
+
+rdopng is a command line tool which uses LZ match optimization, Lagrangian multiplier [rate distortion optimization (RDO)](https://en.wikipedia.org/wiki/Rate%E2%80%93distortion_optimization), a simple perceptual error tolerance model, and [Oklab](https://bottosson.github.io/posts/oklab/)-based colorspace error metrics to encode lossy 24/32bpp PNG/QOI/LZ4I files. The encoded lossy PNG files are typically 30-80% smaller relative to lodepng/libpng. The tool defaults to reasonably fast near-lossless settings that write PNGs around 30-40% smaller than those produced by lossless PNG encoders.
+
+Unlike [pngquant](https://pngquant.org/), rdopng does not use 256-color palettes or dithering. PNG files encoded by rdopng typically range between roughly 2.5-7bpp, depending on the options used (and how much time and patience you have).
+
+Some example encodes and command lines are [here](https://github.com/richgel999/rdopng/wiki/Examples).
+
+You can download a pre-built Windows binary for an older version of rdopng [here](https://github.com/richgel999/rdopng/releases). (The latest version is in the repo.) You may need to install the [VS 2022 runtime redistributable from Microsoft](https://docs.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170). 
+
+### Building
+
+You'll need [cmake](https://cmake.org/). There are no other dependencies.
+
+Linux (gcc/clang): 
+
+```
+cmake .
+make
+```
+
+Windows (tested with Visual Studio 2022):
+
+```
+cmake .
+rdopng.sln
+```
+
+### Instructions
+
+Encodes a .PNG/.BMP/.TGA/.JPG file to "./file_rdo.png":
+
+```
+rdopng file.png
+```
+
+Encodes a .PNG/.BMP/.TGA/.JPG file to "./file_rdo.qoi" (and also unpacks the coded image and saves it as .PNG):
+
+```
+rdopng -qoi -unpack_qoi_to_png file.png 
+```
+
+Encodes a file to "./file_rdo.qoi" at higher quality per bit, but much slower (also try -better which is in between the default/uber settings):
+
+```
+rdopng -qoi -uber -unpack_qoi_to_png file.png 
+```
+
+Encodes smaller PNG files but will be 2x slower:
+
+```
+rdopng -two_pass file.png
+```
+
+Encodes at lower than default quality (the default lambda is 300), but writes smaller files:
+
+```
+rdopng -lambda 500 file.png
+```
+
+Significantly lower PNG quality (which increases artifacts), using a higher than default parsing level to compensate for artifacts:
+
+```
+rdopng -level 3 -lambda 1000 file.png
+```
+
+Enable debug output and write output to z.png:
+
+```
+rdopng -debug file.png -output z.png
+```
+
+Load a normal map, normalize it, pack it using angular normal map metrics, and decode/encode texels using GPU SNORM unpacking (instead of the default UNORM):
+
+```
+rdopng -normalize -normal_map -snorm file.png
+```
+
+Level ranges from 0-29. Levels 0-9 use up to 4 pixel long matches, levels 10-17 use up to 6 pixel long matches, and 18-23 use up to 6 or 12 pixel long matches. Levels 24-29 use exhaustive matching and are beyond impractical except on tiny images. 
+
+The higher the level within a match length category, the slower the encoder. Higher match length categories are needed for the higher lambdas/lower bitrates. At near-lossless settings (lower than approximately lambda 300), the smaller/less aggressive parsing levels are usually fine. At higher lambdas/lower bitrates the higher levels are needed to avoid artifacts. To get below roughly 3-4bpp you'll need to use high lambdas, two pass mode, and very slow parsing levels.
+
+-lambda is the quality slider. Useful lambda values are roughly 1-20000, but values beyond approximately 500-1000 (depending on the image) will require fiddling with the level to compensate for artifacts. Higher levels are extremely slow because the current tool is single threaded.
+
+Most options work with QOI, LZ4I, and PNG. The -level option is only for PNG, and the -uber/-better options are only for QOI/LZ4I.
+
+### RDO LZ4 examples
+
+```
+rdopng -lz4i -lambda 5000 -debug -better file.png
+```
+
+Unpacking .LZ4I images to PNG:
+
+```
+rdopng -unpack file.lz4i
+```
+
+LZ4I image files contain a simple header followed by the RGB(A) pixels compressed using LZ4. Here's the header (it's like QOI's but with a different sig):
+
+```
+#pragma pack(push, 1)
+struct lz4i_header
+{
+	char sig[4]; // signature bytes "lz4i"
+	uint32_t width; // image width in pixels (BE)
+	uint32_t height; // image height in pixels (BE)
+	uint8_t channels; // 3 = RGB, 4 = RGBA
+	uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear
+};
+#pragma pack(pop)
+```
+
+### Known Problems
+rdopng has only been tested on little endian platforms, under Windows using MSVC and Ubuntu Linux using clang/gcc. There are a few known endian issues in there, which I'll eventually fix. It has not been compiled or tested on OSX.
+
+### Special Thanks
+Thanks to [Paul Hughes](https://twitter.com/PaulieHughes) for encouraging me to continue working on this on Twitter. Also, thanks to [Jyrki Alakuijala](https://twitter.com/jyzg) for suggesting to drop YCbCr for an alternative such as Oklab.
+
diff --git a/basisu.min.hpp b/basisu.min.hpp
new file mode 100644
index 0000000..4cd53a3
--- /dev/null
+++ b/basisu.min.hpp
@@ -0,0 +1,867 @@
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cassert>
+#include <cstring>
+#include <vector>
+
+namespace basisu
+{
+
+	using std::clamp;
+	using std::min;
+	using std::max;
+
+	template <typename T0, typename T1> inline T0 lerp(T0 a, T0 b, T1 c) { const auto span = b - a; return a + span * c; } // Linear blend: a when c == 0, b when c == 1.
+
+
+	class color_rgba // Four 8-bit components, addressable as m_comps[0..3] or by name as r, g, b, a.
+	{
+	public:
+		union // m_comps[] and the anonymous struct below share the same four bytes.
+		{
+			uint8_t m_comps[4];
+
+			struct
+			{
+				uint8_t r;
+				uint8_t g;
+				uint8_t b;
+				uint8_t a;
+			};
+		};
+
+		inline color_rgba() // Default constructor: assigns no component values; only checks the 4-byte size at compile time.
+		{
+			static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");
+		}
+
+		inline color_rgba(int y) // Gray: see set(int).
+		{
+			set(y);
+		}
+
+		inline color_rgba(int y, int na) // Gray with alpha: see set(int, int).
+		{
+			set(y, na);
+		}
+
+		inline color_rgba(int sr, int sg, int sb, int sa) // Full RGBA: see set(int, int, int, int).
+		{
+			set(sr, sg, sb, sa);
+		}
+
+		//inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)
+		//{
+		//    set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);
+		//}
+
+		inline color_rgba& set_noclamp_y(int y) // r = g = b = y cast to uint8_t (no clamping), a = 255.
+		{
+			m_comps[0] = (uint8_t)y;
+			m_comps[1] = (uint8_t)y;
+			m_comps[2] = (uint8_t)y;
+			m_comps[3] = (uint8_t)255;
+			return *this;
+		}
+
+		inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa) // Each argument is cast to uint8_t without clamping.
+		{
+			m_comps[0] = (uint8_t)sr;
+			m_comps[1] = (uint8_t)sg;
+			m_comps[2] = (uint8_t)sb;
+			m_comps[3] = (uint8_t)sa;
+			return *this;
+		}
+
+		inline color_rgba &set(int y) // r = g = b = y clamped to [0, 255], a = 255.
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
+			m_comps[1] = m_comps[0];
+			m_comps[2] = m_comps[0];
+			m_comps[3] = 255;
+			return *this;
+		}
+
+		inline color_rgba &set(int y, int na) // r = g = b = clamped y, a = clamped na.
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
+			m_comps[1] = m_comps[0];
+			m_comps[2] = m_comps[0];
+			m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255));
+			return *this;
+		}
+
+		inline color_rgba &set(int sr, int sg, int sb, int sa) // Every component is clamped to [0, 255].
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
+			m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
+			m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
+			m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255));
+			return *this;
+		}
+
+		inline color_rgba &set_rgb(int sr, int sg, int sb) // Sets r, g, b (clamped); alpha is not touched.
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
+			m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
+			m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
+			return *this;
+		}
+
+		inline color_rgba &set_rgb(const color_rgba &other) // Copies r, g, b from other; alpha is not touched.
+		{
+			r = other.r;
+			g = other.g;
+			b = other.b;
+			return *this;
+		}
+
+		inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; } // Component access; the index is checked by assert only.
+		inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }
+
+		inline void clear() // Zeroes all four components, alpha included.
+		{
+			m_comps[0] = 0;
+			m_comps[1] = 0;
+			m_comps[2] = 0;
+			m_comps[3] = 0;
+		}
+
+		inline bool operator== (const color_rgba &rhs) const // True when all four components match.
+		{
+			if (m_comps[0] != rhs.m_comps[0]) return false;
+			if (m_comps[1] != rhs.m_comps[1]) return false;
+			if (m_comps[2] != rhs.m_comps[2]) return false;
+			if (m_comps[3] != rhs.m_comps[3]) return false;
+			return true;
+		}
+
+		inline bool operator!= (const color_rgba &rhs) const
+		{
+			return !(*this == rhs);
+		}
+
+		inline bool operator<(const color_rgba &rhs) const // Lexicographic order over components 0..3.
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				if (m_comps[i] < rhs.m_comps[i])
+					return true;
+				else if (m_comps[i] != rhs.m_comps[i])
+					return false; // First differing component is larger here.
+			}
+			return false; // All components equal.
+		}
+
+		inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; } // Integer weighted sum of r, g, b: weights total 65536, +32768 rounds, >> 16 rescales.
+		inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; } // Same scheme with a different weight set (also totals 65536).
+		inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }
+
+		static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(min(a[0], b[0]), min(a[1], b[1]), min(a[2], b[2]), min(a[3], b[3])); } // Per-component minimum.
+		static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(max(a[0], b[0]), max(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3])); } // Per-component maximum.
+	};
+
+	using color_rgba_vec = std::vector<color_rgba>; // Growable array of pixels.
+
+	const color_rgba g_black_color{ 0, 0, 0, 255 }; // Opaque black.
+	const color_rgba g_black_trans_color{ 0, 0, 0, 0 }; // Fully transparent black.
+	const color_rgba g_white_color{ 255, 255, 255, 255 }; // Opaque white.
+
+	// Simple 32-bit 2D image class
+
+	class image
+	{
+	public:
+		image() : // Creates an empty image: all dimensions zero.
+			m_width(0), m_height(0), m_pitch(0)
+		{
+		}
+
+		image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : // Creates a w x h image through resize(); p is passed on as the pitch argument.
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			resize(w, h, p);
+		}
+
+		image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) : // Creates an image from a raw byte buffer through init().
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			init(pImage, width, height, comps);
+		}
+
+		image(const image &other) : // Copy constructor: starts empty, then delegates to operator=.
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			*this = other;
+		}
+
+		image &swap(image &other) // Exchanges pixel storage and all three dimension fields with other; returns *this.
+		{
+			m_pixels.swap(other.m_pixels);
+			std::swap(m_pitch, other.m_pitch);
+			std::swap(m_height, other.m_height);
+			std::swap(m_width, other.m_width);
+			return *this;
+		}
+
+		image &operator= (const image &rhs)
+		{
+			// Guard clause: assigning an image to itself changes nothing.
+			if (this == &rhs)
+				return *this;
+			m_width = rhs.m_width;
+			m_height = rhs.m_height;
+			m_pitch = rhs.m_pitch;
+			m_pixels = rhs.m_pixels;
+			return *this;
+		}
+
+		image &clear()
+		{
+			// Return to the empty state: remove every stored pixel,
+			// then zero width, height and pitch.
+			m_pixels.clear();
+			m_width = m_height = m_pitch = 0;
+			return *this;
+		}
+
+		image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color)
+		{
+			return crop(w, h, p, background, true); // Thin wrapper: crop() with pixel initialization switched on.
+		}
+
+		image &set_all(const color_rgba &c)
+		{
+			// Overwrite every element held in m_pixels with c.
+			for (color_rgba &px : m_pixels) px = c;
+			return *this;
+		}
+
+		void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) // Loads a tightly packed width x height buffer holding comps (1-4) bytes per pixel.
+		{
+			assert(comps >= 1 && comps <= 4);
+
+			resize(width, height);
+
+			for (uint32_t y = 0; y < height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					const uint8_t *pSrc = &pImage[(static_cast<std::size_t>(y) * width + x) * comps]; // Offset is computed in std::size_t so it cannot wrap at 32 bits.
+					color_rgba &dst = (*this)(x, y);
+
+					if (comps == 1) // Single channel: replicate it into r, g, b; opaque alpha.
+					{
+						dst.r = pSrc[0];
+						dst.g = pSrc[0];
+						dst.b = pSrc[0];
+						dst.a = 255;
+					}
+					else if (comps == 2) // Two channels: first replicated into r, g, b; second becomes alpha.
+					{
+						dst.r = pSrc[0];
+						dst.g = pSrc[0];
+						dst.b = pSrc[0];
+						dst.a = pSrc[1];
+					}
+					else // Three or four channels: r, g, b taken directly.
+					{
+						dst.r = pSrc[0];
+						dst.g = pSrc[1];
+						dst.b = pSrc[2];
+						if (comps == 4)
+							dst.a = pSrc[3];
+						else
+							dst.a = 255; // No alpha channel in the source: treat as opaque.
+					}
+				}
+			}
+		}
+
+		image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c)
+		{
+			for (uint32_t row = 0; row != h; ++row) // Row-major walk over the w x h rectangle at (x, y).
+				for (uint32_t col = 0; col != w; ++col)
+					set_clipped(x + col, y + row, c); // Pixels outside the image are skipped by set_clipped().
+			return *this;
+		}
+
+		image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c)
+		{
+			for (uint32_t row = 0; row != h; ++row) // Same walk as fill_box(), but only c's alpha is written.
+				for (uint32_t col = 0; col != w; ++col)
+					set_clipped_alpha(x + col, y + row, c);
+			return *this;
+		}
+
+		image &crop_dup_borders(uint32_t w, uint32_t h) // Crops to w x h, then fills any area beyond the old size by repeating the nearest original edge pixel.
+		{
+			const uint32_t orig_w = m_width, orig_h = m_height;
+
+			crop(w, h);
+
+			if (orig_w && orig_h) // Nothing to replicate if the image was empty before the crop.
+			{
+				if (m_width > orig_w) // Image grew horizontally: fill the new columns.
+				{
+					for (uint32_t x = orig_w; x < m_width; x++)
+						for (uint32_t y = 0; y < m_height; y++)
+							set_clipped(x, y, get_clamped(min(x, orig_w - 1U), min(y, orig_h - 1U)));
+				}
+
+				if (m_height > orig_h) // Image grew vertically: fill the new rows across the full width.
+				{
+					for (uint32_t y = orig_h; y < m_height; y++)
+						for (uint32_t x = 0; x < m_width; x++)
+							set_clipped(x, y, get_clamped(min(x, orig_w - 1U), min(y, orig_h - 1U)));
+				}
+			}
+			return *this;
+		}
+
+		//// pPixels MUST have been allocated using malloc() (basisu::vector will eventually use free() on the pointer).
+		//image& grant_ownership(color_rgba* pPixels, uint32_t w, uint32_t h, uint32_t p = UINT32_MAX)
+		//{
+		//    if (p == UINT32_MAX)
+		//        p = w;
+
+		//    clear();
+
+		//    if ((!p) || (!w) || (!h))
+		//        return *this;
+
+		//    m_pixels.grant_ownership(pPixels, p * h, p * h);
+
+		//    m_width = w;
+		//    m_height = h;
+		//    m_pitch = p;
+
+		//    return *this;
+		//}
+
+		image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color, bool init_image = true) // Changes the image to w x h with pitch p, keeping the overlapping pixels when init_image is set.
+		{
+			if (p == UINT32_MAX) // UINT32_MAX means "no explicit pitch": use the width.
+				p = w;
+
+			if ((w == m_width) && (m_height == h) && (m_pitch == p)) // Geometry unchanged: nothing to do.
+				return *this;
+
+			if ((!w) || (!h) || (!p)) // Any zero dimension yields an empty image.
+			{
+				clear();
+				return *this;
+			}
+
+			color_rgba_vec cur_state;
+			cur_state.swap(m_pixels); // Move the old pixels aside; m_pixels is now empty.
+
+			m_pixels.resize(p * h);
+
+			if (init_image) // When false, this function does not fill in the new buffer.
+			{
+				if (m_width || m_height) // There was previous content: copy the overlap, fill the rest with background.
+				{
+					for (uint32_t y = 0; y < h; y++)
+					{
+						for (uint32_t x = 0; x < w; x++)
+						{
+							if ((x < m_width) && (y < m_height))
+								m_pixels[x + y * p] = cur_state[x + y * m_pitch]; // Old buffer is indexed with the old pitch.
+							else
+								m_pixels[x + y * p] = background;
+						}
+					}
+				}
+				else
+				{
+					//m_pixels.set_all(background);
+					set_all(background); // Previously empty: fill the whole new buffer.
+				}
+			}
+
+			m_width = w;
+			m_height = h;
+			m_pitch = p;
+
+			return *this;
+		}
+
+		inline const color_rgba &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } // Pixel at (x, y); bounds are checked by assert only.
+		inline color_rgba &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
+
+		inline const color_rgba& get_pixel(uint32_t c) const { return (*this)(c % m_width, c / m_width); } // Pixel by linear index c, counted row by row over the width (not the pitch).
+		inline color_rgba& get_pixel(uint32_t c) { return (*this)(c % m_width, c / m_width); }
+
+		inline const color_rgba &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } // Pixel at (x, y) with each coordinate clamped to [0, size - 1].
+		inline color_rgba &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
+
+		//inline const color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
+		//{
+		//    x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
+		//    y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
+		//    return m_pixels[x + y * m_pitch];
+		//}
+
+		//inline color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
+		//{
+		//    x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
+		//    y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
+		//    return m_pixels[x + y * m_pitch];
+		//}
+
+		inline image &set_clipped(int x, int y, const color_rgba &c)
+		{
+			const bool in_bounds = (static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height); // Negative coordinates become huge unsigned values and fail this test too.
+			if (in_bounds) (*this)(x, y) = c;
+			return *this;
+		}
+
+		inline image& set_clipped_alpha(int x, int y, const color_rgba& c)
+		{
+			const bool in_bounds = (static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height); // Same range test as set_clipped().
+			if (in_bounds) (*this)(x, y).m_comps[3] = c.m_comps[3]; // Only the alpha byte is replaced.
+			return *this;
+		}
+
+		// Simple per-pixel blit of the src_w x src_h rectangle at (src_x, src_y) in src to (dst_x, dst_y) here, clipped on both sides. Not fast.
+		image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
+		{
+			for (int y = 0; y < src_h; y++)
+			{
+				const int sy = src_y + y;
+				if (sy < 0)
+					continue; // Row lies above the source image.
+				else if (sy >= static_cast<int>(src.get_height()))
+					break; // Past the bottom of the source; no later row can be valid.
+
+				for (int x = 0; x < src_w; x++)
+				{
+					const int sx = src_x + x;
+					if (sx < 0)
+						continue; // Column lies left of the source image.
+					else if (sx >= static_cast<int>(src.get_width()))
+						break; // Horizontal bound is the source width (this used to test the height).
+
+					set_clipped(dst_x + x, dst_y + y, src(sx, sy)); // Destination clipping is done by set_clipped().
+				}
+			}
+
+			return *this;
+		}
+
+		const image &extract_block_clamped(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const // Copies a w x h block starting at (src_x, src_y) into pDst, row by row, w pixels per row.
+		{
+			if (((src_x + w) > m_width) || ((src_y + h) > m_height)) // Block reaches past the right or bottom edge.
+			{
+				// Slower clamping case
+				for (uint32_t y = 0; y < h; y++)
+					for (uint32_t x = 0; x < w; x++)
+						*pDst++ = get_clamped(src_x + x, src_y + y); // Out-of-range coordinates read the nearest edge pixel.
+			}
+			else
+			{
+				const color_rgba* pSrc = &m_pixels[src_x + src_y * m_pitch]; // Block is fully inside: copy whole rows.
+
+				for (uint32_t y = 0; y < h; y++)
+				{
+					std::memcpy(pDst, pSrc, w * sizeof(color_rgba));
+					pSrc += m_pitch; // Source advances by the pitch, destination by the block width.
+					pDst += w;
+				}
+			}
+
+			return *this;
+		}
+
+		image &set_block_clipped(const color_rgba *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
+		{
+			for (uint32_t row = 0; row != h; ++row) // pSrc is consumed row by row, one pixel per destination position.
+				for (uint32_t col = 0; col != w; ++col, ++pSrc)
+					set_clipped(dst_x + col, dst_y + row, *pSrc); // Positions outside the image are skipped but still consume a source pixel.
+			return *this;
+		}
+
+		inline uint32_t get_width() const { return m_width; } // Width in pixels.
+		inline uint32_t get_height() const { return m_height; } // Height in pixels.
+		inline uint32_t get_pitch() const { return m_pitch; } // Row stride, in pixels, used to index m_pixels.
+		inline uint32_t get_total_pixels() const { return m_width * m_height; } // Width times height; the pitch is not used here.
+
+		inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; } // Number of w-wide blocks needed to cover the width (rounds up).
+		inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; } // Number of h-tall blocks needed to cover the height (rounds up).
+		inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }
+
+		inline const color_rgba_vec &get_pixels() const { return m_pixels; } // Direct access to the underlying pixel vector.
+		inline color_rgba_vec &get_pixels() { return m_pixels; }
+
+		inline const color_rgba *get_ptr() const { return m_pixels.data(); } // Pointer to the first stored pixel; data() is well-defined on an empty image, unlike &m_pixels[0].
+		inline color_rgba *get_ptr() { return m_pixels.data(); }
+
+		bool has_alpha() const
+		{
+			// True as soon as one pixel inside width x height is not fully opaque (alpha != 255).
+			for (uint32_t row = 0; row != m_height; ++row)
+				for (uint32_t col = 0; col != m_width; ++col)
+					if ((*this)(col, row).a != 255)
+						return true;
+			return false;
+		}
+
+		image &set_alpha(uint8_t a)
+		{
+			for (uint32_t row = 0; row != m_height; ++row) // Writes a into the alpha of every pixel inside width x height.
+				for (uint32_t col = 0; col != m_width; ++col)
+					(*this)(col, row).a = a;
+			return *this;
+		}
+
+		image &flip_y()
+		{
+			for (uint32_t top = 0, bottom = m_height - 1; top < m_height / 2; ++top, --bottom) // Swap each row with its mirror; the middle row of an odd height stays put.
+				for (uint32_t col = 0; col != m_width; ++col)
+					std::swap((*this)(col, top), (*this)(col, bottom));
+			return *this;
+		}
+
+		//// TODO: There are many ways to do this, not sure this is the best way.
+		//image &renormalize_normal_map()
+		//{
+		//    for (uint32_t y = 0; y < m_height; y++)
+		//    {
+		//        for (uint32_t x = 0; x < m_width; x++)
+		//        {
+		//            color_rgba &c = (*this)(x, y);
+		//            if ((c.r == 128) && (c.g == 128) && (c.b == 128))
+		//                continue;
+
+		//            vec3F v(c.r, c.g, c.b);
+		//            v = (v * (2.0f / 255.0f)) - vec3F(1.0f);
+		//            v.clamp(-1.0f, 1.0f);
+
+		//            float length = v.length();
+		//            const float cValidThresh = .077f;
+		//            if (length < cValidThresh)
+		//            {
+		//                c.set(128, 128, 128, c.a);
+		//            }
+		//            else if (fabs(length - 1.0f) > cValidThresh)
+		//            {
+		//                if (length)
+		//                    v /= length;
+
+		//                for (uint32_t i = 0; i < 3; i++)
+		//                    c[i] = static_cast<uint8_t>(clamp<float>(floor((v[i] + 1.0f) * 255.0f * .5f + .5f), 0.0f, 255.0f));
+
+		//                if ((c.g == 128) && (c.r == 128))
+		//                {
+		//                    if (c.b < 128)
+		//                        c.b = 0;
+		//                    else
+		//                        c.b = 255;
+		//                }
+		//            }
+		//        }
+		//    }
+		//    return *this;
+		//}
+
+		// Two images compare equal when their widths and heights match and every pixel
+		// in that area compares equal. The pitch is not part of the comparison.
+		bool operator== (const image& img) const
+		{
+			const bool same_dims = (m_width == img.get_width()) && (m_height == img.get_height());
+			if (!same_dims)
+				return false;
+
+			for (uint32_t row = 0; row < m_height; ++row)
+			{
+				for (uint32_t col = 0; col < m_width; ++col)
+				{
+					if ((*this)(col, row) != img(col, row))
+						return false;
+				}
+			}
+
+			return true;
+		}
+
+		// Logical negation of operator==.
+		bool operator!= (const image& img) const
+		{
+			const bool equal = (*this == img);
+			return !equal;
+		}
+
+		void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);
+
+	private:
+		uint32_t m_width, m_height, m_pitch;  // all in pixels
+		color_rgba_vec m_pixels;
+	};
+
+	// Tag type: the vec constructor taking an eZero calls set_zero(), so vec(cZero) is zero-filled.
+	enum eZero { cZero };
+
+	// Linear algebra
+
+	// Fixed-size vector of N elements of type T with element-wise arithmetic.
+	// The default constructor leaves the elements uninitialized; use vec(cZero) for a zero vector.
+	// The multi-argument set()/constructors ignore arguments beyond N and zero every element
+	// after the supplied ones.
+	template <uint32_t N, typename T>
+	class vec
+	{
+	protected:
+		T m_v[N];
+
+	public:
+		enum { num_elements = N };
+
+		// Default construction does not initialize m_v; the eZero overload zero-fills it.
+		inline vec() { }
+		inline vec(eZero) { set_zero();  }
+
+		// vec(val) broadcasts val to every element; the 2/3/4-argument forms forward to set(...).
+		explicit inline vec(T val) { set(val); }
+		inline vec(T v0, T v1) { set(v0, v1); }
+		inline vec(T v0, T v1, T v2) { set(v0, v1, v2); }
+		inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); }
+		inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; }
+		// Converting constructor: see set(const vec<OtherN, OtherT> &) for truncation/zero-fill rules.
+		template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); }
+
+		// Element access; the index is only checked by assert.
+		inline T operator[](uint32_t i) const { assert(i < N); return m_v[i]; }
+		inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; }
+
+		// Named accessors for the first four elements; getY/getZ/getW fail to compile when N is too small.
+		inline T getX() const { return m_v[0]; }
+		inline T getY() const { static_assert(N >= 2, "N too small"); return m_v[1]; }
+		inline T getZ() const { static_assert(N >= 3, "N too small"); return m_v[2]; }
+		inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; }
+
+		// operator== compares element-wise; operator< is a lexicographic comparison starting at element 0.
+		inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false;	return true; }
+		inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }
+
+		inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }
+
+		// Copies the first min(OtherN, N) elements of other (converted with static_cast<T>) and
+		// zeroes any remaining elements. Setting a vector from itself is a no-op.
+		template <uint32_t OtherN, typename OtherT>
+		inline vec &set(const vec<OtherN, OtherT> &other)
+		{
+			uint32_t i;
+			if ((const void *)(&other) == (const void *)(this))
+				return *this;
+			const uint32_t m = min(OtherN, N);
+			for (i = 0; i < m; i++)
+				m_v[i] = static_cast<T>(other[i]);
+			for (; i < N; i++)
+				m_v[i] = 0;
+			return *this;
+		}
+
+		inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; }
+		// Broadcasts val to every element.
+		inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; }
+		// Zeroes the elements in the half-open index range [s, e).
+		inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; }
+
+		// Sets the first two elements (v1 is ignored when N == 1) and zeroes elements 2..N-1.
+		inline vec &set(T v0, T v1)
+		{
+			m_v[0] = v0;
+			if (N >= 2)
+			{
+				m_v[1] = v1;
+				clear_elements(2, N);
+			}
+			return *this;
+		}
+
+		// Sets the first three elements (arguments beyond N are ignored) and zeroes elements 3..N-1.
+		inline vec &set(T v0, T v1, T v2)
+		{
+			m_v[0] = v0;
+			if (N >= 2)
+			{
+				m_v[1] = v1;
+				if (N >= 3)
+				{
+					m_v[2] = v2;
+					clear_elements(3, N);
+				}
+			}
+			return *this;
+		}
+
+		// Sets the first four elements (arguments beyond N are ignored) and zeroes elements 4..N-1.
+		inline vec &set(T v0, T v1, T v2, T v3)
+		{
+			m_v[0] = v0;
+			if (N >= 2)
+			{
+				m_v[1] = v1;
+				if (N >= 3)
+				{
+					m_v[2] = v2;
+
+					if (N >= 4)
+					{
+						m_v[3] = v3;
+						// Clearing must start at index 4, the first element not supplied by the
+						// caller; starting any later would leave m_v[4] uninitialized when N > 4.
+						clear_elements(4, N);
+					}
+				}
+			}
+			return *this;
+		}
+
+		inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; }
+		template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; }
+
+		// Pointer to the first element of the backing array.
+		inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); }
+		inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); }
+
+		// Element-wise arithmetic. The vec/vec forms of * and / operate per element;
+		// the scalar forms apply the same scalar to every element.
+		inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; }
+		inline vec operator+ () const { return *this; }
+		inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; }
+		inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; }
+		inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; }
+		inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; }
+		inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; }
+		inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; }
+
+		friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; }
+		friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; }
+		friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; }
+		friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; }
+		friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; }
+		friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; }
+
+		static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; }
+
+		inline T dot(const vec &rhs) const { return dot_product(*this, rhs); }
+
+		// Note: norm() is the dot product of the vector with itself (the squared length);
+		// length() is its square root.
+		inline T norm() const { return dot_product(*this, *this); }
+		inline T length() const { return sqrt(norm()); }
+
+		// Squared Euclidean distance, accumulated in T or (for the _d variant) in double.
+		inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; }
+		inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; }
+
+		inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }
+		inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }
+
+		// Scales the vector by 1 / length(); a vector whose length is exactly zero is left unchanged.
+		inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len);	return *this; }
+
+		// Clamps every element to [l, h] via basisu::clamp.
+		inline vec &clamp(T l, T h)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_v[i] = basisu::clamp(m_v[i], l, h);
+			return *this;
+		}
+
+		// Element-wise minimum of a and b.
+		static vec component_min(const vec& a, const vec& b)
+		{
+			vec res;
+			for (uint32_t i = 0; i < N; i++)
+				res[i] = min(a[i], b[i]);
+			return res;
+		}
+
+		// Element-wise maximum of a and b.
+		static vec component_max(const vec& a, const vec& b)
+		{
+			vec res;
+			for (uint32_t i = 0; i < N; i++)
+				res[i] = max(a[i], b[i]);
+			return res;
+		}
+	};
+
+	// Double-precision vectors with 4, 3, 2 and 1 elements.
+	typedef vec<4, double> vec4D;
+	typedef vec<3, double> vec3D;
+	typedef vec<2, double> vec2D;
+	typedef vec<1, double> vec1D;
+
+	// Single-precision vectors with 4, 3, 2 and 1 elements.
+	typedef vec<4, float> vec4F;
+	typedef vec<3, float> vec3F;
+	typedef vec<2, float> vec2F;
+	typedef vec<1, float> vec1F;
+
+	// 16-element single-precision vector.
+	typedef vec<16, float> vec16F;
+
+	// 2D array
+
+	// Dense 2D array of T stored row-major in a single std::vector:
+	// element (x, y) lives at index x + y * m_width.
+	template<typename T>
+	class vector2D
+	{
+		typedef std::vector<T> TVec;
+
+		uint32_t m_width, m_height;
+		TVec m_values;
+
+	public:
+		// Creates an empty 0 x 0 array.
+		vector2D() :
+			m_width(0),
+			m_height(0)
+		{
+		}
+
+		// Creates a w x h array; the elements are value-initialized by std::vector.
+		vector2D(uint32_t w, uint32_t h) :
+			m_width(0),
+			m_height(0)
+		{
+			resize(w, h);
+		}
+
+		// Copies the dimensions and all values of other.
+		vector2D(const vector2D &other) :
+			m_width(other.m_width),
+			m_height(other.m_height),
+			m_values(other.m_values)
+		{
+		}
+
+		vector2D &operator= (const vector2D &other)
+		{
+			if (this != &other)
+			{
+				m_width = other.m_width;
+				m_height = other.m_height;
+				m_values = other.m_values;
+			}
+			return *this;
+		}
+
+		// Equal when both dimensions and all stored values match.
+		inline bool operator== (const vector2D &rhs) const
+		{
+			return (m_width == rhs.m_width) && (m_height == rhs.m_height) && (m_values == rhs.m_values);
+		}
+
+		// Number of stored elements times sizeof(T), returned as a uint32_t.
+		inline uint32_t size_in_bytes() const { return (uint32_t)m_values.size() * sizeof(m_values[0]); }
+
+		// 2D element access; the coordinates are only checked by assert.
+		inline const T &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; }
+		inline T &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; }
+
+		// Linear (row-major) element access without any bounds check.
+		inline const T &operator[] (uint32_t i) const { return m_values[i]; }
+		inline T &operator[] (uint32_t i) { return m_values[i]; }
+
+		// Like operator(), but x and y are first clamped to the last valid column/row,
+		// i.e. to [0, m_width - 1] and [0, m_height - 1]. The array must not be empty.
+		inline const T &at_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, static_cast<int>(m_width) - 1), clamp<int>(y, 0, static_cast<int>(m_height) - 1)); }
+		inline T &at_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, static_cast<int>(m_width) - 1), clamp<int>(y, 0, static_cast<int>(m_height) - 1)); }
+
+		// Resets the array to 0 x 0 and drops all values.
+		void clear()
+		{
+			m_width = 0;
+			m_height = 0;
+			m_values.clear();
+		}
+
+		// Assigns val to every element.
+		void set_all(const T&val)
+		{
+			//vector_set_all(m_values, val);
+			for (size_t i = 0; i < m_values.size(); i++)
+				m_values[i] = val;
+		}
+
+		// Address of element 0 of the backing vector.
+		inline const T* get_ptr() const { return &m_values[0]; }
+		inline T* get_ptr() { return &m_values[0]; }
+
+		// Changes the dimensions to new_width x new_height. Elements inside the region common
+		// to the old and new sizes keep their (x, y) position; all other elements of the new
+		// array are value-initialized. Does nothing when the size is unchanged. Returns *this.
+		vector2D &resize(uint32_t new_width, uint32_t new_height)
+		{
+			if ((m_width == new_width) && (m_height == new_height))
+				return *this;
+
+			// Allocate the new storage, then swap: afterwards m_values is the fresh
+			// new-size buffer and oldVals holds the previous contents.
+			TVec oldVals(new_width * new_height);
+			oldVals.swap(m_values);
+
+			const uint32_t w = min(m_width, new_width);
+			const uint32_t h = min(m_height, new_height);
+
+			if ((w) && (h))
+			{
+				for (uint32_t y = 0; y < h; y++)
+					for (uint32_t x = 0; x < w; x++)
+						m_values[x + y * new_width] = oldVals[x + y * m_width];
+			}
+
+			m_width = new_width;
+			m_height = new_height;
+
+			return *this;
+		}
+	};
+
+} // basisu
+
diff --git a/examples/aliens.png b/examples/aliens.png
new file mode 100644
index 0000000..f8c2e40
Binary files /dev/null and b/examples/aliens.png differ
diff --git a/examples/aliens_2_rdo.png b/examples/aliens_2_rdo.png
new file mode 100644
index 0000000..83d2203
Binary files /dev/null and b/examples/aliens_2_rdo.png differ
diff --git a/examples/aliens_rdo.png b/examples/aliens_rdo.png
new file mode 100644
index 0000000..76bb4fc
Binary files /dev/null and b/examples/aliens_rdo.png differ
diff --git a/examples/crossyf.png b/examples/crossyf.png
new file mode 100644
index 0000000..abd6478
Binary files /dev/null and b/examples/crossyf.png differ
diff --git a/examples/crossyf_2_rdo.png b/examples/crossyf_2_rdo.png
new file mode 100644
index 0000000..f537218
Binary files /dev/null and b/examples/crossyf_2_rdo.png differ
diff --git a/examples/crossyf_rdo.png b/examples/crossyf_rdo.png
new file mode 100644
index 0000000..ed0baa4
Binary files /dev/null and b/examples/crossyf_rdo.png differ
diff --git a/examples/doom.png b/examples/doom.png
new file mode 100644
index 0000000..965959d
Binary files /dev/null and b/examples/doom.png differ
diff --git a/examples/doom_delta.png b/examples/doom_delta.png
new file mode 100644
index 0000000..99ae008
Binary files /dev/null and b/examples/doom_delta.png differ
diff --git a/examples/doom_rdo.png b/examples/doom_rdo.png
new file mode 100644
index 0000000..593f25b
Binary files /dev/null and b/examples/doom_rdo.png differ
diff --git a/examples/gotham.png b/examples/gotham.png
new file mode 100644
index 0000000..cee9849
Binary files /dev/null and b/examples/gotham.png differ
diff --git a/examples/gotham_2_delta.png b/examples/gotham_2_delta.png
new file mode 100644
index 0000000..03c1ae0
Binary files /dev/null and b/examples/gotham_2_delta.png differ
diff --git a/examples/gotham_2_rdo.png b/examples/gotham_2_rdo.png
new file mode 100644
index 0000000..43b3b89
Binary files /dev/null and b/examples/gotham_2_rdo.png differ
diff --git a/examples/gotham_delta.png b/examples/gotham_delta.png
new file mode 100644
index 0000000..01f9a23
Binary files /dev/null and b/examples/gotham_delta.png differ
diff --git a/examples/gotham_rdo.png b/examples/gotham_rdo.png
new file mode 100644
index 0000000..1a8e5c4
Binary files /dev/null and b/examples/gotham_rdo.png differ
diff --git a/examples/high_fidelity.png b/examples/high_fidelity.png
new file mode 100644
index 0000000..fed5254
Binary files /dev/null and b/examples/high_fidelity.png differ
diff --git a/examples/high_fidelity_1.png b/examples/high_fidelity_1.png
new file mode 100644
index 0000000..606f4e8
Binary files /dev/null and b/examples/high_fidelity_1.png differ
diff --git a/examples/high_fidelity_2.png b/examples/high_fidelity_2.png
new file mode 100644
index 0000000..1b8d34b
Binary files /dev/null and b/examples/high_fidelity_2.png differ
diff --git a/examples/joker_768.png b/examples/joker_768.png
new file mode 100644
index 0000000..a72008a
Binary files /dev/null and b/examples/joker_768.png differ
diff --git a/examples/joker_768_2_delta.png b/examples/joker_768_2_delta.png
new file mode 100644
index 0000000..0f2a7b5
Binary files /dev/null and b/examples/joker_768_2_delta.png differ
diff --git a/examples/joker_768_2_rdo.png b/examples/joker_768_2_rdo.png
new file mode 100644
index 0000000..fdfe440
Binary files /dev/null and b/examples/joker_768_2_rdo.png differ
diff --git a/examples/joker_768_3_delta.png b/examples/joker_768_3_delta.png
new file mode 100644
index 0000000..a2faa98
Binary files /dev/null and b/examples/joker_768_3_delta.png differ
diff --git a/examples/joker_768_3_rdo.png b/examples/joker_768_3_rdo.png
new file mode 100644
index 0000000..1184162
Binary files /dev/null and b/examples/joker_768_3_rdo.png differ
diff --git a/examples/joker_768_4_delta.png b/examples/joker_768_4_delta.png
new file mode 100644
index 0000000..dcfec23
Binary files /dev/null and b/examples/joker_768_4_delta.png differ
diff --git a/examples/joker_768_4_rdo.png b/examples/joker_768_4_rdo.png
new file mode 100644
index 0000000..153b8ca
Binary files /dev/null and b/examples/joker_768_4_rdo.png differ
diff --git a/examples/joker_768_delta.png b/examples/joker_768_delta.png
new file mode 100644
index 0000000..e26c9d4
Binary files /dev/null and b/examples/joker_768_delta.png differ
diff --git a/examples/joker_768_rdo.png b/examples/joker_768_rdo.png
new file mode 100644
index 0000000..9bf7df2
Binary files /dev/null and b/examples/joker_768_rdo.png differ
diff --git a/examples/kodim18.png b/examples/kodim18.png
new file mode 100644
index 0000000..8572808
Binary files /dev/null and b/examples/kodim18.png differ
diff --git a/examples/kodim18_delta.png b/examples/kodim18_delta.png
new file mode 100644
index 0000000..8161fa8
Binary files /dev/null and b/examples/kodim18_delta.png differ
diff --git a/examples/kodim18_rdo.png b/examples/kodim18_rdo.png
new file mode 100644
index 0000000..e9c9fcc
Binary files /dev/null and b/examples/kodim18_rdo.png differ
diff --git a/examples/lara_1024.png b/examples/lara_1024.png
new file mode 100644
index 0000000..bd34366
Binary files /dev/null and b/examples/lara_1024.png differ
diff --git a/examples/lara_1024_delta.png b/examples/lara_1024_delta.png
new file mode 100644
index 0000000..5c6ff0d
Binary files /dev/null and b/examples/lara_1024_delta.png differ
diff --git a/examples/lara_1024_rdo.png b/examples/lara_1024_rdo.png
new file mode 100644
index 0000000..8c7ed2d
Binary files /dev/null and b/examples/lara_1024_rdo.png differ
diff --git a/examples/magneto.png b/examples/magneto.png
new file mode 100644
index 0000000..19b6110
Binary files /dev/null and b/examples/magneto.png differ
diff --git a/examples/magneto_2_alpha_delta.png b/examples/magneto_2_alpha_delta.png
new file mode 100644
index 0000000..7651a65
Binary files /dev/null and b/examples/magneto_2_alpha_delta.png differ
diff --git a/examples/magneto_2_delta.png b/examples/magneto_2_delta.png
new file mode 100644
index 0000000..2e1f247
Binary files /dev/null and b/examples/magneto_2_delta.png differ
diff --git a/examples/magneto_2_rdo.png b/examples/magneto_2_rdo.png
new file mode 100644
index 0000000..1cd902f
Binary files /dev/null and b/examples/magneto_2_rdo.png differ
diff --git a/examples/magneto_delta.png b/examples/magneto_delta.png
new file mode 100644
index 0000000..e2adb35
Binary files /dev/null and b/examples/magneto_delta.png differ
diff --git a/examples/magneto_rdo.png b/examples/magneto_rdo.png
new file mode 100644
index 0000000..8995e62
Binary files /dev/null and b/examples/magneto_rdo.png differ
diff --git a/examples/masterchief.png b/examples/masterchief.png
new file mode 100644
index 0000000..73f46fd
Binary files /dev/null and b/examples/masterchief.png differ
diff --git a/examples/masterchief_2_rdo.png b/examples/masterchief_2_rdo.png
new file mode 100644
index 0000000..77d41e0
Binary files /dev/null and b/examples/masterchief_2_rdo.png differ
diff --git a/examples/masterchief_rdo.png b/examples/masterchief_rdo.png
new file mode 100644
index 0000000..1eb4bc1
Binary files /dev/null and b/examples/masterchief_rdo.png differ
diff --git a/examples/minerology.png b/examples/minerology.png
new file mode 100644
index 0000000..590be40
Binary files /dev/null and b/examples/minerology.png differ
diff --git a/examples/minerology_delta.png b/examples/minerology_delta.png
new file mode 100644
index 0000000..e5485c8
Binary files /dev/null and b/examples/minerology_delta.png differ
diff --git a/examples/minerology_rdo.png b/examples/minerology_rdo.png
new file mode 100644
index 0000000..3bbed04
Binary files /dev/null and b/examples/minerology_rdo.png differ
diff --git a/examples/puppy.png b/examples/puppy.png
new file mode 100644
index 0000000..4200121
Binary files /dev/null and b/examples/puppy.png differ
diff --git a/examples/puppy_delta.png b/examples/puppy_delta.png
new file mode 100644
index 0000000..78d2762
Binary files /dev/null and b/examples/puppy_delta.png differ
diff --git a/examples/puppy_rdo.png b/examples/puppy_rdo.png
new file mode 100644
index 0000000..3ff6588
Binary files /dev/null and b/examples/puppy_rdo.png differ
diff --git a/examples/stp.png b/examples/stp.png
new file mode 100644
index 0000000..5e5810a
Binary files /dev/null and b/examples/stp.png differ
diff --git a/examples/stp_2_delta.png b/examples/stp_2_delta.png
new file mode 100644
index 0000000..075e23a
Binary files /dev/null and b/examples/stp_2_delta.png differ
diff --git a/examples/stp_2_rdo.png b/examples/stp_2_rdo.png
new file mode 100644
index 0000000..a08c29b
Binary files /dev/null and b/examples/stp_2_rdo.png differ
diff --git a/examples/stp_3_delta.png b/examples/stp_3_delta.png
new file mode 100644
index 0000000..880e45a
Binary files /dev/null and b/examples/stp_3_delta.png differ
diff --git a/examples/stp_3_rdo.png b/examples/stp_3_rdo.png
new file mode 100644
index 0000000..1f75162
Binary files /dev/null and b/examples/stp_3_rdo.png differ
diff --git a/examples/stp_delta.png b/examples/stp_delta.png
new file mode 100644
index 0000000..a7a53c1
Binary files /dev/null and b/examples/stp_delta.png differ
diff --git a/examples/stp_rdo.png b/examples/stp_rdo.png
new file mode 100644
index 0000000..00e7436
Binary files /dev/null and b/examples/stp_rdo.png differ
diff --git a/examples/waterfall.png b/examples/waterfall.png
new file mode 100644
index 0000000..8c3e411
Binary files /dev/null and b/examples/waterfall.png differ
diff --git a/examples/waterfall_delta.png b/examples/waterfall_delta.png
new file mode 100644
index 0000000..99ad823
Binary files /dev/null and b/examples/waterfall_delta.png differ
diff --git a/examples/waterfall_rdo.png b/examples/waterfall_rdo.png
new file mode 100644
index 0000000..5d78081
Binary files /dev/null and b/examples/waterfall_rdo.png differ
diff --git a/examples/xfiles_768.png b/examples/xfiles_768.png
new file mode 100644
index 0000000..2eb0b91
Binary files /dev/null and b/examples/xfiles_768.png differ
diff --git a/examples/xfiles_768_2_delta.png b/examples/xfiles_768_2_delta.png
new file mode 100644
index 0000000..004a85b
Binary files /dev/null and b/examples/xfiles_768_2_delta.png differ
diff --git a/examples/xfiles_768_2_rdo.png b/examples/xfiles_768_2_rdo.png
new file mode 100644
index 0000000..3f023a6
Binary files /dev/null and b/examples/xfiles_768_2_rdo.png differ
diff --git a/examples/xfiles_768_delta.png b/examples/xfiles_768_delta.png
new file mode 100644
index 0000000..83e1b4a
Binary files /dev/null and b/examples/xfiles_768_delta.png differ
diff --git a/examples/xfiles_768_rdo.png b/examples/xfiles_768_rdo.png
new file mode 100644
index 0000000..423ffa8
Binary files /dev/null and b/examples/xfiles_768_rdo.png differ
diff --git a/qoi.h b/qoi.h
new file mode 100644
index 0000000..5583bad
--- /dev/null
+++ b/qoi.h
@@ -0,0 +1,672 @@
+/*
+
+QOI - The "Quite OK Image" format for fast, lossless image compression
+
+Dominic Szablewski - https://phoboslab.org
+
+
+-- LICENSE: The MIT License(MIT)
+
+Copyright(c) 2021 Dominic Szablewski
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files(the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions :
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+-- About
+
+QOI encodes and decodes images in a lossless format. Compared to stb_image and
+stb_image_write QOI offers 20x-50x faster encoding, 3x-4x faster decoding and
+20% better compression.
+
+
+-- Synopsis
+
+// Define `QOI_IMPLEMENTATION` in *one* C/C++ file before including this
+// library to create the implementation.
+
+#define QOI_IMPLEMENTATION
+#include "qoi.h"
+
+// Encode and store an RGBA buffer to the file system. The qoi_desc describes
+// the input pixel data.
+qoi_write("image_new.qoi", rgba_pixels, &(qoi_desc){
+	.width = 1920,
+	.height = 1080,
+	.channels = 4,
+	.colorspace = QOI_SRGB
+});
+
+// Load and decode a QOI image from the file system into a 32bpp RGBA buffer.
+// The qoi_desc struct will be filled with the width, height, number of channels
+// and colorspace read from the file header.
+qoi_desc desc;
+void *rgba_pixels = qoi_read("image.qoi", &desc, 4);
+
+
+
+-- Documentation
+
+This library provides the following functions:
+- qoi_read    -- read and decode a QOI file
+- qoi_decode  -- decode the raw bytes of a QOI image from memory
+- qoi_write   -- encode and write a QOI file
+- qoi_encode  -- encode an rgba buffer into a QOI image in memory
+
+See the function declaration below for the signature and more information.
+
+If you don't want/need the qoi_read and qoi_write functions, you can define
+QOI_NO_STDIO before including this library.
+
+This library uses malloc() and free(). To supply your own malloc implementation
+you can define QOI_MALLOC and QOI_FREE before including this library.
+
+This library uses memset() to zero-initialize the index. To supply your own
+implementation you can define QOI_ZEROARR before including this library.
+
+
+-- Data Format
+
+A QOI file has a 14 byte header, followed by any number of data "chunks" and an
+8-byte end marker.
+
+struct qoi_header_t {
+	char     magic[4];   // magic bytes "qoif"
+	uint32_t width;      // image width in pixels (BE)
+	uint32_t height;     // image height in pixels (BE)
+	uint8_t  channels;   // 3 = RGB, 4 = RGBA
+	uint8_t  colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear
+};
+
+Images are encoded row by row, left to right, top to bottom. The decoder and
+encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous pixel value. An
+image is complete when all pixels specified by width * height have been covered.
+
+Pixels are encoded as
+ - a run of the previous pixel
+ - an index into an array of previously seen pixels
+ - a difference to the previous pixel value in r,g,b
+ - full r,g,b or r,g,b,a values
+
+The color channels are assumed to not be premultiplied with the alpha channel
+("un-premultiplied alpha").
+
+A running array[64] (zero-initialized) of previously seen pixel values is
+maintained by the encoder and decoder. Each pixel that is seen by the encoder
+and decoder is put into this array at the position formed by a hash function of
+the color value. In the encoder, if the pixel value at the index matches the
+current pixel, this index position is written to the stream as QOI_OP_INDEX.
+The hash function for the index is:
+
+	index_position = (r * 3 + g * 5 + b * 7 + a * 11) % 64
+
+Each chunk starts with a 2- or 8-bit tag, followed by a number of data bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
+values encoded in these data bits have the most significant bit on the left.
+
+The 8-bit tags have precedence over the 2-bit tags. A decoder must check for the
+presence of an 8-bit tag first.
+
+The byte stream's end is marked with 7 0x00 bytes followed by a single 0x01 byte.
+
+
+The possible chunks are:
+
+
+.- QOI_OP_INDEX ----------.
+|         Byte[0]         |
+|  7  6  5  4  3  2  1  0 |
+|-------+-----------------|
+|  0  0 |     index       |
+`-------------------------`
+2-bit tag b00
+6-bit index into the color index array: 0..63
+
+A valid encoder must not issue 2 or more consecutive QOI_OP_INDEX chunks to the
+same index. QOI_OP_RUN should be used instead.
+
+
+.- QOI_OP_DIFF -----------.
+|         Byte[0]         |
+|  7  6  5  4  3  2  1  0 |
+|-------+-----+-----+-----|
+|  0  1 |  dr |  dg |  db |
+`-------------------------`
+2-bit tag b01
+2-bit   red channel difference from the previous pixel between -2..1
+2-bit green channel difference from the previous pixel between -2..1
+2-bit  blue channel difference from the previous pixel between -2..1
+
+The differences to the current channel values use a wraparound operation,
+so "1 - 2" will result in 255, while "255 + 1" will result in 0.
+
+Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as
+0 (b00). 1 is stored as 3 (b11).
+
+The alpha value remains unchanged from the previous pixel.
+
+
+.- QOI_OP_LUMA -------------------------------------.
+|         Byte[0]         |         Byte[1]         |
+|  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
+|-------+-----------------+-------------+-----------|
+|  1  0 |  green diff     |   dr - dg   |  db - dg  |
+`---------------------------------------------------`
+2-bit tag b10
+6-bit green channel difference from the previous pixel -32..31
+4-bit   red channel difference minus green channel difference -8..7
+4-bit  blue channel difference minus green channel difference -8..7
+
+The green channel is used to indicate the general direction of change and is
+encoded in 6 bits. The red and blue channels (dr and db) base their diffs off
+of the green channel difference and are encoded in 4 bits. I.e.:
+	dr_dg = (cur_px.r - prev_px.r) - (cur_px.g - prev_px.g)
+	db_dg = (cur_px.b - prev_px.b) - (cur_px.g - prev_px.g)
+
+The differences to the current channel values use a wraparound operation,
+so "10 - 13" will result in 253, while "250 + 7" will result in 1.
+
+Values are stored as unsigned integers with a bias of 32 for the green channel
+and a bias of 8 for the red and blue channel.
+
+The alpha value remains unchanged from the previous pixel.
+
+
+.- QOI_OP_RUN ------------.
+|         Byte[0]         |
+|  7  6  5  4  3  2  1  0 |
+|-------+-----------------|
+|  1  1 |       run       |
+`-------------------------`
+2-bit tag b11
+6-bit run-length repeating the previous pixel: 1..62
+
+The run-length is stored with a bias of -1. Note that the run-lengths 63 and 64
+(b111110 and b111111) are illegal as they are occupied by the QOI_OP_RGB and
+QOI_OP_RGBA tags.
+
+
+.- QOI_OP_RGB ------------------------------------------.
+|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] |
+|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  |
+|-------------------------+---------+---------+---------|
+|  1  1  1  1  1  1  1  0 |   red   |  green  |  blue   |
+`-------------------------------------------------------`
+8-bit tag b11111110
+8-bit   red channel value
+8-bit green channel value
+8-bit  blue channel value
+
+The alpha value remains unchanged from the previous pixel.
+
+
+.- QOI_OP_RGBA ---------------------------------------------------.
+|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
+|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  | 7 .. 0  |
+|-------------------------+---------+---------+---------+---------|
+|  1  1  1  1  1  1  1  1 |   red   |  green  |  blue   |  alpha  |
+`-----------------------------------------------------------------`
+8-bit tag b11111111
+8-bit   red channel value
+8-bit green channel value
+8-bit  blue channel value
+8-bit alpha channel value
+
+*/
+
+
+/* -----------------------------------------------------------------------------
+Header - Public functions */
+
+#ifndef QOI_H
+#define QOI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions.
+It describes either the input format (for qoi_write and qoi_encode), or is
+filled with the description read from the file header (for qoi_read and
+qoi_decode).
+
+The colorspace in this qoi_desc is an enum where
+	0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
+	1 = all channels are linear
+You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely
+informative. It will be saved to the file header, but does not affect
+how chunks are en-/decoded. */
+
+#define QOI_SRGB   0
+#define QOI_LINEAR 1
+
+/* Image description passed to (encode/write) or filled in by (decode/read) the qoi functions. */
+typedef struct {
+	unsigned int width; /* image width in pixels */
+	unsigned int height; /* image height in pixels */
+	unsigned char channels; /* 3 = RGB, 4 = RGBA */
+	unsigned char colorspace; /* QOI_SRGB (0) or QOI_LINEAR (1); purely informative */
+} qoi_desc;
+
+#ifndef QOI_NO_STDIO
+
+/* Encode raw RGB or RGBA pixels into a QOI image and write it to the file
+system. The qoi_desc struct must be filled with the image width, height,
+number of channels (3 = RGB, 4 = RGBA) and the colorspace.
+
+The function returns 0 on failure (invalid parameters, or fopen or malloc
+failed) or the number of bytes written on success. */
+
+int qoi_write(const char *filename, const void *data, const qoi_desc *desc);
+
+
+/* Read and decode a QOI image from the file system. If channels is 0, the
+number of channels from the file header is used. If channels is 3 or 4 the
+output format will be forced into this number of channels.
+
+The function either returns NULL on failure (invalid data, or malloc or fopen
+failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
+will be filled with the description from the file header.
+
+The returned pixel data should be free()d after use. */
+
+void *qoi_read(const char *filename, qoi_desc *desc, int channels);
+
+#endif /* QOI_NO_STDIO */
+
+
+/* Encode raw RGB or RGBA pixels into a QOI image in memory.
+
+The function either returns NULL on failure (invalid parameters or malloc
+failed) or a pointer to the encoded data on success. On success the out_len
+is set to the size in bytes of the encoded data.
+
+The returned qoi data should be free()d after use. */
+
+void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len);
+
+
+/* Decode a QOI image from memory.
+
+The function either returns NULL on failure (invalid parameters or malloc
+failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
+is filled with the description from the file header.
+
+The returned pixel data should be free()d after use. */
+
+void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* QOI_H */
+
+
+/* -----------------------------------------------------------------------------
+Implementation */
+
+#ifdef QOI_IMPLEMENTATION
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef QOI_MALLOC
+	#define QOI_MALLOC(sz) malloc(sz)
+	#define QOI_FREE(p)    free(p)
+#endif
+#ifndef QOI_ZEROARR
+	#define QOI_ZEROARR(a) memset((a),0,sizeof(a))
+#endif
+
+#define QOI_OP_INDEX  0x00 /* 00xxxxxx */
+#define QOI_OP_DIFF   0x40 /* 01xxxxxx */
+#define QOI_OP_LUMA   0x80 /* 10xxxxxx */
+#define QOI_OP_RUN    0xc0 /* 11xxxxxx */
+#define QOI_OP_RGB    0xfe /* 11111110 */
+#define QOI_OP_RGBA   0xff /* 11111111 */
+
+#define QOI_MASK_2    0xc0 /* 11000000 */
+
+#define QOI_COLOR_HASH(C) ((C).rgba.r*3 + (C).rgba.g*5 + (C).rgba.b*7 + (C).rgba.a*11) /* C is parenthesized so any expression works; callers reduce the result modulo 64 */
+#define QOI_MAGIC \
+	(((unsigned int)'q') << 24 | ((unsigned int)'o') << 16 | \
+	 ((unsigned int)'i') <<  8 | ((unsigned int)'f'))
+#define QOI_HEADER_SIZE 14
+
+/* 2GB is the max file size that this implementation can safely handle. We guard
+against anything larger than that, assuming the worst case with 5 bytes per
+pixel, rounded down to a nice clean value. 400 million pixels ought to be
+enough for anybody. */
+#define QOI_PIXELS_MAX ((unsigned int)400000000)
+
+typedef union {
+	struct { unsigned char r, g, b, a; } rgba; /* per-channel access */
+	unsigned int v; /* same storage as one integer; used to compare whole pixels (presumably relies on a 4-byte unsigned int) */
+} qoi_rgba_t;
+
+static const unsigned char qoi_padding[8] = {0,0,0,0,0,0,0,1};
+
+static void qoi_write_32(unsigned char *bytes, int *p, unsigned int v) { /* store v big-endian at bytes[*p]; *p advances by 4 */
+	int shift;
+	for (shift = 24; shift >= 0; shift -= 8) {
+		bytes[(*p)++] = (unsigned char)((v >> shift) & 0xff);
+	}
+}
+
+static unsigned int qoi_read_32(const unsigned char *bytes, int *p) { /* load a big-endian 32-bit value from bytes[*p]; *p advances by 4 */
+	unsigned int k, value = 0;
+	for (k = 0; k < 4; k++) {
+		value = (value << 8) | bytes[(*p)++];
+	}
+	return value;
+}
+
+void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) {
+	int i, max_size, p, run;
+	int px_len, px_end, px_pos, channels;
+	unsigned char *bytes;
+	const unsigned char *pixels;
+	qoi_rgba_t index[64];
+	qoi_rgba_t px, px_prev;
+	/* Returns a QOI_MALLOC'd buffer (its length is stored in *out_len), or NULL on invalid arguments or allocation failure. */
+	if (
+		data == NULL || out_len == NULL || desc == NULL ||
+		desc->width == 0 || desc->height == 0 ||
+		desc->channels < 3 || desc->channels > 4 ||
+		desc->colorspace > 1 ||
+		desc->height >= QOI_PIXELS_MAX / desc->width
+	) {
+		return NULL;
+	}
+	/* Worst case: every pixel costs channels + 1 bytes, plus the header and the end padding. */
+	max_size =
+		desc->width * desc->height * (desc->channels + 1) +
+		QOI_HEADER_SIZE + sizeof(qoi_padding);
+
+	p = 0;
+	bytes = (unsigned char *) QOI_MALLOC(max_size);
+	if (!bytes) {
+		return NULL;
+	}
+
+	qoi_write_32(bytes, &p, QOI_MAGIC);
+	qoi_write_32(bytes, &p, desc->width);
+	qoi_write_32(bytes, &p, desc->height);
+	bytes[p++] = desc->channels;
+	bytes[p++] = desc->colorspace;
+
+
+	pixels = (const unsigned char *)data;
+
+	QOI_ZEROARR(index);
+
+	run = 0;
+	px_prev.rgba.r = 0;
+	px_prev.rgba.g = 0;
+	px_prev.rgba.b = 0;
+	px_prev.rgba.a = 255;
+	px = px_prev;
+
+	px_len = desc->width * desc->height * desc->channels;
+	px_end = px_len - desc->channels;
+	channels = desc->channels;
+
+	for (px_pos = 0; px_pos < px_len; px_pos += channels) {
+		/* Load channel by channel: `data` is not required to be aligned for qoi_rgba_t. */
+		/* With 3 channels px.rgba.a is never written and stays at its initial 255. */
+		px.rgba.r = pixels[px_pos + 0];
+		px.rgba.g = pixels[px_pos + 1];
+		px.rgba.b = pixels[px_pos + 2];
+		if (channels == 4) {
+			px.rgba.a = pixels[px_pos + 3];
+		}
+
+		if (px.v == px_prev.v) {
+			run++;
+			if (run == 62 || px_pos == px_end) { /* flush at the maximum run length or on the last pixel */
+				bytes[p++] = QOI_OP_RUN | (run - 1);
+				run = 0;
+			}
+		}
+		else {
+			int index_pos;
+
+			if (run > 0) { /* a pending run ends with this differing pixel */
+				bytes[p++] = QOI_OP_RUN | (run - 1);
+				run = 0;
+			}
+
+			index_pos = QOI_COLOR_HASH(px) % 64;
+
+			if (index[index_pos].v == px.v) {
+				bytes[p++] = QOI_OP_INDEX | index_pos;
+			}
+			else {
+				index[index_pos] = px;
+
+				if (px.rgba.a == px_prev.rgba.a) {
+					signed char vr = px.rgba.r - px_prev.rgba.r;
+					signed char vg = px.rgba.g - px_prev.rgba.g;
+					signed char vb = px.rgba.b - px_prev.rgba.b;
+
+					signed char vg_r = vr - vg;
+					signed char vg_b = vb - vg;
+
+					if (
+						vr > -3 && vr < 2 &&
+						vg > -3 && vg < 2 &&
+						vb > -3 && vb < 2
+					) {
+						bytes[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
+					}
+					else if (
+						vg_r >  -9 && vg_r <  8 &&
+						vg   > -33 && vg   < 32 &&
+						vg_b >  -9 && vg_b <  8
+					) {
+						bytes[p++] = QOI_OP_LUMA     | (vg   + 32);
+						bytes[p++] = (vg_r + 8) << 4 | (vg_b +  8);
+					}
+					else {
+						bytes[p++] = QOI_OP_RGB;
+						bytes[p++] = px.rgba.r;
+						bytes[p++] = px.rgba.g;
+						bytes[p++] = px.rgba.b;
+					}
+				}
+				else {
+					bytes[p++] = QOI_OP_RGBA;
+					bytes[p++] = px.rgba.r;
+					bytes[p++] = px.rgba.g;
+					bytes[p++] = px.rgba.b;
+					bytes[p++] = px.rgba.a;
+				}
+			}
+		}
+		px_prev = px;
+	}
+
+	for (i = 0; i < (int)sizeof(qoi_padding); i++) {
+		bytes[p++] = qoi_padding[i];
+	}
+
+	*out_len = p;
+	return bytes;
+}
+
+void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) {
+	const unsigned char *bytes;
+	unsigned int header_magic;
+	unsigned char *pixels;
+	qoi_rgba_t index[64];
+	qoi_rgba_t px;
+	int px_len, chunks_len, px_pos;
+	int p = 0, run = 0;
+	/* Returns a QOI_MALLOC'd pixel buffer, or NULL on invalid arguments, an invalid header or allocation failure. */
+	if (
+		data == NULL || desc == NULL ||
+		(channels != 0 && channels != 3 && channels != 4) ||
+		size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding)
+	) {
+		return NULL;
+	}
+
+	bytes = (const unsigned char *)data;
+	/* Header layout: magic, width, height (32 bits each, big-endian), then channels and colorspace (one byte each). */
+	header_magic = qoi_read_32(bytes, &p);
+	desc->width = qoi_read_32(bytes, &p);
+	desc->height = qoi_read_32(bytes, &p);
+	desc->channels = bytes[p++];
+	desc->colorspace = bytes[p++];
+
+	if (
+		desc->width == 0 || desc->height == 0 ||
+		desc->channels < 3 || desc->channels > 4 ||
+		desc->colorspace > 1 ||
+		header_magic != QOI_MAGIC ||
+		desc->height >= QOI_PIXELS_MAX / desc->width
+	) {
+		return NULL;
+	}
+
+	if (channels == 0) { /* 0 means: use the channel count stored in the header */
+		channels = desc->channels;
+	}
+
+	px_len = desc->width * desc->height * channels;
+	pixels = (unsigned char *) QOI_MALLOC(px_len);
+	if (!pixels) {
+		return NULL;
+	}
+
+	QOI_ZEROARR(index);
+	px.rgba.r = 0;
+	px.rgba.g = 0;
+	px.rgba.b = 0;
+	px.rgba.a = 255;
+
+	chunks_len = size - (int)sizeof(qoi_padding); /* chunks are only started before the padding; a chunk's operand bytes may extend into it */
+	for (px_pos = 0; px_pos < px_len; px_pos += channels) {
+		/* Each iteration emits one pixel: continue a run, decode one chunk, or repeat px when the input is exhausted. */
+		if (run > 0) {
+			run--;
+		}
+		else if (p < chunks_len) {
+			int b1 = bytes[p++];
+
+			if (b1 == QOI_OP_RGB) { /* alpha is left unchanged */
+				px.rgba.r = bytes[p++];
+				px.rgba.g = bytes[p++];
+				px.rgba.b = bytes[p++];
+			}
+			else if (b1 == QOI_OP_RGBA) {
+				px.rgba.r = bytes[p++];
+				px.rgba.g = bytes[p++];
+				px.rgba.b = bytes[p++];
+				px.rgba.a = bytes[p++];
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) {
+				px = index[b1]; /* the tag bits are 00, so b1 itself is the 0..63 index */
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) { /* three 2-bit channel differences, each stored with a bias of 2 */
+				px.rgba.r += ((b1 >> 4) & 0x03) - 2;
+				px.rgba.g += ((b1 >> 2) & 0x03) - 2;
+				px.rgba.b += ( b1       & 0x03) - 2;
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) { /* 6-bit green difference (bias 32); red/blue differences are relative to green (4 bits, bias 8) */
+				int b2 = bytes[p++];
+				int vg = (b1 & 0x3f) - 32;
+				px.rgba.r += vg - 8 + ((b2 >> 4) & 0x0f);
+				px.rgba.g += vg;
+				px.rgba.b += vg - 8 +  (b2       & 0x0f);
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
+				run = (b1 & 0x3f); /* the encoder stores run - 1; this iteration emits the first pixel of the run */
+			}
+
+			index[QOI_COLOR_HASH(px) % 64] = px;
+		}
+
+		if (channels == 4) {
+			*(qoi_rgba_t*)(pixels + px_pos) = px;
+		}
+		else {
+			pixels[px_pos + 0] = px.rgba.r;
+			pixels[px_pos + 1] = px.rgba.g;
+			pixels[px_pos + 2] = px.rgba.b;
+		}
+	}
+
+	return pixels;
+}
+
+#ifndef QOI_NO_STDIO
+#include <stdio.h>
+
+int qoi_write(const char *filename, const void *data, const qoi_desc *desc) {
+	/* Encodes `data` and writes it to `filename`; returns the byte count, or 0 if fopen, qoi_encode or the write failed. */
+	FILE *f = fopen(filename, "wb");
+	int size, err;
+	void *encoded;
+
+	if (!f) {
+		return 0;
+	}
+
+	encoded = qoi_encode(data, desc, &size);
+	if (!encoded) {
+		fclose(f);
+		return 0;
+	}
+	/* A short write or a failing close (which flushes) leaves a truncated file: report that as failure. */
+	err = (fwrite(encoded, 1, size, f) != (size_t)size);
+	err |= (fclose(f) != 0);
+	QOI_FREE(encoded);
+	return err ? 0 : size;
+}
+
+void *qoi_read(const char *filename, qoi_desc *desc, int channels) {
+	/* Loads the whole file and decodes it with qoi_decode; returns NULL on any I/O, allocation or decode failure. */
+	FILE *f = fopen(filename, "rb");
+	long file_size;
+	int size, bytes_read;
+	void *pixels, *data;
+
+	if (!f) {
+		return NULL;
+	}
+
+	/* Size the file; a failed seek, an empty file or one too large for the int-based decoder is an error. */
+	if (fseek(f, 0, SEEK_END) != 0 || (file_size = ftell(f)) <= 0 || file_size > 0x7fffffffL || fseek(f, 0, SEEK_SET) != 0) {
+		fclose(f);
+		return NULL;
+	}
+	size = (int)file_size;
+
+	data = QOI_MALLOC(size);
+	if (!data) {
+		fclose(f);
+		return NULL;
+	}
+	bytes_read = (int)fread(data, 1, size, f);
+	fclose(f);
+	/* A short read means the file was not fully loaded; fail instead of decoding a truncated image. */
+	pixels = (bytes_read != size) ? NULL : qoi_decode(data, bytes_read, desc, channels);
+	QOI_FREE(data);
+	return pixels;
+}
+
+#endif /* QOI_NO_STDIO */
+#endif /* QOI_IMPLEMENTATION */
diff --git a/qoirdo.cpp b/qoirdo.cpp
new file mode 100644
index 0000000..9573594
--- /dev/null
+++ b/qoirdo.cpp
@@ -0,0 +1,1212 @@
+﻿// qoirdo.cpp
+// Copyright (C) 2022 Richard Geldreich, Jr. All Rights Reserved.
+// Copyright (C) 2025 Erik Scholz
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "./qoirdo.hpp"
+
+#if _MSC_VER
+// For sprintf(), strcpy()
+#define _CRT_SECURE_NO_WARNINGS (1)
+#endif
+
+#include <cstdint>
+#include <cstdio>
+#include <cmath>
+#include <string>
+#include <array>
+#include <vector>
+
+#include "./basisu.min.hpp"
+
+using namespace basisu;
+
+#define RDO_PNG_VERSION "v1.10"
+
+const float DEF_MAX_SMOOTH_STD_DEV = 35.0f;
+const float DEF_SMOOTH_MAX_MSE_SCALE = 250.0f;
+const float DEF_MAX_ULTRA_SMOOTH_STD_DEV = 5.0F;
+const float DEF_ULTRA_SMOOTH_MAX_MSE_SCALE = 1500.0F;
+
+const float QOI_DEF_SMOOTH_MAX_MSE_SCALE = 2500.0f;
+const float QOI_DEF_ULTRA_SMOOTH_MAX_MSE_SCALE = 5000.0f;
+
+enum speed_mode
+{
+	cNormalSpeed, // encode_rdo_qoi searches every encodable lossy LUMA command in this mode
+	cFasterSpeed, // NOTE(review): lossy LUMA is still attempted; presumably a reduced search - confirm in encode_rdo_qoi
+	cFastestSpeed // encode_rdo_qoi skips the lossy LUMA search in this mode; default set by rdo_png_params::clear()
+};
+
+struct rdo_png_params // settings and result fields read by compute_se, should_reject and encode_rdo_qoi
+{
+	rdo_png_params()
+	{
+		clear();
+	}
+
+	void clear() // restores every field to its default, including both images and the output bytes
+	{
+		m_orig_img.clear();
+		m_output_image.clear();
+		m_output_file_data.clear();
+		m_lambda = 300.0f;
+		m_level = 0;
+		m_psnr = 0;
+		m_angular_rms_error = 0;
+		m_y_psnr = 0;
+		m_bpp = 0;
+		m_print_debug_output = false;
+		m_debug_images = false;
+		m_print_progress = false;
+		m_print_stats = false;
+
+		m_use_chan_weights = false;
+		m_chan_weights[0] = 1;
+		m_chan_weights[1] = 1;
+		m_chan_weights[2] = 1;
+		m_chan_weights[3] = 1;
+
+		{
+			// Lab weights in the ratio 2 : 1.5 : 1, normalized to unit length.
+			float LW = 2;
+			float AW = 1.5;
+			float BW = 1;
+			float l = sqrtf(LW * LW + AW * AW + BW * BW);
+			LW /= l;
+			AW /= l;
+			BW /= l;
+			m_chan_weights_lab[0] = LW; // L
+			m_chan_weights_lab[1] = AW; // a
+			m_chan_weights_lab[2] = BW; // b
+			m_chan_weights_lab[3] = 1.5f; // alpha
+		}
+
+		m_use_reject_thresholds = true;
+		m_reject_thresholds[0] = 32;
+		m_reject_thresholds[1] = 32;
+		m_reject_thresholds[2] = 32;
+		m_reject_thresholds[3] = 32;
+
+		m_reject_thresholds_lab[0] = .05f;
+		//m_reject_thresholds_lab[1] = .075f;
+		m_reject_thresholds_lab[1] = .05f;
+
+		m_transparent_reject_test = false;
+
+		m_perceptual_error = true;
+
+		m_match_only = false;
+		m_two_pass = false;
+
+		m_alpha_is_opacity = true;
+
+		m_speed_mode = cFastestSpeed;
+
+		m_max_smooth_std_dev = DEF_MAX_SMOOTH_STD_DEV;
+		m_smooth_max_mse_scale = DEF_SMOOTH_MAX_MSE_SCALE;
+		m_max_ultra_smooth_std_dev = DEF_MAX_ULTRA_SMOOTH_STD_DEV;
+		m_ultra_smooth_max_mse_scale = DEF_ULTRA_SMOOTH_MAX_MSE_SCALE;
+
+		m_no_mse_scaling = false;
+	}
+
+	void print() const // dumps the current settings to stdout; const so it also works on const instances
+	{
+		printf("orig image: %ux%u has alpha: %u\n", m_orig_img.get_width(), m_orig_img.get_height(), m_orig_img.has_alpha());
+		printf("lambda: %f\n", m_lambda);
+		printf("level: %u\n", m_level);
+		printf("chan weights: %u %u %u %u\n", m_chan_weights[0], m_chan_weights[1], m_chan_weights[2], m_chan_weights[3]);
+		printf("use chan weights: %u\n", m_use_chan_weights);
+		printf("chan weights lab: %f %f %f %f\n", m_chan_weights_lab[0], m_chan_weights_lab[1], m_chan_weights_lab[2], m_chan_weights_lab[3]);
+		printf("reject thresholds: %u %u %u %u\n", m_reject_thresholds[0], m_reject_thresholds[1], m_reject_thresholds[2], m_reject_thresholds[3]);
+		printf("reject thresholds lab: %f %f\n", m_reject_thresholds_lab[0], m_reject_thresholds_lab[1]);
+		printf("use reject thresholds: %u\n", m_use_reject_thresholds);
+		printf("transparent reject test: %u\n", m_transparent_reject_test);
+		printf("print debug output: %u\n", m_print_debug_output);
+		printf("debug images: %u\n", m_debug_images);
+		printf("print progress: %u\n", m_print_progress);
+		printf("print stats: %u\n", m_print_stats);
+		printf("perceptual error: %u\n", m_perceptual_error);
+		printf("match only: %u\n", m_match_only);
+		printf("two pass: %u\n", m_two_pass);
+		printf("alpha is opacity: %u\n", m_alpha_is_opacity);
+		printf("speed mode: %u\n", (uint32_t)m_speed_mode);
+		printf("max smooth std dev: %f\n", m_max_smooth_std_dev);
+		printf("smooth max mse scale: %f\n", m_smooth_max_mse_scale);
+		printf("max ultra smooth std dev: %f\n", m_max_ultra_smooth_std_dev);
+		printf("ultra smooth max mse scale: %f\n", m_ultra_smooth_max_mse_scale);
+		printf("no MSE scaling: %u\n", m_no_mse_scaling);
+	}
+
+	// TODO: results - move
+	float m_psnr;
+	float m_angular_rms_error;
+	float m_y_psnr;
+	float m_bpp;
+
+	// This is the output image data, but note for PNG you can't save it at the right size without the scanline predictor values.
+	image m_output_image;
+
+	image m_orig_img;
+
+	std::vector<uint8_t> m_output_file_data;
+
+	float m_lambda; // default 300.0f
+
+	uint32_t m_level; // default 0
+
+	uint32_t m_chan_weights[4]; // R, G, B, A weights for compute_se when m_use_chan_weights is set; default 1 each
+	float m_chan_weights_lab[4]; // L, a, b, alpha weights for the perceptual metric in compute_se
+	bool m_use_chan_weights; // only consulted by compute_se when m_perceptual_error is false
+
+	uint32_t m_reject_thresholds[4]; // per-channel absolute-difference limits in should_reject; default 32 each
+	float m_reject_thresholds_lab[2]; // [0] limits the L difference, [1] the a/b distance in should_reject
+	bool m_use_reject_thresholds; // default true
+
+	bool m_transparent_reject_test; // should_reject then refuses to change alpha 0 or alpha 255 pixels; default false
+
+	bool m_print_debug_output;
+	bool m_debug_images;
+	bool m_print_progress;
+	bool m_print_stats;
+
+	bool m_perceptual_error; // selects the Oklab-based metric in compute_se and should_reject; default true
+
+	bool m_match_only;
+	bool m_two_pass;
+
+	bool m_alpha_is_opacity; // default true
+
+	speed_mode m_speed_mode; // default cFastestSpeed
+
+	float m_max_smooth_std_dev; // default DEF_MAX_SMOOTH_STD_DEV
+	float m_smooth_max_mse_scale; // default DEF_SMOOTH_MAX_MSE_SCALE
+	float m_max_ultra_smooth_std_dev; // default DEF_MAX_ULTRA_SMOOTH_STD_DEV
+	float m_ultra_smooth_max_mse_scale; // default DEF_ULTRA_SMOOTH_MAX_MSE_SCALE
+
+	bool m_no_mse_scaling; // default false
+};
+
+static inline float square(float f) { // returns f multiplied by itself
+	const float squared = f * f;
+	return squared;
+}
+
+static inline uint32_t byteswap_32(uint32_t v) { // reverses the order of the four bytes of v
+	const uint32_t swapped_pairs = ((v & 0x00FF00FFu) << 8) | ((v >> 8) & 0x00FF00FFu);
+	return (swapped_pairs << 16) | (swapped_pairs >> 16);
+}
+
+class tracked_stat // running count, sum and sum of squares of a stream of unsigned samples
+{
+public:
+	tracked_stat() { clear(); }
+	// Resets the statistics to "no samples".
+	inline void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+	// Adds one sample. The square is formed in 64 bits so that samples above 65535 do not wrap.
+	inline void update(uint32_t val) { m_num++; m_total += val; m_total2 += (uint64_t)val * val; }
+
+	inline tracked_stat& operator += (uint32_t val) { update(val); return *this; }
+
+	inline uint32_t get_number_of_values() const { return m_num; }
+	inline uint64_t get_total() const { return m_total; }
+	inline uint64_t get_total2() const { return m_total2; }
+	// Mean, population standard deviation and variance; each is 0 when no sample was added.
+	inline float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }
+	inline float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
+	inline float get_variance() const { float s = get_std_dev(); return s * s; }
+
+private:
+	uint32_t m_num;
+	uint64_t m_total;
+	uint64_t m_total2;
+};
+
+struct Lab { float L; float a; float b; }; // Oklab color (L, a, b); result type of linear_srgb_to_oklab and the srgb_to_oklab lookups
+struct RGB { float r; float g; float b; }; // float RGB triple; input type of linear_srgb_to_oklab
+
+static inline Lab linear_srgb_to_oklab(RGB c)
+{
+	// Step 1: mix the RGB input into three intermediate responses.
+	const float resp_l = 0.4122214708f * c.r + 0.5363325363f * c.g + 0.0514459929f * c.b;
+	const float resp_m = 0.2119034982f * c.r + 0.6806995451f * c.g + 0.1073969566f * c.b;
+	const float resp_s = 0.0883024619f * c.r + 0.2817188376f * c.g + 0.6299787005f * c.b;
+	// Step 2: take the cube root of each response.
+	const float root_l = std::cbrtf(resp_l);
+	const float root_m = std::cbrtf(resp_m);
+	const float root_s = std::cbrtf(resp_s);
+	// Step 3: mix the roots into the L, a and b outputs.
+	Lab out;
+	out.L = 0.2104542553f * root_l + 0.7936177850f * root_m - 0.0040720468f * root_s;
+	out.a = 1.9779984951f * root_l - 2.4285922050f * root_m + 0.4505937099f * root_s;
+	out.b = 0.0259040371f * root_l + 0.7827717662f * root_m - 0.8086757660f * root_s;
+	return out;
+}
+
+static float g_srgb_to_linear[256];
+
+static float f_inv(float x)
+{
+	// Piecewise transfer curve used to fill g_srgb_to_linear: inputs at or below 0.04045
+	// are divided by 12.92, everything else follows the 2.4 power curve.
+	const bool on_linear_toe = (x <= 0.04045f);
+	return on_linear_toe ? (x / 12.92f) : powf(((x + 0.055f) / 1.055f), 2.4f);
+}
+
+static void init_srgb_to_linear()
+{
+	// Fill the 256-entry table with f_inv of every 8-bit code value normalized to [0, 1].
+	for (uint32_t code = 0; code != 256; ++code) g_srgb_to_linear[code] = f_inv((float)code / 255.0f);
+}
+
+#pragma pack(push, 1)
+struct Lab16 // element type of g_srgb_to_oklab16: one Oklab value quantized to 16 bits per component by init_oklab_table
+{
+	uint16_t m_L, m_a, m_b; // quantized L, a and b, each in 0..65535
+};
+#pragma pack(pop)
+
+std::vector<Lab16> g_srgb_to_oklab16;
+
+// Value ranges that init_oklab_table maps onto 0..65535 when it quantizes L, a and b.
+const float MIN_L = 0.000000f, MAX_L = 1.000000f;
+const float MIN_A = -0.233888f, MAX_A = 0.276217f;
+const float MIN_B = -0.311529f, MAX_B = 0.198570f;
+
+// Dequantization step and offset, derived from the ranges above so that srgb_to_oklab undoes the quantization with the same range.
+const float SCALE_L = 1.0f / 65535.0f;
+const float SCALE_A = (1.0f / 65535.0f) * (MAX_A - MIN_A), OFS_A = MIN_A;
+const float SCALE_B = (1.0f / 65535.0f) * (MAX_B - MIN_B), OFS_B = MIN_B;
+
+static inline Lab srgb_to_oklab(const color_rgba &c)
+{
+	// The table is addressed as r + g * 256 + b * 65536; c.a is not used.
+	const uint32_t table_index = c.r + c.g * 256 + c.b * 65536;
+	const Lab16 &entry = g_srgb_to_oklab16[table_index];
+	// Scale the 16-bit entries back; a and b additionally get their range offset.
+	const float lightness = entry.m_L * SCALE_L;
+	const float axis_a = entry.m_a * SCALE_A + OFS_A;
+	const float axis_b = entry.m_b * SCALE_B + OFS_B;
+	return { lightness, axis_a, axis_b };
+}
+
+static inline Lab srgb_to_oklab_norm(const color_rgba& c)
+{
+	// The table is addressed as r + g * 256 + b * 65536; c.a is not used.
+	const uint32_t table_index = c.r + c.g * 256 + c.b * 65536;
+	const Lab16& entry = g_srgb_to_oklab16[table_index];
+	// All three components use the same 1/65535 factor, so each result lies in [0, 1].
+	const float norm_L = entry.m_L * SCALE_L;
+	const float norm_a = entry.m_a * SCALE_L;
+	const float norm_b = entry.m_b * SCALE_L;
+	return { norm_L, norm_a, norm_b };
+}
+
+static void init_oklab_table(const char *pExec, bool quiet, bool caching_enabled)
+{
+	// Fills g_srgb_to_oklab16 with the quantized Oklab value of every 24-bit color.
+	// pExec, quiet and caching_enabled are accepted but never read by this function.
+	// The table is computed from g_srgb_to_linear, so that array has to be filled beforehand.
+	const uint32_t num_colors = 256 * 256 * 256;
+	g_srgb_to_oklab16.resize(num_colors);
+
+	for (uint32_t packed = 0; packed < num_colors; packed++)
+	{
+		// Entries are addressed as r + g * 256 + b * 65536.
+		const uint32_t r = packed & 255;
+		const uint32_t g = (packed >> 8) & 255;
+		const uint32_t b = packed >> 16;
+
+		const RGB linear = { g_srgb_to_linear[r], g_srgb_to_linear[g], g_srgb_to_linear[b] };
+		const Lab lab = linear_srgb_to_oklab(linear);
+
+		assert(lab.L >= MIN_L && lab.L <= MAX_L);
+		assert(lab.a >= MIN_A && lab.a <= MAX_A);
+		assert(lab.b >= MIN_B && lab.b <= MAX_B);
+
+		// Normalize each component to its range, scale to 16 bits, then round and clamp.
+		const float qL = clamp(std::round(((lab.L - MIN_L) / (MAX_L - MIN_L)) * 65535.0f), 0.0f, 65535.0f);
+		const float qa = clamp(std::round(((lab.a - MIN_A) / (MAX_A - MIN_A)) * 65535.0f), 0.0f, 65535.0f);
+		const float qb = clamp(std::round(((lab.b - MIN_B) / (MAX_B - MIN_B)) * 65535.0f), 0.0f, 65535.0f);
+
+		Lab16& entry = g_srgb_to_oklab16[packed];
+		entry.m_L = (uint16_t)qL;
+		entry.m_a = (uint16_t)qa;
+		entry.m_b = (uint16_t)qb;
+	}
+}
+
+static inline float compute_se(const color_rgba& a, const color_rgba& orig, uint32_t num_comps, const rdo_png_params &params)
+{
+	// Squared error of candidate color a measured against the original pixel orig.
+	// num_comps == 4 adds the alpha channel to the error; otherwise alpha is ignored.
+	// params selects the metric: Oklab-based (m_perceptual_error), per-channel weighted
+	// (m_use_chan_weights) or a plain sum of squared channel differences.
+	const bool with_alpha = (num_comps == 4);
+
+	if (params.m_perceptual_error)
+	{
+		const Lab cand = srgb_to_oklab_norm(a);
+		const Lab ref = srgb_to_oklab_norm(orig);
+
+		const float diff_L = cand.L - ref.L;
+		const float diff_a = cand.a - ref.a;
+		const float diff_b = cand.b - ref.b;
+
+		// Square each axis difference, then apply that axis' Lab weight.
+		float term_L = diff_L * diff_L;
+		float term_a = diff_a * diff_a;
+		float term_b = diff_b * diff_b;
+
+		term_L *= params.m_chan_weights_lab[0];
+		term_a *= params.m_chan_weights_lab[1];
+		term_b *= params.m_chan_weights_lab[2];
+
+		float perceptual = term_L + term_a + term_b;
+
+		// TODO: Scales the error to bring it into a range where lambda will be roughly comparable to plain MSE.
+		const float NORM_ERROR_SCALE = 350000.0f;
+		perceptual *= NORM_ERROR_SCALE;
+
+		// Alpha is not part of the Lab lookup; its squared 8-bit difference is added after the scaling.
+		if (with_alpha)
+		{
+			const int alpha_gap = (int)a[3] - (int)orig[3];
+			perceptual += params.m_chan_weights_lab[3] * square((float)alpha_gap);
+		}
+
+		return perceptual;
+	}
+
+	// Integer metrics: sum of squared 8-bit channel differences. Without m_use_chan_weights
+	// every weight is 1, which yields the plain unweighted sum.
+	const bool weighted = params.m_use_chan_weights;
+	const uint32_t weight_r = weighted ? params.m_chan_weights[0] : 1;
+	const uint32_t weight_g = weighted ? params.m_chan_weights[1] : 1;
+	const uint32_t weight_b = weighted ? params.m_chan_weights[2] : 1;
+	const uint32_t weight_a = weighted ? params.m_chan_weights[3] : 1;
+
+	const int gap_r = (int)a[0] - (int)orig[0];
+	const int gap_g = (int)a[1] - (int)orig[1];
+	const int gap_b = (int)a[2] - (int)orig[2];
+
+	uint32_t total = weight_r * (uint32_t)(gap_r * gap_r);
+	total += weight_g * (uint32_t)(gap_g * gap_g);
+	total += weight_b * (uint32_t)(gap_b * gap_b);
+
+	if (with_alpha)
+	{
+		const int alpha_gap = (int)a[3] - (int)orig[3];
+		total += weight_a * (uint32_t)(alpha_gap * alpha_gap);
+	}
+
+	return (float)total;
+}
+
+static inline bool should_reject(const color_rgba& trial_color, const color_rgba& orig_color, uint32_t num_comps, const rdo_png_params& params)
+{
+	// Returns true when trial_color deviates too much from orig_color to be used in its place.
+	// Alpha is only examined when num_comps == 4. Every individual test can only turn the
+	// answer into "reject", so the order in which the tests run does not affect the result.
+	const bool check_alpha = (num_comps == 4);
+
+	// Optional guard: an original with alpha 0 must keep alpha 0, one with alpha 255 must keep 255.
+	if (params.m_transparent_reject_test && check_alpha)
+	{
+		const bool breaks_transparent = (orig_color[3] == 0) && (trial_color[3] > 0);
+		const bool breaks_opaque = (orig_color[3] == 255) && (trial_color[3] < 255);
+
+		if (breaks_transparent || breaks_opaque)
+			return true;
+	}
+
+	if (!params.m_use_reject_thresholds)
+		return false;
+
+	// The alpha limit is the same for the perceptual and the plain RGB test.
+	if (check_alpha)
+	{
+		const uint32_t alpha_gap = abs((int)trial_color[3] - (int)orig_color[3]);
+
+		if (alpha_gap > params.m_reject_thresholds[3])
+			return true;
+	}
+
+	if (params.m_perceptual_error)
+	{
+		// Compare in the normalized Oklab lookup space.
+		const Lab trial_lab(srgb_to_oklab_norm(trial_color));
+		const Lab orig_lab(srgb_to_oklab_norm(orig_color));
+
+		// L is limited on its own.
+		const float lightness_gap = fabs(trial_lab.L - orig_lab.L);
+
+		if (lightness_gap > params.m_reject_thresholds_lab[0])
+			return true;
+
+		// a and b are limited jointly through their squared Euclidean distance.
+		const float ab_limit = params.m_reject_thresholds_lab[1];
+		const float ab_dist = square(trial_lab.a - orig_lab.a) + square(trial_lab.b - orig_lab.b);
+
+		return ab_dist > (ab_limit * ab_limit);
+	}
+
+	// Plain mode: each of R, G and B has its own absolute-difference limit.
+	for (uint32_t ch = 0; ch < 3; ch++)
+	{
+		const uint32_t channel_gap = abs((int)trial_color[ch] - (int)orig_color[ch]);
+
+		if (channel_gap > params.m_reject_thresholds[ch])
+			return true;
+	}
+
+	return false;
+}
+
+struct smooth_desc { // tuning values read by create_smooth_maps
+	bool alpha_is_opacity {true}; // when set and the image has alpha, a 7x7 alpha std-dev window also feeds the scale
+	float max_smooth_std_dev {DEF_MAX_SMOOTH_STD_DEV}; // divisor for the 3x3 (and 7x7 alpha) std dev before clamping to [0, 1]
+	float smooth_max_mse_scale {QOI_DEF_SMOOTH_MAX_MSE_SCALE}; // first endpoint of the 3x3 smoothness lerp (other endpoint is 1.0)
+	float max_ultra_smooth_std_dev {DEF_MAX_ULTRA_SMOOTH_STD_DEV}; // divisor for the 10x10 window std dev before clamping to [0, 1]
+	float ultra_smooth_max_mse_scale {QOI_DEF_ULTRA_SMOOTH_MAX_MSE_SCALE}; // first endpoint of the ultra-smooth lerp
+};
+
+static void create_smooth_maps( // writes one MSE scale factor per pixel of orig_img into smooth_block_mse_scales
+	vector2D<float> &smooth_block_mse_scales,
+	const image& orig_img,
+	const smooth_desc& desc
+) {
+	const uint32_t width = orig_img.get_width();
+	const uint32_t height = orig_img.get_height();
+	const uint32_t total_pixels = orig_img.get_total_pixels(); // not used below
+	const bool has_alpha = orig_img.has_alpha();
+	const uint32_t num_comps = has_alpha ? 4 : 3;
+
+#if 0
+	if (params.m_no_mse_scaling)
+	{
+		smooth_block_mse_scales.set_all(1.0f);
+		return;
+	}
+#endif
+
+	image smooth_vis(width, height); // the three *_vis images are only written; saving them is disabled in the #if 0 block at the end
+	image alpha_edge_vis(width, height);
+	image ultra_smooth_vis(width, height);
+
+	for (uint32_t y = 0; y < height; y++)
+	{
+		for (uint32_t x = 0; x < width; x++)
+		{
+			float alpha_edge_yl = 0.0f; // stays 0 unless the image has alpha and alpha_is_opacity is set
+			if ((num_comps == 4) && (desc.alpha_is_opacity))
+			{
+				tracked_stat alpha_comp_stats; // alpha statistics over the 7x7 window around (x, y)
+				for (int yd = -3; yd <= 3; yd++)
+				{
+					for (int xd = -3; xd <= 3; xd++)
+					{
+						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
+						alpha_comp_stats.update(p[3]);
+					}
+				}
+
+				float max_std_dev = alpha_comp_stats.get_std_dev();
+
+				float yl = clamp(max_std_dev / desc.max_smooth_std_dev, 0.0f, 1.0f);
+				alpha_edge_yl = yl * yl;
+			}
+
+			{
+				tracked_stat comp_stats[4]; // per-channel statistics over the 3x3 window around (x, y)
+				for (int yd = -1; yd <= 1; yd++)
+				{
+					for (int xd = -1; xd <= 1; xd++)
+					{
+						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
+						comp_stats[0].update(p[0]);
+						comp_stats[1].update(p[1]);
+						comp_stats[2].update(p[2]);
+						if (num_comps == 4)
+							comp_stats[3].update(p[3]);
+					}
+				}
+
+				float max_std_dev = 0.0f; // largest per-channel standard deviation in the window
+				for (uint32_t i = 0; i < num_comps; i++)
+					max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev());
+
+				float yl = clamp(max_std_dev / desc.max_smooth_std_dev, 0.0f, 1.0f);
+				yl = yl * yl;
+
+				smooth_block_mse_scales(x, y) = lerp(desc.smooth_max_mse_scale, 1.0f, yl); // presumably lerp(a, b, t) gives a at t = 0 and b at t = 1 - confirm
+
+				if (num_comps == 4)
+				{
+					alpha_edge_vis(x, y).set((int)std::round(alpha_edge_yl * 255.0f));
+
+					smooth_block_mse_scales(x, y) = lerp(smooth_block_mse_scales(x, y), desc.smooth_max_mse_scale, alpha_edge_yl); // blend back toward smooth_max_mse_scale by the alpha-edge term
+				}
+
+				smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (desc.smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));
+			}
+
+			{
+				tracked_stat comp_stats[4]; // per-channel statistics over the larger "ultra smooth" window
+
+				const int S = 5;
+				for (int yd = -S; yd < S; yd++) // NOTE(review): offsets run -5..4, so the 10x10 window is not centred on (x, y)
+				{
+					for (int xd = -S; xd < S; xd++)
+					{
+						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
+						comp_stats[0].update(p[0]);
+						comp_stats[1].update(p[1]);
+						comp_stats[2].update(p[2]);
+						if (num_comps == 4)
+							comp_stats[3].update(p[3]);
+					}
+				}
+
+				float max_std_dev = 0.0f;
+				for (uint32_t i = 0; i < num_comps; i++)
+					max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev());
+
+				float yl = clamp(max_std_dev / desc.max_ultra_smooth_std_dev, 0.0f, 1.0f);
+				yl = powf(yl, 3.0f);
+
+				smooth_block_mse_scales(x, y) = lerp(desc.ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl); // final value: blend between the ultra-smooth scale and the scale computed above
+
+				ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
+			}
+
+		}
+	}
+
+#if 0
+	if (params.m_debug_images)
+	{
+		save_png("dbg_smooth_vis.png", smooth_vis);
+		save_png("dbg_alpha_edge_vis.png", alpha_edge_vis);
+		save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);
+	}
+#endif
+}
+
+#pragma pack(push, 1)
+struct qoi_header // packed to 14 bytes; encode_rdo_qoi memcpy's it to the start of the output
+{
+	char magic[4]; // magic bytes "qoif"
+	uint32_t width; // image width in pixels (BE); NOTE(review): encode_rdo_qoi always applies byteswap_32, which yields BE only on a little-endian host
+	uint32_t height; // image height in pixels (BE); filled the same way as width
+	uint8_t channels; // 3 = RGB, 4 = RGBA
+	uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear
+};
+#pragma pack(pop)
+
+static bool encode_rdo_qoi(
+	const image& orig_img,
+	std::vector<uint8_t>& data,
+	//const rdo_png_params& params,
+	const vector2D<float>& smooth_block_mse_scales,
+	float lambda)
+{
+	// This function wasn't designed to deal with lambda=0, so nudge it up.
+	lambda = max(lambda, .0000125f);
+
+	const rdo_png_params params{};
+
+	const bool has_alpha = orig_img.has_alpha();
+	uint32_t num_comps = has_alpha ? 4 : 3;
+
+	color_rgba hash[64];
+	//clear_obj(hash);
+	memset(&hash, 0, sizeof(hash));
+
+	data.resize(0);
+
+	qoi_header hdr;
+	memcpy(hdr.magic, "qoif", 4);
+	hdr.width = byteswap_32(orig_img.get_width());
+	hdr.height = byteswap_32(orig_img.get_height());
+	hdr.channels = has_alpha ? 4 : 3;
+	hdr.colorspace = 0;
+	data.resize(sizeof(hdr));
+	memcpy(data.data(), &hdr, sizeof(hdr));
+
+	int prev_r = 0, prev_g = 0, prev_b = 0, prev_a = 255;
+	uint32_t cur_run_len = 0;
+
+	enum commands_t
+	{
+		cRUN,
+		cIDX,
+		cDELTA,
+		cLUMA,
+		cRGB,
+		cRGBA,
+	};
+
+	uint32_t total_run = 0, total_rgb = 0, total_rgba = 0, total_index = 0, total_delta = 0, total_luma = 0, total_run_pixels = 0;
+
+	for (uint32_t y = 0; y < orig_img.get_height(); y++)
+	{
+		for (uint32_t x = 0; x < orig_img.get_width(); x++)
+		{
+			const color_rgba& c = orig_img(x, y);
+			const float mse_scale = smooth_block_mse_scales(x, y);
+
+			float best_mse = 0.0f;
+			float best_bits = 40.0f;
+			float best_t = best_mse + best_bits * lambda;
+			int best_command = cRGBA;
+			int best_index = 0, best_dr = 0, best_dg = 0, best_db = 0;
+
+			{
+				color_rgba trial_c(c.r, c.g, c.b, prev_a);
+				if (!should_reject(trial_c, c, 4, params))
+				{
+					float mse = compute_se(trial_c, c, 4, params);
+					float bits = 32.0f;
+					float trial_t = mse_scale * mse + bits * lambda;
+					if (trial_t < best_t)
+					{
+						best_mse = mse;
+						best_bits = bits;
+						best_t = trial_t;
+						best_command = cRGB;
+					}
+				}
+			}
+
+			{
+				color_rgba trial_c(prev_r, prev_g, prev_b, prev_a);
+				if (!should_reject(trial_c, c, 4, params))
+				{
+					float mse = compute_se(trial_c, c, 4, params);
+					float bits = cur_run_len ? 0 : 8.0f;
+					float trial_t = mse_scale * mse + bits * lambda;
+					if (trial_t < best_t)
+					{
+						best_mse = mse;
+						best_bits = bits;
+						best_t = trial_t;
+						best_command = cRUN;
+
+						if (best_mse == 0.0f)
+						{
+							cur_run_len++;
+							if (cur_run_len == 62)
+							{
+								total_run_pixels += cur_run_len;
+
+								data.push_back(0xC0 | (cur_run_len - 1));
+								cur_run_len = 0;
+
+								total_run++;
+							}
+
+							hash[(prev_r * 3 + prev_g * 5 + prev_b * 7 + prev_a * 11) & 63].set(prev_r, prev_g, prev_b, prev_a);
+
+							continue;
+						}
+					}
+				}
+			}
+
+			if (8.0f * lambda < best_t)
+			{
+				uint32_t hash_idx = (c.r * 3 + c.g * 5 + c.b * 7 + c.a * 11) & 63;
+
+				// First try the INDEX command losslessly.
+				if (c == hash[hash_idx])
+				{
+					float bits = 8.0f;
+					float trial_t = bits * lambda;
+
+					assert(trial_t < best_t);
+
+					best_mse = 0.0f;
+					best_bits = bits;
+					best_t = trial_t;
+					best_command = cIDX;
+					best_index = hash_idx;
+				}
+				else
+				{
+					// Try a lossy INDEX command.
+					for (uint32_t i = 0; i < 64; i++)
+					{
+						if (!should_reject(hash[i], c, 4, params))
+						{
+							float mse = compute_se(hash[i], c, 4, params);
+							float bits = 8.0f;
+							float trial_t = mse_scale * mse + bits * lambda;
+							if (trial_t < best_t)
+							{
+								best_mse = mse;
+								best_bits = bits;
+								best_t = trial_t;
+								best_command = cIDX;
+								best_index = i;
+							}
+						}
+					}
+				}
+			}
+
+			if (8.0f * lambda < best_t)
+			{
+				bool delta_encodable_losslessly = false;
+
+				// First try the DELTA command losslessly.
+				if (c.a == prev_a)
+				{
+					int dr = ((int)c.r - prev_r + 2) & 255;
+					int dg = ((int)c.g - prev_g + 2) & 255;
+					int db = ((int)c.b - prev_b + 2) & 255;
+
+					if ((dr <= 3) && (dg <= 3) && (db <= 3))
+					{
+						delta_encodable_losslessly = true;
+
+						float bits = 8.0f;
+						float trial_t = bits * lambda;
+
+						assert(trial_t < best_t);
+
+						best_mse = 0.0f;
+						best_bits = bits;
+						best_t = trial_t;
+						best_command = cDELTA;
+						best_dr = dr - 2;
+						best_dg = dg - 2;
+						best_db = db - 2;
+					}
+				}
+
+				// Try a lossy DELTA command.
+				if (!delta_encodable_losslessly)
+				{
+					for (uint32_t i = 0; i < 64; i++)
+					{
+						int dr = ((i >> 4) & 3) - 2;
+						int dg = ((i >> 2) & 3) - 2;
+						int db = (i & 3) - 2;
+
+						color_rgba trial_c((prev_r + dr) & 255, (prev_g + dg) & 255, (prev_b + db) & 255, prev_a);
+
+						if (!should_reject(trial_c, c, 4, params))
+						{
+							float mse = compute_se(trial_c, c, 4, params);
+							float bits = 8.0f;
+							float trial_t = mse_scale * mse + bits * lambda;
+
+							if (trial_t < best_t)
+							{
+								best_mse = mse;
+								best_bits = bits;
+								best_t = trial_t;
+								best_command = cDELTA;
+								best_dr = dr;
+								best_dg = dg;
+								best_db = db;
+							}
+						}
+					}
+				}
+			}
+
+			if (16.0f * lambda < best_t)
+			{
+				bool luma_encodable_losslessly_in_rgb = false;
+
+				// First try the LUMA command losslessly in RGB (may not be lossy in alpha).
+				{
+					int g_diff = (int)c.g - prev_g;
+
+					int dg = (g_diff + 32) & 255;
+
+					int dr = (((int)c.r - prev_r) - g_diff + 8) & 255;
+					int db = (((int)c.b - prev_b) - g_diff + 8) & 255;
+
+					if ((dg <= 63) && (dr <= 15) && (db <= 15))
+					{
+						luma_encodable_losslessly_in_rgb = true;
+
+						color_rgba trial_c(c.r, c.g, c.b, prev_a);
+
+						if (!should_reject(trial_c, c, 4, params))
+						{
+							float mse = compute_se(trial_c, c, 4, params);
+							float bits = 16.0f;
+							float trial_t = mse_scale * mse + bits * lambda;
+
+							if (trial_t < best_t)
+							{
+								best_mse = mse;
+								best_bits = bits;
+								best_t = trial_t;
+								best_command = cLUMA;
+								best_dr = dr - 8;
+								best_dg = dg - 32;
+								best_db = db - 8;
+							}
+						}
+					}
+				}
+
+				// If we can't use it losslessly, try it lossy.
+				if ((!luma_encodable_losslessly_in_rgb) && (params.m_speed_mode != cFastestSpeed))
+				{
+					if (params.m_speed_mode == cNormalSpeed)
+					{
+						// Search all encodable LUMA commands.
+						for (uint32_t i = 0; i < 16384; i++)
+						{
+							int dr = ((i >> 6) & 15) - 8;
+							int dg = (i & 63) - 32;
+							int db = ((i >> 10) & 15) - 8;
+
+							color_rgba trial_c((prev_r + dg + dr) & 255, (prev_g + dg) & 255, (prev_b + dg + db) & 255, prev_a);
+
+							if (!should_reject(trial_c, c, 4, params))
+							{
+								float mse = compute_se(trial_c, c, 4, params);
+								float bits = 16.0f;
+								float trial_t = mse_scale * mse + bits * lambda;
+
+								if (trial_t < best_t)
+								{
+									best_mse = mse;
+									best_bits = bits;
+									best_t = trial_t;
+									best_command = cLUMA;
+									best_dr = dr;
+									best_dg = dg;
+									best_db = db;
+								}
+							}
+						}
+					}
+					else
+					{
+						// TODO: This isn't very smart. What if the G delta is encodable but R and/or B aren't?
+						const int g_deltas[] = { -24, -16, -14, -12, -10, -8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 24 };
+						const int TOTAL_G_DELTAS = sizeof(g_deltas) / sizeof(g_deltas[0]);
+
+						for (int kg = 0; kg < TOTAL_G_DELTAS; kg++)
+						{
+							const int dg = g_deltas[kg];
+							for (uint32_t i = 0; i < 256; i++)
+							{
+								int dr = (i & 15) - 8;
+								int db = ((i >> 4) & 15) - 8;
+
+								color_rgba trial_c((prev_r + dg + dr) & 255, (prev_g + dg) & 255, (prev_b + dg + db) & 255, prev_a);
+
+								if (!should_reject(trial_c, c, 4, params))
+								{
+									float mse = compute_se(trial_c, c, 4, params);
+									float bits = 16.0f;
+									float trial_t = mse_scale * mse + bits * lambda;
+
+									if (trial_t < best_t)
+									{
+										best_mse = mse;
+										best_bits = bits;
+										best_t = trial_t;
+										best_command = cLUMA;
+										best_dr = dr;
+										best_dg = dg;
+										best_db = db;
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+
+			switch (best_command)
+			{
+			case cRUN:
+			{
+				cur_run_len++;
+				if (cur_run_len == 62)
+				{
+					total_run_pixels += cur_run_len;
+
+					data.push_back(0xC0 | (cur_run_len - 1));
+					cur_run_len = 0;
+
+					total_run++;
+				}
+
+				hash[(prev_r * 3 + prev_g * 5 + prev_b * 7 + prev_a * 11) & 63].set(prev_r, prev_g, prev_b, prev_a);
+
+				break;
+			}
+			case cRGB:
+			{
+				if (cur_run_len)
+				{
+					total_run_pixels += cur_run_len;
+
+					data.push_back(0xC0 | (cur_run_len - 1));
+					cur_run_len = 0;
+
+					total_run++;
+				}
+
+				data.push_back(254);
+				data.push_back((uint8_t)c.r);
+				data.push_back((uint8_t)c.g);
+				data.push_back((uint8_t)c.b);
+				hash[(c.r * 3 + c.g * 5 + c.b * 7 + prev_a * 11) & 63].set(c.r, c.g, c.b, prev_a);
+				prev_r = c.r;
+				prev_g = c.g;
+				prev_b = c.b;
+
+				total_rgb++;
+
+				break;
+			}
+			case cRGBA:
+			{
+				if (cur_run_len)
+				{
+					total_run_pixels += cur_run_len;
+
+					data.push_back(0xC0 | (cur_run_len - 1));
+					cur_run_len = 0;
+
+					total_run++;
+				}
+
+				data.push_back(255);
+				data.push_back((uint8_t)c.r);
+				data.push_back((uint8_t)c.g);
+				data.push_back((uint8_t)c.b);
+				data.push_back((uint8_t)c.a);
+				hash[(c.r * 3 + c.g * 5 + c.b * 7 + c.a * 11) & 63] = c;
+				prev_r = c.r;
+				prev_g = c.g;
+				prev_b = c.b;
+				prev_a = c.a;
+
+				total_rgba++;
+
+				break;
+			}
+			case cIDX:
+			{
+				if (cur_run_len)
+				{
+					total_run_pixels += cur_run_len;
+
+					data.push_back(0xC0 | (cur_run_len - 1));
+					cur_run_len = 0;
+
+					total_run++;
+				}
+
+				data.push_back(best_index);
+
+				prev_r = hash[best_index].r;
+				prev_g = hash[best_index].g;
+				prev_b = hash[best_index].b;
+				prev_a = hash[best_index].a;
+
+				total_index++;
+
+				break;
+			}
+			case cDELTA:
+			{
+				if (cur_run_len)
+				{
+					total_run_pixels += cur_run_len;
+
+					data.push_back(0xC0 | (cur_run_len - 1));
+					cur_run_len = 0;
+
+					total_run++;
+				}
+
+				assert(best_dr >= -2 && best_dr <= 1);
+				assert(best_dg >= -2 && best_dg <= 1);
+				assert(best_db >= -2 && best_db <= 1);
+
+				data.push_back(64 + ((best_dr + 2) << 4) + ((best_dg + 2) << 2) + (best_db + 2));
+
+				uint32_t decoded_r = (prev_r + best_dr) & 0xFF;
+				uint32_t decoded_g = (prev_g + best_dg) & 0xFF;
+				uint32_t decoded_b = (prev_b + best_db) & 0xFF;
+				uint32_t decoded_a = prev_a;
+
+				hash[(decoded_r * 3 + decoded_g * 5 + decoded_b * 7 + decoded_a * 11) & 63].set(decoded_r, decoded_g, decoded_b, decoded_a);
+
+				prev_r = decoded_r;
+				prev_g = decoded_g;
+				prev_b = decoded_b;
+				prev_a = decoded_a;
+
+				total_delta++;
+
+				break;
+			}
+			case cLUMA:
+			{
+				if (cur_run_len)
+				{
+					total_run_pixels += cur_run_len;
+
+					data.push_back(0xC0 | (cur_run_len - 1));
+					cur_run_len = 0;
+
+					total_run++;
+				}
+
+				assert(best_dr >= -8 && best_dr <= 7);
+				assert(best_dg >= -32 && best_dg <= 31);
+				assert(best_db >= -8 && best_db <= 7);
+
+				data.push_back((uint8_t)(128 + (best_dg + 32)));
+				data.push_back((uint8_t)(((best_dr + 8) << 4) | (best_db + 8)));
+
+				uint32_t decoded_r = (prev_r + best_dr + best_dg) & 0xFF;
+				uint32_t decoded_g = (prev_g + best_dg) & 0xFF;
+				uint32_t decoded_b = (prev_b + best_db + best_dg) & 0xFF;
+				uint32_t decoded_a = prev_a;
+
+				hash[(decoded_r * 3 + decoded_g * 5 + decoded_b * 7 + decoded_a * 11) & 63].set(decoded_r, decoded_g, decoded_b, decoded_a);
+
+				prev_r = decoded_r;
+				prev_g = decoded_g;
+				prev_b = decoded_b;
+				prev_a = decoded_a;
+
+				total_luma++;
+
+				break;
+			}
+			default:
+			{
+				assert(0);
+				break;
+			}
+			}
+
+		}
+	}
+
+	if (cur_run_len)
+	{
+		total_run_pixels += cur_run_len;
+
+		data.push_back((64 + 128) | (cur_run_len - 1));
+		cur_run_len = 0;
+
+		total_run++;
+	}
+
+	// end padding
+	for (uint32_t i = 0; i < 7; i++) {
+		data.push_back(0);
+	}
+	data.push_back(1);
+
+	if (params.m_print_stats)
+	{
+		printf("Totals: Run: %u, Run Pixels: %u %3.2f%%, RGB: %u %3.2f%%, RGBA: %u %3.2f%%, INDEX: %u %3.2f%%, DELTA: %u %3.2f%%, LUMA: %u %3.2f%%\n\n",
+			total_run,
+			total_run_pixels, (total_run_pixels * 100.0f) / orig_img.get_total_pixels(),
+			total_rgb, (total_rgb * 100.0f) / orig_img.get_total_pixels(),
+			total_rgba, (total_rgba * 100.0f) / orig_img.get_total_pixels(),
+			total_index, (total_index * 100.0f) / orig_img.get_total_pixels(),
+			total_delta, (total_delta * 100.0f) / orig_img.get_total_pixels(),
+			total_luma, (total_luma * 100.0f) / orig_img.get_total_pixels());
+	}
+
+	return true;
+}
+
+static bool g_init {false}; // set by init_qoi_rdo(); encode_qoi_rdo_simple() returns an empty vector while this is false
+
+bool init_qoi_rdo(void) { // one-time setup; returns false (and does nothing) when already initialised
+	const bool first_call = !g_init;
+	if (first_call) {
+		init_srgb_to_linear();
+		init_oklab_table("", true, false);
+		g_init = true;
+	}
+	return first_call;
+}
+
+bool quit_qoi_rdo(void) { // tears the library down; returns false when it is not initialised
+	if (!g_init) {
+		return false;
+	}
+	g_srgb_to_oklab16.clear();
+	g_init = false; // without this, encoding after quit would run on the emptied table and a later init_qoi_rdo() would be refused
+	return true;
+}
+
+static float lambda_from_quality(int quality) { // turns the user-facing quality into the lambda given to encode_rdo_qoi()
+	const int q = clamp(quality, 1, 100); // out-of-range values are pulled into 1..100
+	const float weight = cbrtf(q / 100.f); // cube root of quality/100 is the interpolation weight
+	// TODO: more stuff and log scale (the alternative curves below are kept for reference)
+	//return lerp(50000, 100, quality/100.f);
+	//return lerp(250'000, 0, sqrtf(quality/100.f));
+	//return lerp(1'000'000, 0, sqrtf(quality/100.f));
+	//return lerp(1'000'000, 0, clamp(log10f(quality/100.f)+1, 0.f, 1.f));
+	//return lerp(250'000, 0, clamp(log10f(quality/100.f)+1, 0.f, 1.f));
+	return lerp(250'000, 0, weight);
+}
+
+std::vector<uint8_t> encode_qoi_rdo_simple(const uint8_t* data, const qoi_rdo_desc& desc, int quality) {
+	// Encodes the pixel buffer `data` described by `desc`; `quality` is clamped to 1..100 by lambda_from_quality().
+	// Returns the bytes produced by encode_rdo_qoi(), or an empty vector if uninitialised, given null/zero-sized input, or on encoder failure.
+	if (!g_init) {
+		return {}; // init_qoi_rdo() has not been called
+	}
+	if (data == nullptr || desc.width == 0 || desc.height == 0) {
+		return {}; // nothing to encode; keeps a null pointer and empty dimensions away from the image constructor
+	}
+	const float lambda = lambda_from_quality(quality);
+	vector2D<float> smooth_block_mse_scales(desc.width, desc.height);
+	image orig_img(data, desc.width, desc.height, desc.channels);
+
+	if (false /* m_no_mse_scaling */) { // placeholder for a future option: this branch never runs today
+		smooth_block_mse_scales.set_all(1.0f);
+	} else {
+		create_smooth_maps(
+			smooth_block_mse_scales,
+			orig_img,
+			{} // smooth_desc
+		);
+	}
+
+	std::vector<uint8_t> output_data;
+	if (!encode_rdo_qoi(
+		orig_img,
+		output_data,
+		smooth_block_mse_scales,
+		lambda))
+	{
+		return {};
+	}
+	return output_data;
+}
+
diff --git a/qoirdo.hpp b/qoirdo.hpp
new file mode 100644
index 0000000..5b4c64d
--- /dev/null
+++ b/qoirdo.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+// qoirdo.hpp
+// Copyright (C) 2022 Richard Geldreich, Jr. All Rights Reserved.
+// Copyright (C) 2025 Erik Scholz
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <vector>
+
+bool init_qoi_rdo(void); // must be called before encoding; returns false when the library is already initialised
+bool quit_qoi_rdo(void); // clears the library's global lookup table; returns false if init_qoi_rdo() was never called
+
+struct qoi_rdo_desc { // describes the raw pixel buffer handed to encode_qoi_rdo_simple()
+	unsigned int width; // image width; the bundled tool passes qoi_desc::width
+	unsigned int height; // image height; the bundled tool passes qoi_desc::height
+	unsigned char channels; // channel count of the input buffer (the bundled tool always passes 4)
+	unsigned char colorspace; // NOTE(review): the tool forwards qoi_desc::colorspace; not read by encode_qoi_rdo_simple() itself - confirm where it is used
+};
+
+// quality 1-100 (values outside that range are clamped); returns an empty vector on failure or when init_qoi_rdo() was not called
+std::vector<uint8_t> encode_qoi_rdo_simple(const uint8_t* data, const qoi_rdo_desc& desc, int quality);
+
+// TODO: fine-tunable variant
+//uint8_t* encode_qoi_rdo_advanced(const uint8_t* data, const qoi_rdo_desc* desc, int* out_len);
diff --git a/tool.cpp b/tool.cpp
new file mode 100644
index 0000000..815de3f
--- /dev/null
+++ b/tool.cpp
@@ -0,0 +1,139 @@
+#include "./qoirdo.hpp"
+
+#define QOI_IMPLEMENTATION
+#include "./qoi.h"
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <ios>
+#include <vector>
+#include <cstdlib>
+
+// Writes the one-line usage text to stdout, with `exe` (normally argv[0]) printed
+// in front of the option summary.
+void print_help(const char* exe) { std::cout << exe << " [-q 1-100] <path_to.qoi>\n"; }
+
+// Reads a qoi image, re-encodes it lossily with rdo and writes "<name>.rdo.qoi" next to the input. Returns 0 on success, -1 on bad arguments, -2 on file I/O errors, -3 on decode/encode failure.
+int main(int argc, const char** argv) {
+	if (argc < 2) {
+		std::cerr << "error: at least one paramenter required.\n";
+		std::cout << "help:\n";
+		print_help(argv[0]);
+		return -1;
+	}
+
+	std::filesystem::path input_qoi;
+	int quality = 80; // default when -q is absent
+
+	if (argv[1] == std::string_view{"-q"}) {
+		if (argc < 4) { // -q must be followed by a value and the input path
+			std::cerr << "error: more parameters required\n";
+			std::cout << "help:\n";
+			print_help(argv[0]);
+			return -1;
+		}
+
+		quality = std::atoi(argv[2]);
+		if (quality < 1 || quality > 100) { // also rejects non-numeric input, for which atoi yields 0
+			std::cerr << "error: invalid quality\n";
+			std::cout << "help:\n";
+			print_help(argv[0]);
+			return -1;
+		}
+
+		input_qoi = argv[3];
+	} else {
+		input_qoi = argv[1];
+	}
+
+	// "x.qoi" becomes "x.rdo.qoi"; any other name gets ".rdo.qoi" appended.
+	std::filesystem::path output_qoi;
+	if (input_qoi.extension() == ".qoi" || input_qoi.extension() == ".QOI") {
+		output_qoi = input_qoi;
+		output_qoi.replace_extension("rdo.qoi");
+	} else {
+		output_qoi = input_qoi;
+		output_qoi.replace_filename(input_qoi.filename().generic_u8string() + std::string{".rdo.qoi"});
+	}
+
+	std::cout << "input_qoi: " << input_qoi.generic_u8string() << "\n";
+	std::cout << "output_qoi: " << output_qoi.generic_u8string() << "\n";
+	std::cout << "quality: " << quality << "\n";
+
+	std::vector<uint8_t> input_encoded_data;
+	size_t input_file_size {0};
+	{ // read file
+		std::ifstream ifile{input_qoi, std::ios::in | std::ios::binary};
+		if (!ifile.is_open()) {
+			std::cerr << "failed to open file " << input_qoi << "\n";
+			return -2;
+		}
+		ifile.seekg(0, std::ios_base::end);
+		const auto size = ifile.tellg();
+		if (size <= 0) {
+			std::cerr << "failed to open file " << input_qoi << ", file too small\n";
+			return -2;
+		}
+		ifile.seekg(0, std::ios_base::beg);
+		input_encoded_data.resize(size);
+		// A failed or short read leaves part of the buffer zero-filled, so stop instead of decoding it.
+		if (!ifile.read(reinterpret_cast<char*>(input_encoded_data.data()), input_encoded_data.size())) {
+			std::cerr << "failed to read file " << input_qoi << "\n";
+			return -2;
+		}
+		input_file_size = size;
+	}
+
+	// decode
+	qoi_desc input_desc{};
+	uint8_t* raw_image = static_cast<uint8_t*>(qoi_decode(input_encoded_data.data(), input_encoded_data.size(), &input_desc, 4));
+	if (raw_image == nullptr) {
+		std::cerr << "failed to decode input\n";
+		return -3;
+	}
+	if (input_desc.width == 0 || input_desc.height == 0) {
+		free(raw_image);
+		std::cerr << "funny trying to decode input\n";
+		return -3;
+	}
+
+	init_qoi_rdo();
+
+	// encode with rdo
+	qoi_rdo_desc desc{
+		input_desc.width,
+		input_desc.height,
+		/*input_desc.channels*/ 4, // qoi_decode() above was asked for 4 channels, so this is not input_desc.channels
+		input_desc.colorspace,
+	};
+	std::vector<uint8_t> encoded_data = encode_qoi_rdo_simple(raw_image, desc,quality);
+	free(raw_image);
+	quit_qoi_rdo();
+
+	if (encoded_data.empty()) {
+		std::cerr << "failed to encode image\n";
+		return -3;
+	}
+	if (encoded_data.size() < 4) {
+		std::cout << "warn: encoded image suspiciously small\n";
+	}
+
+	{ // write out
+		std::ofstream ofile{output_qoi, std::ios::out | std::ios::binary | std::ios::trunc};
+		if (!ofile.is_open()) {
+			std::cerr << "failed to open output file " << output_qoi << "\n";
+			return -2;
+		}
+		// Flush before testing the stream so a failed write is reported instead of being announced as success below.
+		if (!ofile.write(reinterpret_cast<const char*>(encoded_data.data()), encoded_data.size()).flush()) {
+			std::cerr << "failed to write output file " << output_qoi << "\n";
+			return -2;
+		}
+	}
+
+	std::cout << "written " << encoded_data.size() << " bytes to " << output_qoi << ". input was " << input_file_size << "\n";
+	// TODO: metrics
+	return 0;
+}
+