Squashed 'external/libwebp/libwebp/' content from commit dd7364c3c

git-subtree-dir: external/libwebp/libwebp
git-subtree-split: dd7364c3cefe0f5c0b3c18c3b1887d353f90fc1f
This commit is contained in:
2023-08-02 14:57:22 +02:00
commit 67653bbe50
331 changed files with 116119 additions and 0 deletions

41
sharpyuv/Makefile.am Normal file
View File

@ -0,0 +1,41 @@
# Automake rules for libsharpyuv: one installed libtool library plus two
# non-installed helper libraries holding the SSE2 and NEON code paths.

# Headers are looked up relative to the top-level directory and to src/, in
# both the build tree and the source tree.
AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir)
AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
lib_LTLIBRARIES = libsharpyuv.la
# The SIMD objects are built as separate libraries so that each one can be
# compiled with its own instruction-set flags (see the *_CFLAGS lines below).
# They are merged into libsharpyuv.la through libsharpyuv_la_LIBADD.
noinst_LTLIBRARIES =
noinst_LTLIBRARIES += libsharpyuv_sse2.la
noinst_LTLIBRARIES += libsharpyuv_neon.la
# Public headers, installed into $(libsharpyuvincludedir).
libsharpyuvinclude_HEADERS =
libsharpyuvinclude_HEADERS += sharpyuv.h
libsharpyuvinclude_HEADERS += sharpyuv_csp.h
# Files borrowed from ../src. cpu.c is listed as a header because it is
# textually #included by sharpyuv_cpu.c instead of being compiled on its own.
noinst_HEADERS =
noinst_HEADERS += ../src/dsp/cpu.c
noinst_HEADERS += ../src/dsp/cpu.h
noinst_HEADERS += ../src/webp/types.h
# SSE2 code path.
libsharpyuv_sse2_la_SOURCES =
libsharpyuv_sse2_la_SOURCES += sharpyuv_sse2.c
libsharpyuv_sse2_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS)
libsharpyuv_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
# NEON code path.
libsharpyuv_neon_la_SOURCES =
libsharpyuv_neon_la_SOURCES += sharpyuv_neon.c
libsharpyuv_neon_la_CPPFLAGS = $(libsharpyuv_la_CPPFLAGS)
libsharpyuv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
# Portable sources of the main library.
libsharpyuv_la_SOURCES =
libsharpyuv_la_SOURCES += sharpyuv_cpu.c sharpyuv_cpu.h
libsharpyuv_la_SOURCES += sharpyuv_csp.c sharpyuv_csp.h
libsharpyuv_la_SOURCES += sharpyuv_dsp.c sharpyuv_dsp.h
libsharpyuv_la_SOURCES += sharpyuv_gamma.c sharpyuv_gamma.h
libsharpyuv_la_SOURCES += sharpyuv.c sharpyuv.h
libsharpyuv_la_CPPFLAGS = $(AM_CPPFLAGS)
# -version-info is libtool's current:revision:age triple (the shared-library
# ABI version). It is a separate number from the SHARPYUV_VERSION_* macros in
# sharpyuv.h.
libsharpyuv_la_LDFLAGS = -no-undefined -version-info 0:1:0 -lm
libsharpyuv_la_LIBADD =
libsharpyuv_la_LIBADD += libsharpyuv_sse2.la
libsharpyuv_la_LIBADD += libsharpyuv_neon.la
# Install location of the public headers and of the pkg-config file.
libsharpyuvincludedir = $(includedir)/webp/sharpyuv
pkgconfig_DATA = libsharpyuv.pc

View File

@ -0,0 +1,11 @@
# pkg-config template for libsharpyuv. The @...@ placeholders are substituted
# at configure time.
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
# Points at the "webp" subdirectory of the include root; the Makefile installs
# the headers one level deeper, in .../webp/sharpyuv.
includedir=@includedir@/webp
Name: libsharpyuv
Description: Library for sharp RGB to YUV conversion
Version: @PACKAGE_VERSION@
Cflags: -I${includedir}
Libs: -L${libdir} -l@webp_libname_prefix@sharpyuv
# Extra flags that are only needed when linking statically.
Libs.private: -lm @PTHREAD_CFLAGS@ @PTHREAD_LIBS@

41
sharpyuv/libsharpyuv.rc Normal file
View File

@ -0,0 +1,41 @@
// Windows version resource embedded in libsharpyuv.dll.
#define APSTUDIO_READONLY_SYMBOLS
#include "winres.h"
#undef APSTUDIO_READONLY_SYMBOLS
#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
// The numeric FILEVERSION/PRODUCTVERSION fields and the "FileVersion" /
// "ProductVersion" strings below have to be kept in sync by hand.
// NOTE(review): this resource says 0.2.1 while sharpyuv.h defines
// SHARPYUV_VERSION_{MAJOR,MINOR,PATCH} as 0.4.0 -- confirm which one is
// intended for this snapshot.
VS_VERSION_INFO VERSIONINFO
FILEVERSION 0,0,2,1
PRODUCTVERSION 0,0,2,1
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x1L
#else
FILEFLAGS 0x0L
#endif
FILEOS 0x40004L
FILETYPE 0x2L
FILESUBTYPE 0x0L
BEGIN
BLOCK "StringFileInfo"
BEGIN
BLOCK "040904b0"
BEGIN
VALUE "CompanyName", "Google, Inc."
VALUE "FileDescription", "libsharpyuv DLL"
VALUE "FileVersion", "0.2.1"
VALUE "InternalName", "libsharpyuv.dll"
VALUE "LegalCopyright", "Copyright (C) 2023"
VALUE "OriginalFilename", "libsharpyuv.dll"
VALUE "ProductName", "SharpYuv Library"
VALUE "ProductVersion", "0.2.1"
END
END
BLOCK "VarFileInfo"
BEGIN
VALUE "Translation", 0x409, 1200
END
END
#endif // English (United States) resources

565
sharpyuv/sharpyuv.c Normal file
View File

@ -0,0 +1,565 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Sharp RGB to YUV conversion.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv.h"
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "src/webp/types.h"
#include "sharpyuv/sharpyuv_cpu.h"
#include "sharpyuv/sharpyuv_dsp.h"
#include "sharpyuv/sharpyuv_gamma.h"
//------------------------------------------------------------------------------
// Returns the packed SHARPYUV_VERSION value this translation unit was
// compiled with.
int SharpYuvGetVersion(void) {
  const int packed_version = SHARPYUV_VERSION;
  return packed_version;
}
//------------------------------------------------------------------------------
// Sharp RGB->YUV conversion
// Upper bound on the number of refinement passes run over the image; the
// refinement loop may stop earlier.
static const int kNumIterations = 4;
#define YUV_FIX 16 // fixed-point precision for RGB->YUV
// Rounding term added before shifting right by YUV_FIX (half of one unit).
static const int kYuvHalf = 1 << (YUV_FIX - 1);
// Largest working bit depth: intermediate calculations must fit in 16 bits.
static const int kMaxBitDepth = 14;

// Returns the shift that takes samples of 'rgb_bit_depth' bits to the working
// precision. The result is +2 when two extra bits still fit in kMaxBitDepth.
// Otherwise it is the remaining headroom, which is negative (i.e. bits are
// dropped) for inputs deeper than kMaxBitDepth.
static int GetPrecisionShift(int rgb_bit_depth) {
  if (rgb_bit_depth + 2 <= kMaxBitDepth) {
    return 2;
  }
  return kMaxBitDepth - rgb_bit_depth;
}
typedef int16_t fixed_t;     // signed type with extra precision for UV
typedef uint16_t fixed_y_t;  // unsigned type with extra precision for W

//------------------------------------------------------------------------------

// Saturates 'v' to the [0, 255] range.
static uint8_t clip_8b(fixed_t v) {
  if (v < 0) return 0u;
  if (v > 0xff) return 255u;
  return (uint8_t)v;
}

// Saturates 'v' to the [0, max] range.
static uint16_t clip(fixed_t v, int max) {
  if (v < 0) return 0;
  if (v > max) return (uint16_t)max;
  return (uint16_t)v;
}

// Saturates 'y' to the range representable with 'bit_depth' bits, that is
// [0, (1 << bit_depth) - 1].
static fixed_y_t clip_bit_depth(int y, int bit_depth) {
  const int max = (1 << bit_depth) - 1;
  if (y < 0) return 0;
  if (y > max) return (fixed_y_t)max;
  return (fixed_y_t)y;
}
//------------------------------------------------------------------------------
// Weighted sum of r, g and b in YUV_FIX fixed point, rounded to nearest.
// The three weights add up to 1 << YUV_FIX.
static int RGBToGray(int64_t r, int64_t g, int64_t b) {
  int64_t acc = 13933 * r;
  acc += 46871 * g;
  acc += 4732 * b;
  acc += kYuvHalf;  // rounding
  return (int)(acc >> YUV_FIX);
}
// Averages four samples after mapping them through SharpYuvGammaToLinear, and
// maps the rounded average back with SharpYuvLinearToGamma.
static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                          int rgb_bit_depth,
                          SharpYuvTransferFunctionType transfer_type) {
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  const uint32_t lin_a = SharpYuvGammaToLinear(a, bit_depth, transfer_type);
  const uint32_t lin_b = SharpYuvGammaToLinear(b, bit_depth, transfer_type);
  const uint32_t lin_c = SharpYuvGammaToLinear(c, bit_depth, transfer_type);
  const uint32_t lin_d = SharpYuvGammaToLinear(d, bit_depth, transfer_type);
  // '+ 2' rounds the division by 4 to nearest.
  const uint32_t average = (lin_a + lin_b + lin_c + lin_d + 2) >> 2;
  return SharpYuvLinearToGamma(average, bit_depth, transfer_type);
}
// Computes one row of W values from one row of planar R/G/B samples.
// 'src' holds three consecutive planes of 'w' samples (R, then G, then B);
// 'dst' receives 'w' values. Each sample goes through SharpYuvGammaToLinear,
// the three results are combined by RGBToGray, and the sum is mapped back
// with SharpYuvLinearToGamma.
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
                                int rgb_bit_depth,
                                SharpYuvTransferFunctionType transfer_type) {
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  int x;
  for (x = 0; x < w; ++x) {
    const fixed_y_t r = src[x];
    const fixed_y_t g = src[w + x];
    const fixed_y_t b = src[2 * w + x];
    const uint32_t lin_r = SharpYuvGammaToLinear(r, bit_depth, transfer_type);
    const uint32_t lin_g = SharpYuvGammaToLinear(g, bit_depth, transfer_type);
    const uint32_t lin_b = SharpYuvGammaToLinear(b, bit_depth, transfer_type);
    const uint32_t lin_y = RGBToGray(lin_r, lin_g, lin_b);
    dst[x] = (fixed_y_t)SharpYuvLinearToGamma(lin_y, bit_depth, transfer_type);
  }
}
// Computes one row of chroma differences from two rows of planar R/G/B.
// 'src1' and 'src2' each hold three planes of 2 * uv_w samples. Every 2x2
// block is reduced to one r/g/b triplet with ScaleDown, and 'dst' (three
// planes of uv_w values) receives r - W, g - W and b - W, where W is
// RGBToGray of that triplet.
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
                         fixed_t* dst, int uv_w, int rgb_bit_depth,
                         SharpYuvTransferFunctionType transfer_type) {
  int x;
  for (x = 0; x < uv_w; ++x) {
    const int left = 2 * x;
    const int right = left + 1;
    int avg[3];
    int channel;
    int W;
    for (channel = 0; channel < 3; ++channel) {
      // Each source plane is 2 * uv_w samples wide.
      const int plane = channel * 2 * uv_w;
      avg[channel] =
          ScaleDown(src1[plane + left], src1[plane + right],
                    src2[plane + left], src2[plane + right],
                    rgb_bit_depth, transfer_type);
    }
    W = RGBToGray(avg[0], avg[1], avg[2]);
    for (channel = 0; channel < 3; ++channel) {
      dst[channel * uv_w + x] = (fixed_t)(avg[channel] - W);
    }
  }
}
// Fills 'y' with RGBToGray of each pixel of a planar row: 'rgb' holds three
// consecutive planes of 'w' samples (R, then G, then B).
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
  const fixed_y_t* const r_plane = rgb;
  const fixed_y_t* const g_plane = rgb + w;
  const fixed_y_t* const b_plane = rgb + 2 * w;
  int x;
  assert(w > 0);
  for (x = 0; x < w; ++x) {
    y[x] = RGBToGray(r_plane[x], g_plane[x], b_plane[x]);
  }
}
//------------------------------------------------------------------------------
// Blends A and B with weights 3:1 (rounded), adds W0 and clips the sum to
// 'bit_depth' bits.
static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
  const int blended = (3 * A + B + 2) >> 2;
  const int sum = blended + W0;
  return clip_bit_depth(sum, bit_depth);
}
//------------------------------------------------------------------------------
// Shifts 'v' left by 'shift' bits, or right by -shift bits when 'shift' is
// negative.
static WEBP_INLINE int Shift(int v, int shift) {
  if (shift >= 0) {
    return v << shift;
  }
  return v >> -shift;
}
// Converts one row of input pixels to the planar working representation.
// 'dst' receives three planes (R, G, B), each of even width
// w = (pic_width + 1) & ~1. Samples are read as uint8_t when rgb_bit_depth
// is 8 and as uint16_t otherwise, then shifted by GetPrecisionShift(). For
// odd widths the last pixel is duplicated to fill the padding column.
static void ImportOneRow(const uint8_t* const r_ptr,
                         const uint8_t* const g_ptr,
                         const uint8_t* const b_ptr,
                         int rgb_step,
                         int rgb_bit_depth,
                         int pic_width,
                         fixed_y_t* const dst) {
  // Convert the rgb_step from a number of bytes to a number of uint8_t or
  // uint16_t values depending on the bit depth.
  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
  const int shift = GetPrecisionShift(rgb_bit_depth);
  const int w = (pic_width + 1) & ~1;
  const int is_8bit = (rgb_bit_depth == 8);
  int x;
  for (x = 0; x < pic_width; ++x) {
    const int off = x * step;
    int r, g, b;
    if (is_8bit) {
      r = r_ptr[off];
      g = g_ptr[off];
      b = b_ptr[off];
    } else {
      r = ((const uint16_t*)r_ptr)[off];
      g = ((const uint16_t*)g_ptr)[off];
      b = ((const uint16_t*)b_ptr)[off];
    }
    dst[0 * w + x] = Shift(r, shift);
    dst[1 * w + x] = Shift(g, shift);
    dst[2 * w + x] = Shift(b, shift);
  }
  if (pic_width & 1) {  // replicate rightmost pixel
    int plane;
    for (plane = 0; plane < 3; ++plane) {
      fixed_y_t* const row = dst + plane * w;
      row[pic_width] = row[pic_width - 1];
    }
  }
}
// Rebuilds two rows of planar R/G/B samples from two rows of W ('best_y',
// rows at offsets 0 and w) and three rows of chroma differences.
// 'out1' is filtered from cur_uv and prev_uv, 'out2' from cur_uv and next_uv.
// Each output holds three planes of 'w' samples; each uv row holds three
// planes of w / 2 values. The same W rows are used for all three channels.
static void InterpolateTwoRows(const fixed_y_t* const best_y,
                               const fixed_t* prev_uv,
                               const fixed_t* cur_uv,
                               const fixed_t* next_uv,
                               int w,
                               fixed_y_t* out1,
                               fixed_y_t* out2,
                               int rgb_bit_depth) {
  const int uv_w = w >> 1;
  const int len = (w - 1) >> 1;  // length to filter
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  const int last = w - 1;
  const int uv_last = uv_w - 1;
  int channel;
  for (channel = 0; channel < 3; ++channel) {  // R, G, B segments in turn
    const fixed_t* const prev = prev_uv + channel * uv_w;
    const fixed_t* const cur = cur_uv + channel * uv_w;
    const fixed_t* const next = next_uv + channel * uv_w;
    fixed_y_t* const top = out1 + channel * w;
    fixed_y_t* const bottom = out2 + channel * w;

    // special boundary case for the first column
    top[0] = Filter2(cur[0], prev[0], best_y[0], bit_depth);
    bottom[0] = Filter2(cur[0], next[0], best_y[w], bit_depth);

    SharpYuvFilterRow(cur, prev, len, best_y + 0 + 1, top + 1, bit_depth);
    SharpYuvFilterRow(cur, next, len, best_y + w + 1, bottom + 1, bit_depth);

    // special boundary case for the last column when w is even
    if ((w & 1) == 0) {
      top[last] =
          Filter2(cur[uv_last], prev[uv_last], best_y[last + 0], bit_depth);
      bottom[last] =
          Filter2(cur[uv_last], next[uv_last], best_y[last + w], bit_depth);
    }
  }
}
// Applies one row of a conversion matrix: coeffs[0..2] weight r, g and b,
// and coeffs[3] is a constant offset. The sum is rounded and shifted right by
// YUV_FIX + sfix bits.
static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
                                         const int coeffs[4], int sfix) {
  const int total_shift = YUV_FIX + sfix;
  const int srounder = 1 << (total_shift - 1);
  int acc = coeffs[0] * r;
  acc += coeffs[1] * g;
  acc += coeffs[2] * b;
  acc += coeffs[3];
  acc += srounder;
  return acc >> total_shift;
}
// Writes the final Y, U and V planes from the refined buffers.
// best_y: rows of w = (width + 1) & ~1 W samples.
// best_uv: one row per pair of luma rows; each row holds three planes of
// uv_w = w / 2 values (the r - W, g - W and b - W differences).
// y_ptr/u_ptr/v_ptr are advanced by their strides as byte pointers. Samples
// are stored as uint8_t when yuv_bit_depth <= 8 and as uint16_t otherwise.
// yuv_matrix supplies the rgb_to_y/u/v rows used by RGBToYUVComponent.
// Always returns 1.
static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
int u_stride, uint8_t* v_ptr, int v_stride,
int rgb_bit_depth,
int yuv_bit_depth, int width, int height,
const SharpYuvConversionMatrix* yuv_matrix) {
int i, j;
const fixed_t* const best_uv_base = best_uv;
const int w = (width + 1) & ~1;
const int h = (height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
// The extra precision bits of the working samples are removed by the final
// shift inside RGBToYUVComponent.
const int sfix = GetPrecisionShift(rgb_bit_depth);
const int yuv_max = (1 << yuv_bit_depth) - 1;
// Luma pass: only the real width x height pixels are written, not the
// padding column/row.
for (best_uv = best_uv_base, j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
// Two horizontally adjacent pixels share one chroma sample.
const int off = (i >> 1);
const int W = best_y[i];
const int r = best_uv[off + 0 * uv_w] + W;
const int g = best_uv[off + 1 * uv_w] + W;
const int b = best_uv[off + 2 * uv_w] + W;
const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
if (yuv_bit_depth <= 8) {
y_ptr[i] = clip_8b(y);
} else {
((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
}
}
best_y += w;
// Move to the next chroma row only after an odd luma row, since one chroma
// row serves two luma rows.
best_uv += (j & 1) * 3 * uv_w;
y_ptr += y_stride;
}
// Chroma pass: one U and one V sample per chroma position.
for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
for (i = 0; i < uv_w; ++i) {
const int off = i;
// Note r, g and b values here are off by W, but a constant offset on all
// 3 components doesn't change the value of u and v with a YCbCr matrix.
const int r = best_uv[off + 0 * uv_w];
const int g = best_uv[off + 1 * uv_w];
const int b = best_uv[off + 2 * uv_w];
const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
if (yuv_bit_depth <= 8) {
u_ptr[i] = clip_8b(u);
v_ptr[i] = clip_8b(v);
} else {
((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
}
}
best_uv += 3 * uv_w;
u_ptr += u_stride;
v_ptr += v_stride;
}
return 1;
}
//------------------------------------------------------------------------------
// Main function
// Allocates nmemb * size bytes with malloc().
// Returns NULL if the product does not fit in 64 bits, if it does not fit in
// a size_t, or if malloc() itself fails. The caller owns the returned memory
// and releases it with free().
static void* SafeMalloc(uint64_t nmemb, size_t size) {
  uint64_t total_size;
  // Reject element counts whose byte size would wrap around in 64 bits; the
  // truncated product could otherwise look like a small, valid request.
  if (size != 0 && nmemb > ~(uint64_t)0 / (uint64_t)size) return NULL;
  total_size = nmemb * (uint64_t)size;
  if (total_size != (size_t)total_size) return NULL;
  return malloc((size_t)total_size);
}

// Allocates an array of (W) * (H) elements of type T. The element count is
// computed in 64 bits so that W * H cannot overflow 'int' for large images.
#define SAFE_ALLOC(W, H, T) \
  ((T*)SafeMalloc((uint64_t)(W) * (uint64_t)(H), sizeof(T)))
// Core of the conversion: turns the r/g/b input into y/u/v planes.
// The work is done in three stages:
//  1. Import: every pair of input rows is converted to the working
//     representation. StoreGray() seeds best_y, UpdateW()/UpdateChroma()
//     compute target_y/target_uv, and best_uv starts as a copy of target_uv.
//  2. Refinement: up to kNumIterations passes rebuild R/G/B rows from
//     best_y/best_uv, recompute W and chroma from them, and let
//     SharpYuvUpdateY()/SharpYuvUpdateRGB() adjust best_y/best_uv.
//  3. ConvertWRGBToYUV() writes the output planes.
// rgb_step and rgb_stride are in bytes (the r/g/b pointers are byte pointers
// advanced by rgb_stride). yuv_matrix is passed on to ConvertWRGBToYUV().
// Returns 0 if a scratch buffer could not be allocated, otherwise the result
// of ConvertWRGBToYUV(). All scratch buffers are freed before returning.
static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
const uint8_t* b_ptr, int rgb_step, int rgb_stride,
int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
int v_stride, int yuv_bit_depth, int width,
int height,
const SharpYuvConversionMatrix* yuv_matrix,
SharpYuvTransferFunctionType transfer_type) {
// we expand the right/bottom border if needed
const int w = (width + 1) & ~1;
const int h = (height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
// ~0 converts to the largest uint64_t value.
uint64_t prev_diff_y_sum = ~0;
int j, iter;
// TODO(skal): allocate one big memory chunk. But for now, it's easier
// for valgrind debugging to have several chunks.
// tmp_buffer holds two rows of three planes each (src1 and src2 below).
fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
fixed_y_t* best_y = best_y_base;
fixed_y_t* target_y = target_y_base;
fixed_t* best_uv = best_uv_base;
fixed_t* target_uv = target_uv_base;
// Early-exit threshold for the refinement loop: 3 per sample of the padded
// image.
const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
int ok;
assert(w > 0);
assert(h > 0);
// All allocations are checked together; free(NULL) is harmless at 'End'.
if (best_y_base == NULL || best_uv_base == NULL ||
target_y_base == NULL || target_uv_base == NULL ||
best_rgb_y == NULL || best_rgb_uv == NULL ||
tmp_buffer == NULL) {
ok = 0;
goto End;
}
// Import RGB samples to W/RGB representation.
for (j = 0; j < height; j += 2) {
const int is_last_row = (j == height - 1);
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
// prepare two rows of input
ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
src1);
if (!is_last_row) {
ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
rgb_step, rgb_bit_depth, width, src2);
} else {
// Odd height: the missing bottom row is a copy of the last real row.
memcpy(src2, src1, 3 * w * sizeof(*src2));
}
StoreGray(src1, best_y + 0, w);
StoreGray(src2, best_y + w, w);
UpdateW(src1, target_y, w, rgb_bit_depth, transfer_type);
UpdateW(src2, target_y + w, w, rgb_bit_depth, transfer_type);
UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth, transfer_type);
// The initial chroma estimate is the target itself.
memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
best_y += 2 * w;
best_uv += 3 * uv_w;
target_y += 2 * w;
target_uv += 3 * uv_w;
r_ptr += 2 * rgb_stride;
g_ptr += 2 * rgb_stride;
b_ptr += 2 * rgb_stride;
}
// Iterate and resolve clipping conflicts.
for (iter = 0; iter < kNumIterations; ++iter) {
const fixed_t* cur_uv = best_uv_base;
const fixed_t* prev_uv = best_uv_base;
uint64_t diff_y_sum = 0;
// Rewind the cursors to the top of the image for this pass.
best_y = best_y_base;
best_uv = best_uv_base;
target_y = target_y_base;
target_uv = target_uv_base;
for (j = 0; j < h; j += 2) {
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
{
// For the last row pair there is no next chroma row: reuse the current
// one. Likewise prev_uv equals cur_uv for the first row pair.
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
src1, src2, rgb_bit_depth);
prev_uv = cur_uv;
cur_uv = next_uv;
}
UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth, transfer_type);
UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth, transfer_type);
UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth, transfer_type);
// update two rows of Y and one row of RGB
diff_y_sum +=
SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
best_y += 2 * w;
best_uv += 3 * uv_w;
target_y += 2 * w;
target_uv += 3 * uv_w;
}
// test exit condition
// At least two passes always run. After that, stop once the accumulated
// value is below the threshold or has grown compared to the previous pass.
if (iter > 0) {
if (diff_y_sum < diff_y_threshold) break;
if (diff_y_sum > prev_diff_y_sum) break;
}
prev_diff_y_sum = diff_y_sum;
}
// final reconstruction
ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
width, height, yuv_matrix);
End:
free(best_y_base);
free(best_uv_base);
free(target_y_base);
free(target_uv_base);
free(best_rgb_y);
free(best_rgb_uv);
free(tmp_buffer);
return ok;
}
#undef SAFE_ALLOC
// LOCK_ACCESS / UNLOCK_ACCESS_AND_RETURN guard the body of a void function.
// With pthreads available, LOCK_ACCESS declares a function-local static mutex
// and locks it (returning from the enclosing function if locking fails), and
// UNLOCK_ACCESS_AND_RETURN unlocks it and returns. LOCK_ACCESS expands to a
// declaration, so it has to be used where a declaration is allowed.
// In every other configuration both macros do no locking at all.
#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h> // NOLINT
#define LOCK_ACCESS \
static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
if (pthread_mutex_lock(&sharpyuv_lock)) return
#define UNLOCK_ACCESS_AND_RETURN \
do { \
(void)pthread_mutex_unlock(&sharpyuv_lock); \
return; \
} while (0)
#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define LOCK_ACCESS do {} while (0)
#define UNLOCK_ACCESS_AND_RETURN return
#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
// Hidden exported init function.
// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
// users can declare it as extern and call it with an alternate VP8CPUInfo
// function.
// The DSP functions and gamma tables are (re)initialized only when the
// effective SharpYuvGetCPUInfo differs from the one used by the previous
// initialization; otherwise the call returns without doing anything else.
extern VP8CPUInfo SharpYuvGetCPUInfo;
SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func);
void SharpYuvInit(VP8CPUInfo cpu_info_func) {
// Starts out pointing at itself, a value that is not a CPU-info function, so
// that the first call always runs the initialization below.
static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
(VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
LOCK_ACCESS;
// Only update SharpYuvGetCPUInfo when called from external code to avoid a
// race on reading the value in SharpYuvConvert().
// Internal callers pass the address of the SharpYuvGetCPUInfo variable as a
// marker meaning "keep the current function".
if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) {
SharpYuvGetCPUInfo = cpu_info_func;
}
if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) {
UNLOCK_ACCESS_AND_RETURN;
}
SharpYuvInitDsp();
SharpYuvInitGammaTables();
sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo;
UNLOCK_ACCESS_AND_RETURN;
}
// Convenience entry point: same as SharpYuvConvertWithOptions() with the
// given matrix and the kSharpYuvTransferFunctionSrgb transfer function.
int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr,
                    int rgb_step, int rgb_stride, int rgb_bit_depth,
                    void* y_ptr, int y_stride, void* u_ptr, int u_stride,
                    void* v_ptr, int v_stride, int yuv_bit_depth, int width,
                    int height, const SharpYuvConversionMatrix* yuv_matrix) {
  SharpYuvOptions srgb_options;
  int status;
  srgb_options.yuv_matrix = yuv_matrix;
  srgb_options.transfer_type = kSharpYuvTransferFunctionSrgb;
  status = SharpYuvConvertWithOptions(
      r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, rgb_bit_depth,
      y_ptr, y_stride, u_ptr, u_stride, v_ptr, v_stride,
      yuv_bit_depth, width, height, &srgb_options);
  return status;
}
// Fills 'options' with the given matrix and the sRGB transfer function.
// 'version' is the packed SHARPYUV_VERSION the caller was compiled against.
// Returns 0 (leaving 'options' untouched) if a pointer is NULL, if the major
// versions differ, or, while the major version is 0, if the minor versions
// differ. Returns 1 on success.
int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix* yuv_matrix,
                                SharpYuvOptions* options, int version) {
  const int major = (version >> 24);
  const int minor = (version >> 16) & 0xff;

  if (options == NULL || yuv_matrix == NULL) {
    return 0;
  }
  if (major != SHARPYUV_VERSION_MAJOR) {
    return 0;
  }
  // Here major == SHARPYUV_VERSION_MAJOR; a 0.x caller must also match the
  // minor version exactly.
  if (major == 0 && minor != SHARPYUV_VERSION_MINOR) {
    return 0;
  }

  options->yuv_matrix = yuv_matrix;
  options->transfer_type = kSharpYuvTransferFunctionSrgb;
  return 1;
}
// Validates the arguments, folds the rgb->yuv bit-depth scaling into a copy
// of the conversion matrix, and runs the conversion.
// Returns 0 without touching the output when:
//  - 'options' or options->yuv_matrix is NULL,
//  - width or height is < 1 or equal to INT_MAX,
//  - any of the r/g/b/y/u/v pointers is NULL,
//  - rgb_bit_depth is not one of 8, 10, 12, 16,
//  - yuv_bit_depth is not one of 8, 10, 12,
//  - a step/stride is odd although the matching buffer holds uint16_t.
// Otherwise returns the result of DoSharpArgbToYuv() (0 if scratch memory
// could not be allocated, 1 on success).
int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
                               const void* b_ptr, int rgb_step, int rgb_stride,
                               int rgb_bit_depth, void* y_ptr, int y_stride,
                               void* u_ptr, int u_stride, void* v_ptr,
                               int v_stride, int yuv_bit_depth, int width,
                               int height, const SharpYuvOptions* options) {
  const SharpYuvConversionMatrix* yuv_matrix;
  SharpYuvTransferFunctionType transfer_type;
  SharpYuvConversionMatrix scaled_matrix;
  int rgb_max, rgb_round, yuv_max, sfix;

  // Reject missing options/matrix the same way other NULL arguments are
  // rejected, instead of dereferencing them.
  if (options == NULL || options->yuv_matrix == NULL) {
    return 0;
  }
  yuv_matrix = options->yuv_matrix;
  transfer_type = options->transfer_type;

  if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
      r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
      u_ptr == NULL || v_ptr == NULL) {
    return 0;
  }
  if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
      rgb_bit_depth != 16) {
    return 0;
  }
  if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
    return 0;
  }
  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
    // Step/stride should be even for uint16_t buffers.
    return 0;
  }
  if (yuv_bit_depth > 8 &&
      (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
    // Stride should be even for uint16_t buffers.
    return 0;
  }

  // The bit depths are known to be valid from here on, so the shifts below
  // are well defined.
  rgb_max = (1 << rgb_bit_depth) - 1;
  rgb_round = 1 << (rgb_bit_depth - 1);
  yuv_max = (1 << yuv_bit_depth) - 1;
  sfix = GetPrecisionShift(rgb_bit_depth);

  // The address of the function pointer is used to avoid a read race.
  SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo);

  // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
  // rgb->yuv conversion matrix.
  if (rgb_bit_depth == yuv_bit_depth) {
    memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
  } else {
    int i;
    for (i = 0; i < 3; ++i) {
      scaled_matrix.rgb_to_y[i] =
          (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
      scaled_matrix.rgb_to_u[i] =
          (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
      scaled_matrix.rgb_to_v[i] =
          (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
    }
  }
  // Also incorporate precision change scaling. The constant offsets are
  // always taken from the caller's matrix and shifted to the working
  // precision.
  scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
  scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
  scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);

  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
                          v_ptr, v_stride, yuv_bit_depth, width, height,
                          &scaled_matrix, transfer_type);
}
//------------------------------------------------------------------------------

174
sharpyuv/sharpyuv.h Normal file
View File

@ -0,0 +1,174 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Sharp RGB to YUV conversion.
#ifndef WEBP_SHARPYUV_SHARPYUV_H_
#define WEBP_SHARPYUV_SHARPYUV_H_
#ifdef __cplusplus
extern "C" {
#endif
// SHARPYUV_EXTERN is the linkage/visibility prefix put on public
// declarations. A definition supplied by the includer wins; otherwise
// WEBP_EXTERN is reused when defined; otherwise a compiler-specific default
// is selected below.
#ifndef SHARPYUV_EXTERN
#ifdef WEBP_EXTERN
#define SHARPYUV_EXTERN WEBP_EXTERN
#else
// This explicitly marks library functions and allows for changing the
// signature for e.g., Windows DLL builds.
#if defined(__GNUC__) && __GNUC__ >= 4
#define SHARPYUV_EXTERN extern __attribute__((visibility("default")))
#else
#if defined(_MSC_VER) && defined(WEBP_DLL)
#define SHARPYUV_EXTERN __declspec(dllexport)
#else
#define SHARPYUV_EXTERN extern
#endif /* _MSC_VER && WEBP_DLL */
#endif /* __GNUC__ >= 4 */
#endif /* WEBP_EXTERN */
#endif /* SHARPYUV_EXTERN */
// SHARPYUV_INLINE is the inline specifier used by this header. A definition
// supplied by the includer wins; otherwise WEBP_INLINE is reused when
// defined. Failing that it is __forceinline for MSVC, 'inline' for C++,
// C99 and later, or non-strict-ANSI compilers, and empty otherwise.
#ifndef SHARPYUV_INLINE
#ifdef WEBP_INLINE
#define SHARPYUV_INLINE WEBP_INLINE
#else
#ifndef _MSC_VER
#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define SHARPYUV_INLINE inline
#else
#define SHARPYUV_INLINE
#endif
#else
#define SHARPYUV_INLINE __forceinline
#endif /* _MSC_VER */
#endif /* WEBP_INLINE */
#endif /* SHARPYUV_INLINE */
// SharpYUV API version following the convention from semver.org
#define SHARPYUV_VERSION_MAJOR 0
#define SHARPYUV_VERSION_MINOR 4
#define SHARPYUV_VERSION_PATCH 0
// Version as a uint32_t. The major number is the high 8 bits.
// The minor number is the middle 8 bits. The patch number is the low 16 bits.
// For example, version 0.4.0 packs to 0x00040000.
#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
(((MAJOR) << 24) | ((MINOR) << 16) | (PATCH))
#define SHARPYUV_VERSION \
SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
SHARPYUV_VERSION_PATCH)
// Returns the library's version number, packed in hexadecimal. See
// SHARPYUV_VERSION.
// This is the version the library was built with, as opposed to the
// SHARPYUV_VERSION macro seen by the code including this header.
SHARPYUV_EXTERN int SharpYuvGetVersion(void);
// RGB to YUV conversion matrix, in 16 bit fixed point.
// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
// Then y, u and v values are divided by 1<<16 and rounded.
// In each row, entries 0..2 are the r, g and b weights and entry 3 is the
// constant offset, which is also expressed in 16 bit fixed point.
typedef struct {
int rgb_to_y[4];
int rgb_to_u[4];
int rgb_to_v[4];
} SharpYuvConversionMatrix;
// Forward declaration; the struct is defined further down in this header.
typedef struct SharpYuvOptions SharpYuvOptions;
// Enums for transfer functions, as defined in H.273,
// https://www.itu.int/rec/T-REC-H.273-202107-I/en
// The numeric values are the H.273 code points, which is why some values are
// skipped.
typedef enum SharpYuvTransferFunctionType {
// 0 is reserved
kSharpYuvTransferFunctionBt709 = 1,
// 2 is unspecified
// 3 is reserved
kSharpYuvTransferFunctionBt470M = 4,
kSharpYuvTransferFunctionBt470Bg = 5,
kSharpYuvTransferFunctionBt601 = 6,
kSharpYuvTransferFunctionSmpte240 = 7,
kSharpYuvTransferFunctionLinear = 8,
kSharpYuvTransferFunctionLog100 = 9,
kSharpYuvTransferFunctionLog100_Sqrt10 = 10,
kSharpYuvTransferFunctionIec61966 = 11,
kSharpYuvTransferFunctionBt1361 = 12,
kSharpYuvTransferFunctionSrgb = 13,
kSharpYuvTransferFunctionBt2020_10Bit = 14,
kSharpYuvTransferFunctionBt2020_12Bit = 15,
kSharpYuvTransferFunctionSmpte2084 = 16, // PQ
kSharpYuvTransferFunctionSmpte428 = 17,
kSharpYuvTransferFunctionHlg = 18,
// Not a transfer function: one past the last valid value (19).
kSharpYuvTransferFunctionNum
} SharpYuvTransferFunctionType;
// Converts RGB to YUV420 using a downsampling algorithm that minimizes
// artefacts caused by chroma subsampling.
// This is slower than standard downsampling (averaging of 4 UV values).
// Assumes that the image will be upsampled using a bilinear filter. If nearest
// neighbor is used instead, the upsampled image might look worse than with
// standard downsampling.
// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
// to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
// rgb_step: distance in bytes between two horizontally adjacent pixels on the
// r, g and b channels. If rgb_bit_depth is > 8, it should be a
// multiple of 2.
// rgb_stride: distance in bytes between two vertically adjacent pixels on the
// r, g, and b channels. If rgb_bit_depth is > 8, it should be a
// multiple of 2.
// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
// Note: 16 bit input is truncated to 14 bits before conversion to yuv.
// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels. Should
// point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
// otherwise.
// y_stride, u_stride, v_stride: distance in bytes between two vertically
// adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
// should be multiples of 2.
// width, height: width and height of the image in pixels
// yuv_matrix: the RGB to YUV conversion matrix to apply.
// This function calls SharpYuvConvertWithOptions with a default transfer
// function of kSharpYuvTransferFunctionSrgb.
// Returns 1 on success. Returns 0 if an argument is invalid (NULL channel
// pointer, unsupported bit depth, non-positive size, odd step/stride for
// uint16_t buffers) or if temporary memory could not be allocated.
SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
const void* b_ptr, int rgb_step,
int rgb_stride, int rgb_bit_depth,
void* y_ptr, int y_stride, void* u_ptr,
int u_stride, void* v_ptr, int v_stride,
int yuv_bit_depth, int width, int height,
const SharpYuvConversionMatrix* yuv_matrix);
// Conversion parameters for SharpYuvConvertWithOptions(). Initialize with
// SharpYuvOptionsInit() before changing individual fields.
struct SharpYuvOptions {
// This matrix cannot be NULL and can be initialized by
// SharpYuvComputeConversionMatrix.
const SharpYuvConversionMatrix* yuv_matrix;
// Transfer function of the input samples. SharpYuvOptionsInit() sets it to
// kSharpYuvTransferFunctionSrgb.
SharpYuvTransferFunctionType transfer_type;
};
// Internal, version-checked, entry point
// The last argument is the packed SHARPYUV_VERSION of the caller. Returns 0
// if a pointer is NULL or the versions are incompatible, 1 otherwise.
SHARPYUV_EXTERN int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix*,
SharpYuvOptions*, int);
// Should always be called, to initialize a fresh SharpYuvOptions
// structure before modification. SharpYuvOptionsInit() must have succeeded
// before using the 'options' object.
// Returns 1 on success and 0 on failure. Being inline, it passes along the
// SHARPYUV_VERSION of the header the caller was compiled with, so that the
// library can detect a header/library mismatch.
static SHARPYUV_INLINE int SharpYuvOptionsInit(
const SharpYuvConversionMatrix* yuv_matrix, SharpYuvOptions* options) {
return SharpYuvOptionsInitInternal(yuv_matrix, options, SHARPYUV_VERSION);
}
// Same as SharpYuvConvert(), except that the conversion matrix and the
// transfer function are taken from 'options' (see SharpYuvOptionsInit()).
// All other parameters have the meaning documented for SharpYuvConvert().
// Returns 1 on success, 0 on invalid arguments or allocation failure.
SHARPYUV_EXTERN int SharpYuvConvertWithOptions(
const void* r_ptr, const void* g_ptr, const void* b_ptr, int rgb_step,
int rgb_stride, int rgb_bit_depth, void* y_ptr, int y_stride, void* u_ptr,
int u_stride, void* v_ptr, int v_stride, int yuv_bit_depth, int width,
int height, const SharpYuvOptions* options);
// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
// support (it's rarely used in practice, especially for images).
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_SHARPYUV_SHARPYUV_H_

14
sharpyuv/sharpyuv_cpu.c Normal file
View File

@ -0,0 +1,14 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
#include "sharpyuv/sharpyuv_cpu.h"
// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The
// function pointer is renamed in sharpyuv_cpu.h.
#include "src/dsp/cpu.c"

22
sharpyuv/sharpyuv_cpu.h Normal file
View File

@ -0,0 +1,22 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_
#define WEBP_SHARPYUV_SHARPYUV_CPU_H_
#include "sharpyuv/sharpyuv.h"
// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds.
// SharpYuvInit() replaces the use of the function pointer.
// WEBP_EXTERN is forced to plain 'extern' for the declarations that follow.
#undef WEBP_EXTERN
#define WEBP_EXTERN extern
// Rename the symbol before including cpu.h, so that what it declares as
// VP8GetCPUInfo becomes SharpYuvGetCPUInfo in this library.
#define VP8GetCPUInfo SharpYuvGetCPUInfo
#include "src/dsp/cpu.h"
#endif // WEBP_SHARPYUV_SHARPYUV_CPU_H_

110
sharpyuv/sharpyuv_csp.c Normal file
View File

@ -0,0 +1,110 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Colorspace utilities.
#include "sharpyuv/sharpyuv_csp.h"
#include <assert.h>
#include <math.h>
#include <stddef.h>
static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); }
// Fills 'matrix' with the 16-bit fixed-point RGB->YUV coefficients derived
// from the kr/kb luma weights, bit depth and range of 'yuv_color_space'.
// For kSharpYuvRangeLimited the Y row is scaled by 219/denom and offset by
// 16, and the U/V rows are scaled by 224/denom (all shifted to the requested
// bit depth). For any other range value the scales stay at full range and the
// Y offset is 0. The U/V offset is always 128 << (bit_depth - 8).
// Both pointers are dereferenced without a NULL check.
void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
SharpYuvConversionMatrix* matrix) {
const float kr = yuv_color_space->kr;
const float kb = yuv_color_space->kb;
// The three luma weights sum to 1.
const float kg = 1.0f - kr - kb;
const float cr = 0.5f / (1.0f - kb);
const float cb = 0.5f / (1.0f - kr);
const int shift = yuv_color_space->bit_depth - 8;
const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
float scale_y = 1.0f;
float add_y = 0.0f;
// Note the naming: scale_u starts from 'cr' and scale_v from 'cb'.
float scale_u = cr;
float scale_v = cb;
float add_uv = (float)(128 << shift);
// NOTE(review): 'shift' has already been used in the shift above by the time
// this assert runs, so it does not protect that expression.
assert(yuv_color_space->bit_depth >= 8);
if (yuv_color_space->range == kSharpYuvRangeLimited) {
scale_y *= (219 << shift) / denom;
scale_u *= (224 << shift) / denom;
scale_v *= (224 << shift) / denom;
add_y = (float)(16 << shift);
}
matrix->rgb_to_y[0] = ToFixed16(kr * scale_y);
matrix->rgb_to_y[1] = ToFixed16(kg * scale_y);
matrix->rgb_to_y[2] = ToFixed16(kb * scale_y);
matrix->rgb_to_y[3] = ToFixed16(add_y);
matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u);
matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u);
matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u);
matrix->rgb_to_u[3] = ToFixed16(add_uv);
matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v);
matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v);
matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v);
matrix->rgb_to_v[3] = ToFixed16(add_uv);
}
// Precomputed conversion matrices, returned by SharpYuvGetConversionMatrix().
// Each row is {R multiplier, G multiplier, B multiplier, offset}; the offsets
// are written as '<value> << 16', i.e. with 16 fractional bits.
// Matrices are in YUV_FIX fixed point precision.
// NOTE(review): YUV_FIX is not defined in this file; presumably 16 -- confirm.
// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
static const SharpYuvConversionMatrix kWebpMatrix = {
{16839, 33059, 6420, 16 << 16},
{-9719, -19081, 28800, 128 << 16},
{28800, -24116, -4684, 128 << 16},
};
// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
{16829, 33039, 6416, 16 << 16},
{-9714, -19071, 28784, 128 << 16},
{28784, -24103, -4681, 128 << 16},
};
// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
static const SharpYuvConversionMatrix kRec601FullMatrix = {
{19595, 38470, 7471, 0},
{-11058, -21710, 32768, 128 << 16},
{32768, -27439, -5329, 128 << 16},
};
// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
{11966, 40254, 4064, 16 << 16},
{-6596, -22189, 28784, 128 << 16},
{28784, -26145, -2639, 128 << 16},
};
// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
static const SharpYuvConversionMatrix kRec709FullMatrix = {
{13933, 46871, 4732, 0},
{-7509, -25259, 32768, 128 << 16},
{32768, -29763, -3005, 128 << 16},
};
// Returns the precomputed matrix matching 'matrix_type', or NULL for
// kSharpYuvMatrixNum and for any value outside the enum.
const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
    SharpYuvMatrixType matrix_type) {
  const SharpYuvConversionMatrix* selected = NULL;
  switch (matrix_type) {
    case kSharpYuvMatrixWebp:
      selected = &kWebpMatrix;
      break;
    case kSharpYuvMatrixRec601Limited:
      selected = &kRec601LimitedMatrix;
      break;
    case kSharpYuvMatrixRec601Full:
      selected = &kRec601FullMatrix;
      break;
    case kSharpYuvMatrixRec709Limited:
      selected = &kRec709LimitedMatrix;
      break;
    case kSharpYuvMatrixRec709Full:
      selected = &kRec709FullMatrix;
      break;
    case kSharpYuvMatrixNum:
      break;  // Not a real matrix type: 'selected' stays NULL.
  }
  return selected;
}

60
sharpyuv/sharpyuv_csp.h Normal file
View File

@ -0,0 +1,60 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Colorspace utilities.
#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_
#define WEBP_SHARPYUV_SHARPYUV_CSP_H_
#include "sharpyuv/sharpyuv.h"
#ifdef __cplusplus
extern "C" {
#endif
// Range of YUV values.
typedef enum {
kSharpYuvRangeFull, // YUV values between [0;255] (for 8 bit)
kSharpYuvRangeLimited // Y in [16;235], U/V in [16;240] (for 8 bit)
} SharpYuvRange;
// Constants that define a YUV color space.
typedef struct {
// Kr and Kb are defined such that:
// Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
float kr;
float kb;
int bit_depth; // 8, 10 or 12
SharpYuvRange range; // Full or limited range of the YUV samples.
} SharpYuvColorSpace;
// Fills in 'matrix' for the given 'yuv_color_space'.
// Both pointers are dereferenced without a NULL check.
SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
const SharpYuvColorSpace* yuv_color_space,
SharpYuvConversionMatrix* matrix);
// Enums for precomputed conversion matrices.
typedef enum {
kSharpYuvMatrixWebp = 0,
kSharpYuvMatrixRec601Limited,
kSharpYuvMatrixRec601Full,
kSharpYuvMatrixRec709Limited,
kSharpYuvMatrixRec709Full,
kSharpYuvMatrixNum // Number of matrix types; not a valid matrix itself.
} SharpYuvMatrixType;
// Returns a pointer to a matrix for one of the predefined colorspaces.
// Returns NULL when 'matrix_type' is kSharpYuvMatrixNum or not a known type.
SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
SharpYuvMatrixType matrix_type);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_SHARPYUV_SHARPYUV_CSP_H_

104
sharpyuv/sharpyuv_dsp.c Normal file
View File

@ -0,0 +1,104 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv_dsp.h"
#include <assert.h>
#include <stdlib.h>
#include "sharpyuv/sharpyuv_cpu.h"
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
// Clamps 'v' to the range [0, max].
static uint16_t clip(int v, int max) {
  if (v < 0) {
    return 0;
  }
  if (v > max) {
    return max;
  }
  return (uint16_t)v;
}
// Adds the per-sample error (ref - src) to 'dst', clamping each result to
// [0, (1 << bit_depth) - 1]. Returns the sum of |ref - src| over 'len' samples.
static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src,
                                  uint16_t* dst, int len, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  uint64_t total = 0;
  int k = 0;
  while (k < len) {
    const int delta = ref[k] - src[k];
    dst[k] = clip((int)dst[k] + delta, max_y);
    total += (uint64_t)abs(delta);
    ++k;
  }
  return total;
}
// Adds the per-sample error (ref - src) to 'dst' for 'len' samples.
// No clamping is applied.
static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src,
                                int16_t* dst, int len) {
  int k = 0;
  while (k < len) {
    const int delta = ref[k] - src[k];
    dst[k] = (int16_t)(dst[k] + delta);
    ++k;
  }
}
// For each of the 'len' positions, blends A[i], A[i + 1], B[i] and B[i + 1]
// with weights 9/3/3/1 (out of 16, rounded), adds the result to the two
// matching 'best_y' samples and stores the clamped sums in 'out'.
// Reads len + 1 entries of 'A' and 'B', and writes 2 * len entries of 'out'.
static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len,
                                const uint16_t* best_y, uint16_t* out,
                                int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int k;
  for (k = 0; k < len; ++k) {
    const int a_cur = A[k];
    const int a_next = A[k + 1];
    const int b_cur = B[k];
    const int b_next = B[k + 1];
    const int left = (a_cur * 9 + a_next * 3 + b_cur * 3 + b_next + 8) >> 4;
    const int right = (a_next * 9 + a_cur * 3 + b_next * 3 + b_cur + 8) >> 4;
    out[2 * k + 0] = clip(best_y[2 * k + 0] + left, max_y);
    out[2 * k + 1] = clip(best_y[2 * k + 1] + right, max_y);
  }
}
#endif // !WEBP_NEON_OMIT_C_CODE
//-----------------------------------------------------------------------------
// Dispatch pointers for the speed-critical kernels. SharpYuvInitDsp() points
// them at the plain-C or SIMD implementations.
// Parameter names match the implementations: the reference comes first and
// the per-sample error is computed as ref[i] - src[i].
uint64_t (*SharpYuvUpdateY)(const uint16_t* ref, const uint16_t* src,
                            uint16_t* dst, int len, int bit_depth);
void (*SharpYuvUpdateRGB)(const int16_t* ref, const int16_t* src, int16_t* dst,
                          int len);
void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
                          const uint16_t* best_y, uint16_t* out,
                          int bit_depth);

// CPU feature query hook; SharpYuvInitDsp() checks it against NULL before use.
extern VP8CPUInfo SharpYuvGetCPUInfo;
// Architecture-specific initializers, defined outside this file.
extern void InitSharpYuvSSE2(void);
extern void InitSharpYuvNEON(void);
// Selects the implementation behind each dispatch pointer: the C versions
// first (unless compiled out), then SSE2 or NEON overrides when available.
void SharpYuvInitDsp(void) {
#if !WEBP_NEON_OMIT_C_CODE
  SharpYuvUpdateY = SharpYuvUpdateY_C;
  SharpYuvUpdateRGB = SharpYuvUpdateRGB_C;
  SharpYuvFilterRow = SharpYuvFilterRow_C;
#endif

#if defined(WEBP_HAVE_SSE2)
  // Runtime detection is only possible when a CPU-info hook is installed.
  if (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kSSE2)) {
    InitSharpYuvSSE2();
  }
#endif  // WEBP_HAVE_SSE2

#if defined(WEBP_HAVE_NEON)
  // When the C code is omitted, NEON is the only implementation and is used
  // without any runtime check.
  if (WEBP_NEON_OMIT_C_CODE ||
      (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) {
    InitSharpYuvNEON();
  }
#endif  // WEBP_HAVE_NEON

  assert(SharpYuvUpdateY != NULL);
  assert(SharpYuvUpdateRGB != NULL);
  assert(SharpYuvFilterRow != NULL);
}

28
sharpyuv/sharpyuv_dsp.h Normal file
View File

@ -0,0 +1,28 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_
#define WEBP_SHARPYUV_SHARPYUV_DSP_H_
#include "sharpyuv/sharpyuv_cpu.h"
#include "src/webp/types.h"
// Dispatch pointers to the active kernel implementations; they are set by
// SharpYuvInitDsp(), which must be called before any of them is used.
// Parameter names match the implementations (reference first).

// Adds (ref[i] - src[i]) to dst[i], clamped to [0, (1 << bit_depth) - 1], and
// returns the sum of |ref[i] - src[i]| over 'len' samples.
extern uint64_t (*SharpYuvUpdateY)(const uint16_t* ref, const uint16_t* src,
                                   uint16_t* dst, int len, int bit_depth);
// Adds (ref[i] - src[i]) to dst[i] for 'len' samples, without clamping.
extern void (*SharpYuvUpdateRGB)(const int16_t* ref, const int16_t* src,
                                 int16_t* dst, int len);
// Blends rows 'A' and 'B' (len + 1 samples each), adds the result to 'best_y'
// and writes 2 * len clamped samples to 'out'.
extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
                                 const uint16_t* best_y, uint16_t* out,
                                 int bit_depth);

// Initializes the pointers above.
void SharpYuvInitDsp(void);
#endif // WEBP_SHARPYUV_SHARPYUV_DSP_H_

419
sharpyuv/sharpyuv_gamma.c Normal file
View File

@ -0,0 +1,419 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Gamma correction utilities.
#include "sharpyuv/sharpyuv_gamma.h"
#include <assert.h>
#include <float.h>
#include <math.h>
#include "src/webp/types.h"
// Gamma correction compensates loss of resolution during chroma subsampling.
// Size of pre-computed table for converting from gamma to linear.
#define GAMMA_TO_LINEAR_TAB_BITS 10
#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS)
// Two extra entries: index TAB_SIZE holds the end point, and TAB_SIZE + 1
// duplicates it so that interpolation can read one entry past the end.
static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2];
// Size of pre-computed table for converting from linear to gamma.
#define LINEAR_TO_GAMMA_TAB_BITS 9
#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS)
static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2];
// Exponent of the power-law segment used when filling the tables.
static const double kGammaF = 1. / 0.45;
// Number of fractional bits of the values stored in both tables.
#define GAMMA_TO_LINEAR_BITS 16
// Set to 1 once both tables have been filled by SharpYuvInitGammaTables().
static volatile int kGammaTablesSOk = 0;
// Fills kGammaToLinearTabS and kLinearToGammaTabS on the first call; later
// calls return immediately. Both tables store values scaled by
// 1 << GAMMA_TO_LINEAR_BITS.
void SharpYuvInitGammaTables(void) {
  const double a = 0.09929682680944;
  const double thresh = 0.018053968510807;
  const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
  const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE;
  const double a_rec = 1. / (1. + a);
  const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE;
  int idx;

  assert(GAMMA_TO_LINEAR_BITS <= 16);
  if (kGammaTablesSOk) {
    return;
  }

  // Gamma to linear: a linear segment near zero, a power law elsewhere.
  for (idx = 0; idx <= GAMMA_TO_LINEAR_TAB_SIZE; ++idx) {
    const double g = norm * idx;
    const double value =
        (g <= thresh * 4.5) ? g / 4.5 : pow(a_rec * (g + a), kGammaF);
    kGammaToLinearTabS[idx] = (uint32_t)(value * final_scale + .5);
  }
  // Duplicate the last entry so that small rounding errors cannot cause an
  // out-of-bounds read.
  kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] =
      kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE];

  // Linear to gamma: the inverse of the curve above.
  for (idx = 0; idx <= LINEAR_TO_GAMMA_TAB_SIZE; ++idx) {
    const double g = scale * idx;
    const double value =
        (g <= thresh) ? 4.5 * g : (1. + a) * pow(g, 1. / kGammaF) - a;
    kLinearToGammaTabS[idx] = (uint32_t)(final_scale * value + 0.5);
  }
  // Same guard entry as for the first table.
  kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] =
      kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE];

  kGammaTablesSOk = 1;
}
// Shifts 'v' left by 'shift' bits, or right by -shift bits if 'shift' is
// negative.
static WEBP_INLINE int Shift(int v, int shift) {
  if (shift < 0) {
    return v >> -shift;
  }
  return v << shift;
}
// Looks up 'v' in 'tab' with linear interpolation between adjacent entries.
// The top bits of 'v' (v >> tab_pos_shift_right) select the entry and the
// remaining low bits are the interpolation weight. Table values are shifted
// by 'tab_value_shift' (see Shift()) before being combined.
// Reads tab[index] and tab[index + 1].
static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
                                                    int tab_pos_shift_right,
                                                    int tab_value_shift) {
  const uint32_t index = Shift(v, -tab_pos_shift_right);
  // Fractional position between the two entries, on 'tab_pos_shift_right'
  // bits.
  const uint32_t frac = v - (index << tab_pos_shift_right);
  const uint32_t lo = Shift(tab[index], tab_value_shift);
  const uint32_t hi = Shift(tab[index + 1], tab_value_shift);
  // Note: the original code states that hi >= lo holds here.
  const uint32_t weighted = (hi - lo) * frac;
  int rounding = 0;
  if (tab_pos_shift_right > 0) {
    rounding = 1 << (tab_pos_shift_right - 1);
  }
  return lo + ((weighted + rounding) >> tab_pos_shift_right);
}
// Table-based gamma-to-linear conversion of a 'bit_depth'-bit value.
// Inputs narrower than the table index are a direct lookup; wider inputs are
// interpolated between neighbouring entries.
static uint32_t ToLinearSrgb(uint16_t v, int bit_depth) {
  const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
  return (shift > 0)
             ? kGammaToLinearTabS[v << shift]
             : FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
}
// Table-based linear-to-gamma conversion producing a 'bit_depth'-bit value.
static uint16_t FromLinearSrgb(uint32_t value, int bit_depth) {
  const int pos_shift = GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS;
  const int value_shift = bit_depth - GAMMA_TO_LINEAR_BITS;
  const uint32_t gamma = FixedPointInterpolation(value, kLinearToGammaTabS,
                                                 pos_shift, value_shift);
  return (uint16_t)gamma;
}
////////////////////////////////////////////////////////////////////////////////
// Helper macros. Each argument may be evaluated more than once, so do not
// pass expressions with side effects.
// CLAMP limits 'x' to the range [low, high].
#define CLAMP(x, low, high) \
(((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
// Rounds 'x' to the nearest integer, with halfway cases away from zero.
// Built on the double-precision ceil()/floor().
static WEBP_INLINE float Roundf(float x) {
  const double rounded = (x < 0) ? ceil((double)(x - 0.5f))
                                 : floor((double)(x + 0.5f));
  return (float)rounded;
}
// Single-precision wrapper around the double-precision pow().
static WEBP_INLINE float Powf(float base, float exp) {
  const double result = pow((double)base, (double)exp);
  return (float)result;
}
// Single-precision wrapper around the double-precision log10().
static WEBP_INLINE float Log10f(float x) {
  const double result = log10((double)x);
  return (float)result;
}
// Gamma-to-linear curve used for the BT.709-style transfer types: a linear
// segment near zero, a power law above it. The result is limited to [0, 1].
static float ToLinear709(float gamma) {
  if (gamma < 0.f) {
    return 0.f;
  }
  if (gamma < 4.5f * 0.018053968510807f) {
    return gamma / 4.5f;
  }
  if (gamma < 1.f) {
    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
  }
  return 1.f;
}
// Linear-to-gamma counterpart of ToLinear709(). The result is limited to
// [0, 1].
static float FromLinear709(float linear) {
  if (linear < 0.f) {
    return 0.f;
  }
  if (linear < 0.018053968510807f) {
    return linear * 4.5f;
  }
  if (linear < 1.f) {
    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
  }
  return 1.f;
}
// BT.470 System M: pure power law with an assumed display gamma of 2.2.
// Decoding a gamma-encoded value to linear light raises it to 2.2; encoding
// applies the inverse exponent. (The exponents were previously swapped.)
// The input is clamped to [0, 1].
static float ToLinear470M(float gamma) {
  return Powf(CLAMP(gamma, 0.f, 1.f), 2.2f);
}

static float FromLinear470M(float linear) {
  return Powf(CLAMP(linear, 0.f, 1.f), 1.f / 2.2f);
}
// BT.470 System B/G: pure power law with an assumed display gamma of 2.8.
// Decoding a gamma-encoded value to linear light raises it to 2.8; encoding
// applies the inverse exponent. (The exponents were previously swapped.)
// The input is clamped to [0, 1].
static float ToLinear470Bg(float gamma) {
  return Powf(CLAMP(gamma, 0.f, 1.f), 2.8f);
}

static float FromLinear470Bg(float linear) {
  return Powf(CLAMP(linear, 0.f, 1.f), 1.f / 2.8f);
}
// Gamma-to-linear curve for SMPTE 240: a linear segment with slope 1/4 near
// zero, a power law above it. The result is limited to [0, 1].
static float ToLinearSmpte240(float gamma) {
  if (gamma < 0.f) {
    return 0.f;
  }
  if (gamma < 4.f * 0.022821585529445f) {
    return gamma / 4.f;
  }
  if (gamma < 1.f) {
    return Powf((gamma + 0.111572195921731f) / 1.111572195921731f,
                1.f / 0.45f);
  }
  return 1.f;
}
// Linear-to-gamma counterpart of ToLinearSmpte240(). The result is limited to
// [0, 1].
static float FromLinearSmpte240(float linear) {
  if (linear < 0.f) {
    return 0.f;
  }
  if (linear < 0.022821585529445f) {
    return linear * 4.f;
  }
  if (linear < 1.f) {
    return 1.111572195921731f * Powf(linear, 0.45f) - 0.111572195921731f;
  }
  return 1.f;
}
// Logarithmic transfer (100:1 range). The encoding is
//   gamma = 1 + log10(linear) / 2   for linear >= 0.01, and 0 below,
// so decoding is linear = 10^(2 * (gamma - 1)).
// (The two function bodies were previously swapped.)
static float ToLinearLog100(float gamma) {
  // The function is non-bijective so choose the middle of [0, 0.01].
  const float mid_interval = 0.01f / 2.f;
  return (gamma <= 0.0f) ? mid_interval
                         : Powf(10.0f, 2.f * (MIN(gamma, 1.f) - 1.0f));
}

static float FromLinearLog100(float linear) {
  return (linear < 0.01f) ? 0.0f : 1.0f + Log10f(MIN(linear, 1.f)) / 2.0f;
}
// Logarithmic transfer (100 * sqrt(10) : 1 range). The encoding is
//   gamma = 1 + log10(linear) / 2.5   for linear >= 0.00316227766, 0 below,
// so decoding is linear = 10^(2.5 * (gamma - 1)).
// (The two function bodies were previously swapped.)
static float ToLinearLog100Sqrt10(float gamma) {
  // The function is non-bijective so choose the middle of [0, 0.00316227766f[.
  const float mid_interval = 0.00316227766f / 2.f;
  // '<=' so that gamma == 0 maps to the middle of the interval, as in
  // ToLinearLog100().
  return (gamma <= 0.0f) ? mid_interval
                         : Powf(10.0f, 2.5f * (MIN(gamma, 1.f) - 1.0f));
}

static float FromLinearLog100Sqrt10(float linear) {
  return (linear < 0.00316227766f) ? 0.0f
                                   : 1.0f + Log10f(MIN(linear, 1.f)) / 2.5f;
}
// Gamma-to-linear curve for IEC 61966-2-4: the BT.709-style curve extended
// symmetrically to negative values. Exact inverse of FromLinearIec61966().
static float ToLinearIec61966(float gamma) {
  if (gamma <= -4.5f * 0.018053968510807f) {
    // Mirror of the positive branch. The base passed to Powf() must stay
    // positive (a negative base with a fractional exponent yields NaN), so
    // the sign is applied to the result instead.
    return -Powf((-gamma + 0.09929682680944f) / 1.09929682680944f,
                 1.f / 0.45f);
  } else if (gamma < 4.5f * 0.018053968510807f) {
    return gamma / 4.5f;
  }
  return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
}
// Linear-to-gamma curve for IEC 61966-2-4: the BT.709-style curve extended
// symmetrically to negative values. The result is not clamped.
static float FromLinearIec61966(float linear) {
  if (linear <= -0.018053968510807f) {
    return -1.09929682680944f * Powf(-linear, 0.45f) + 0.09929682680944f;
  }
  if (linear < 0.018053968510807f) {
    return linear * 4.5f;
  }
  return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
}
// Gamma-to-linear curve for BT.1361: the BT.709-style curve for non-negative
// input plus a separate power-law segment for negative input. The result is
// limited to [-0.25, 1].
static float ToLinearBt1361(float gamma) {
  if (gamma < -0.25f) {
    return -0.25f;
  }
  if (gamma < 0.f) {
    return Powf((gamma - 0.02482420670236f) / -0.27482420670236f,
                1.f / 0.45f) /
           -4.f;
  }
  if (gamma < 4.5f * 0.018053968510807f) {
    return gamma / 4.5f;
  }
  if (gamma < 1.f) {
    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
  }
  return 1.f;
}
// Linear-to-gamma curve for BT.1361: the BT.709-style curve for non-negative
// input plus a separate power-law segment for negative input. The result is
// limited to [-0.25, 1].
static float FromLinearBt1361(float linear) {
  if (linear < -0.25f) {
    return -0.25f;
  }
  if (linear < 0.f) {
    return -0.27482420670236f * Powf(-4.f * linear, 0.45f) +
           0.02482420670236f;
  }
  if (linear < 0.018053968510807f) {
    return linear * 4.5f;
  }
  if (linear < 1.f) {
    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
  }
  return 1.f;
}
// Gamma-to-linear curve for the PQ transfer function (used for
// kSharpYuvTransferFunctionSmpte2084). Returns 0 for non-positive input.
static float ToLinearPq(float gamma) {
  if (!(gamma > 0.f)) {
    return 0.f;
  }
  {
    const float pow_gamma = Powf(gamma, 32.f / 2523.f);
    const float num = MAX(pow_gamma - 107.f / 128.f, 0.0f);
    // Bounded below by FLT_MIN to avoid dividing by zero or a negative value.
    const float den =
        MAX(2413.f / 128.f - 2392.f / 128.f * pow_gamma, FLT_MIN);
    return Powf(num / den, 4096.f / 653.f);
  }
}
// Linear-to-gamma curve for the PQ transfer function (used for
// kSharpYuvTransferFunctionSmpte2084). Returns 0 for non-positive input.
static float FromLinearPq(float linear) {
  if (!(linear > 0.f)) {
    return 0.f;
  }
  {
    const float pow_linear = Powf(linear, 653.f / 4096.f);
    const float num = 107.f / 128.f + 2413.f / 128.f * pow_linear;
    const float den = 1.0f + 2392.f / 128.f * pow_linear;
    return Powf(num / den, 2523.f / 32.f);
  }
}
// SMPTE ST 428-1. The encoding is
//   gamma = (0.91655527974030934 * linear)^(1 / 2.6),
// so decoding is linear = gamma^2.6 / 0.91655527974030934.
// (The two function bodies were previously swapped.)
// Negative input is treated as 0.
static float ToLinearSmpte428(float gamma) {
  return Powf(MAX(gamma, 0.f), 2.6f) / 0.91655527974030934f;
}

static float FromLinearSmpte428(float linear) {
  return Powf(0.91655527974030934f * MAX(linear, 0.f), 1.f / 2.6f);
}
// Conversion in BT.2100 requires RGB info. Simplify to gamma correction here.
// Gamma-to-linear curve for HLG: a square law up to 0.5 and an exponential
// above, both followed by a 1.2 power. Returns 0 for negative input.
static float ToLinearHlg(float gamma) {
  if (gamma < 0.f) {
    return 0.f;
  }
  if (gamma <= 0.5f) {
    return Powf((gamma * gamma) * (1.f / 3.f), 1.2f);
  }
  return Powf(
      (expf((gamma - 0.55991073f) / 0.17883277f) + 0.28466892f) / 12.0f,
      1.2f);
}
// Linear-to-gamma curve for HLG: undoes the 1.2 power, then applies a square
// root up to 1/12 and a logarithm above. Returns 0 for negative input.
static float FromLinearHlg(float linear) {
  // Reject negative input before calling Powf(): a negative base with a
  // fractional exponent yields NaN, which the old post-Powf() check could
  // never catch.
  if (linear < 0.f) {
    return 0.f;
  }
  linear = Powf(linear, 1.f / 1.2f);
  if (linear <= (1.f / 12.f)) {
    return sqrtf(3.f * linear);
  }
  return 0.17883277f * logf(12.f * linear - 0.28466892f) + 0.55991073f;
}
// Converts the 'bit_depth'-bit gamma-encoded value 'v' to linear using
// 'transfer_type'. sRGB goes through the precomputed tables, the linear type
// returns 'v' unchanged, and every other type is evaluated in floating point
// and scaled to [0, 65535].
uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
                               SharpYuvTransferFunctionType transfer_type) {
  float normalized, result;
  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
    return ToLinearSrgb(v, bit_depth);
  }
  if (transfer_type == kSharpYuvTransferFunctionLinear) {
    return v;
  }
  normalized = (float)v / ((1 << bit_depth) - 1);
  switch (transfer_type) {
    case kSharpYuvTransferFunctionBt709:
    case kSharpYuvTransferFunctionBt601:
    case kSharpYuvTransferFunctionBt2020_10Bit:
    case kSharpYuvTransferFunctionBt2020_12Bit:
      result = ToLinear709(normalized);
      break;
    case kSharpYuvTransferFunctionBt470M:
      result = ToLinear470M(normalized);
      break;
    case kSharpYuvTransferFunctionBt470Bg:
      result = ToLinear470Bg(normalized);
      break;
    case kSharpYuvTransferFunctionSmpte240:
      result = ToLinearSmpte240(normalized);
      break;
    case kSharpYuvTransferFunctionLog100:
      result = ToLinearLog100(normalized);
      break;
    case kSharpYuvTransferFunctionLog100_Sqrt10:
      result = ToLinearLog100Sqrt10(normalized);
      break;
    case kSharpYuvTransferFunctionIec61966:
      result = ToLinearIec61966(normalized);
      break;
    case kSharpYuvTransferFunctionBt1361:
      result = ToLinearBt1361(normalized);
      break;
    case kSharpYuvTransferFunctionSmpte2084:
      result = ToLinearPq(normalized);
      break;
    case kSharpYuvTransferFunctionSmpte428:
      result = ToLinearSmpte428(normalized);
      break;
    case kSharpYuvTransferFunctionHlg:
      result = ToLinearHlg(normalized);
      break;
    default:
      // Unknown transfer type: fails in debug builds, yields 0 otherwise.
      assert(0);
      result = 0;
      break;
  }
  return (uint32_t)Roundf(result * ((1 << 16) - 1));
}
// Converts the linear value 'v' to a 'bit_depth'-bit gamma-encoded value
// using 'transfer_type'. sRGB goes through the precomputed tables, the linear
// type returns 'v' unchanged, and every other type is evaluated in floating
// point from v / 65535 and scaled to [0, (1 << bit_depth) - 1].
uint16_t SharpYuvLinearToGamma(uint32_t v, int bit_depth,
                               SharpYuvTransferFunctionType transfer_type) {
  float normalized, encoded;
  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
    return FromLinearSrgb(v, bit_depth);
  }
  if (transfer_type == kSharpYuvTransferFunctionLinear) {
    return (uint16_t)v;
  }
  normalized = (float)v / ((1 << 16) - 1);
  switch (transfer_type) {
    case kSharpYuvTransferFunctionBt709:
    case kSharpYuvTransferFunctionBt601:
    case kSharpYuvTransferFunctionBt2020_10Bit:
    case kSharpYuvTransferFunctionBt2020_12Bit:
      encoded = FromLinear709(normalized);
      break;
    case kSharpYuvTransferFunctionBt470M:
      encoded = FromLinear470M(normalized);
      break;
    case kSharpYuvTransferFunctionBt470Bg:
      encoded = FromLinear470Bg(normalized);
      break;
    case kSharpYuvTransferFunctionSmpte240:
      encoded = FromLinearSmpte240(normalized);
      break;
    case kSharpYuvTransferFunctionLog100:
      encoded = FromLinearLog100(normalized);
      break;
    case kSharpYuvTransferFunctionLog100_Sqrt10:
      encoded = FromLinearLog100Sqrt10(normalized);
      break;
    case kSharpYuvTransferFunctionIec61966:
      encoded = FromLinearIec61966(normalized);
      break;
    case kSharpYuvTransferFunctionBt1361:
      encoded = FromLinearBt1361(normalized);
      break;
    case kSharpYuvTransferFunctionSmpte2084:
      encoded = FromLinearPq(normalized);
      break;
    case kSharpYuvTransferFunctionSmpte428:
      encoded = FromLinearSmpte428(normalized);
      break;
    case kSharpYuvTransferFunctionHlg:
      encoded = FromLinearHlg(normalized);
      break;
    default:
      // Unknown transfer type: fails in debug builds, yields 0 otherwise.
      assert(0);
      encoded = 0;
      break;
  }
  return (uint16_t)Roundf(encoded * ((1 << bit_depth) - 1));
}

38
sharpyuv/sharpyuv_gamma.h Normal file
View File

@ -0,0 +1,38 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Gamma correction utilities.
#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
#include "sharpyuv/sharpyuv.h"
#include "src/webp/types.h"
#ifdef __cplusplus
extern "C" {
#endif
// Initializes precomputed tables. Must be called once before calling
// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
// Calling it again after the tables are filled is a no-op.
void SharpYuvInitGammaTables(void);
// Converts a 'bit_depth'-bit gamma color value to a 16-bit linear value.
// With kSharpYuvTransferFunctionLinear, 'v' is returned unchanged.
uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
SharpYuvTransferFunctionType transfer_type);
// Converts a 16-bit linear color value to a 'bit_depth'-bit gamma value.
// With kSharpYuvTransferFunctionLinear, 'value' is returned unchanged.
uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth,
SharpYuvTransferFunctionType transfer_type);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_

181
sharpyuv/sharpyuv_neon.c Normal file
View File

@ -0,0 +1,181 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv_dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <stdlib.h>
#include <arm_neon.h>
// Clamps 'v' to the range [0, max]. Used by the scalar tail loops.
static uint16_t clip_NEON(int v, int max) {
  if (v < 0) {
    return 0;
  }
  if (v > max) {
    return max;
  }
  return (uint16_t)v;
}
// NEON version of the luma update: adds (ref - src) to 'dst', clamps each
// result to [0, (1 << bit_depth) - 1] and returns the sum of |ref - src|.
// Eight samples are handled per vector iteration; the remainder is processed
// by the scalar loop at the end.
static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(max_y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
// Samples are reinterpreted as signed 16-bit lanes.
// NOTE(review): assumes values are small enough that these 16-bit
// subtractions and additions cannot wrap -- confirm against callers.
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
// Widen the absolute differences pairwise (16 -> 32 -> 64 bits) while
// accumulating.
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
// Scalar tail for the remaining (len % 8) samples.
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_NEON(new_y, max_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
// NEON version of the RGB/UV update: adds (ref - src) to 'dst' for 'len'
// samples, without clamping. Eight samples per vector iteration, scalar tail
// for the rest.
static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
// Scalar tail for the remaining (len % 8) samples.
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
// Row filter using 16-bit intermediate lanes; SharpYuvFilterRow_NEON() selects
// it for bit_depth <= 10. Blends A[i], A[i + 1], B[i], B[i + 1] with weights
// 9/3/3/1 (out of 16), adds the result to 'best_y' and stores the values
// clamped to [0, (1 << bit_depth) - 1] in 'out'.
// Reads len + 1 entries of 'A' and 'B'; writes 2 * len entries of 'out'.
static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const int16x8_t max = vdupq_n_s16(max_y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
// The blend is split in two steps: a shift by 3 here, then a halving add
// with a0/a1 below.
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
// vrhaddq is a rounding halving add, so no explicit '+ 8' is needed.
const int16x8_t e0 = vrhaddq_s16(c1, a0);
const int16x8_t e1 = vrhaddq_s16(c0, a1);
// Interleave the even (e0) and odd (e1) output samples.
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
// Scalar tail:
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
}
}
// Row filter using 32-bit intermediate lanes; SharpYuvFilterRow_NEON() selects
// it for bit_depth > 10. Same computation as the 16-bit variant, four
// positions (eight output samples) per vector iteration.
// Reads len + 1 entries of 'A' and 'B'; writes 2 * len entries of 'out'.
static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const uint16x8_t max = vdupq_n_u16(max_y);
for (i = 0; i + 4 <= len; i += 4) {
const int16x4_t a0 = vld1_s16(A + i + 0);
const int16x4_t a1 = vld1_s16(A + i + 1);
const int16x4_t b0 = vld1_s16(B + i + 0);
const int16x4_t b1 = vld1_s16(B + i + 1);
// Widening adds: the sums are kept on 32 bits.
const int32x4_t a0b1 = vaddl_s16(a0, b1);
const int32x4_t a1b0 = vaddl_s16(a1, b0);
const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0); // A0+A1+B0+B1
const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1); // 2*(A0+B1)
const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0); // 2*(A1+B0)
const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
// Rounding halving add with the widened a0/a1.
const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
// Interleave the even (e0) and odd (e1) output samples.
const int32x4x2_t f = vzipq_s32(e0, e1);
const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
// vqmovun saturates negative values to 0 while narrowing to unsigned 16
// bits; the upper bound is applied by the vminq below.
const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
const uint16x8_t i_clamped = vminq_u16(i_16, max);
vst1q_u16(out + 2 * i + 0, i_clamped);
}
// Scalar tail for the remaining (len % 4) positions.
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
}
}
// Dispatches to the 16-bit-lane filter for bit depths up to 10 and to the
// 32-bit-lane filter for larger bit depths.
static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out,
                                   int bit_depth) {
  if (bit_depth > 10) {
    SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
    return;
  }
  SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
}
//------------------------------------------------------------------------------
// Points the dispatch function pointers at the NEON implementations.
extern void InitSharpYuvNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
SharpYuvUpdateY = SharpYuvUpdateY_NEON;
SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
SharpYuvFilterRow = SharpYuvFilterRow_NEON;
}
#else // !WEBP_USE_NEON
// Empty stub so that the symbol exists when NEON support is not compiled in.
extern void InitSharpYuvNEON(void);
void InitSharpYuvNEON(void) {}
#endif // WEBP_USE_NEON

201
sharpyuv/sharpyuv_sse2.c Normal file
View File

@ -0,0 +1,201 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <stdlib.h>
#include <emmintrin.h>
// Clamps 'v' to the range [0, max]. Used by the scalar tail loops.
static uint16_t clip_SSE2(int v, int max) {
  if (v < 0) {
    return 0;
  }
  if (v > max) {
    return max;
  }
  return (uint16_t)v;
}
// SSE2 version of the luma update: adds (ref - src) to 'dst', clamps each
// result to [0, (1 << bit_depth) - 1] and returns the sum of |ref - src|.
// Eight samples are handled per vector iteration; the remainder is processed
// by the scalar loop at the end.
static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
uint64_t diff = 0;
uint32_t tmp[4];
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i max = _mm_set1_epi16(max_y);
const __m128i one = _mm_set1_epi16(1);
__m128i sum = zero;
for (i = 0; i + 8 <= len; i += 8) {
// NOTE(review): samples are processed as signed 16-bit lanes; assumes the
// values are small enough that these operations cannot wrap -- confirm.
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_y
const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
const __m128i F = _mm_add_epi16(C, D); // new_y
const __m128i G = _mm_or_si128(E, one); // -1 or 1
const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
// Multiplying each difference by its sign gives the absolute value; madd
// also sums adjacent pairs into 32-bit lanes.
const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
_mm_storeu_si128((__m128i*)(dst + i), H);
sum = _mm_add_epi32(sum, I);
}
// Horizontal sum of the four 32-bit partial sums.
// NOTE(review): the addition is performed on 32 bits before being widened
// to 'diff' -- confirm this cannot overflow for the supported row lengths.
_mm_storeu_si128((__m128i*)tmp, sum);
diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
// Scalar tail for the remaining (len % 8) samples.
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_SSE2(new_y, max_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
// SSE2 version of the RGB/UV update: adds (ref - src) to 'dst' for 'len'
// samples, without clamping. Eight samples per vector iteration, scalar tail
// for the rest.
static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i = 0;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_uv
const __m128i E = _mm_add_epi16(C, D); // new_uv
_mm_storeu_si128((__m128i*)(dst + i), E);
}
// Scalar tail for the remaining (len % 8) samples.
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
// Row filter using 16-bit intermediate lanes. Blends A[i], A[i + 1], B[i],
// B[i + 1] with weights 9/3/3/1 (out of 16, rounded), adds the result to
// 'best_y' and stores the values clamped to [0, (1 << bit_depth) - 1] in
// 'out'. Eight positions (sixteen output samples) per vector iteration.
// Reads len + 1 entries of 'A' and 'B'; writes 2 * len entries of 'out'.
static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const __m128i kCst8 = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(max_y);
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 8 <= len; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
const __m128i a0b1 = _mm_add_epi16(a0, b1);
const __m128i a1b0 = _mm_add_epi16(a1, b0);
const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
// The shift by 4 is split in two steps (>> 3, add a0/a1, then >> 1).
const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
const __m128i d0 = _mm_add_epi16(c1, a0);
const __m128i d1 = _mm_add_epi16(c0, a1);
const __m128i e0 = _mm_srai_epi16(d0, 1);
const __m128i e1 = _mm_srai_epi16(d1, 1);
// Interleave the even (e0) and odd (e1) output samples.
const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
const __m128i h0 = _mm_add_epi16(g0, f0);
const __m128i h1 = _mm_add_epi16(g1, f1);
const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
_mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
}
// Scalar tail for the remaining (len % 8) positions.
for (; i < len; ++i) {
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
// We reuse the common sub-expressions.
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
}
}
// Sign-extends the four low 16-bit lanes of 'in' to four 32-bit lanes.
static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
  // Interleaving 'in' with itself puts each 16-bit value in the upper half of
  // a 32-bit lane; the arithmetic shift then brings it down with its sign.
  const __m128i duplicated = _mm_unpacklo_epi16(in, in);
  return _mm_srai_epi32(duplicated, 16);
}
// Same filter as the 16-bit variant, but the taps are widened to 32 bits
// before being combined:
//   out[2x + 0] = clip(best_y[2x + 0] +
//                      ((9 * A[x] + 3 * A[x+1] + 3 * B[x] + B[x+1] + 8) >> 4))
//   out[2x + 1] = clip(best_y[2x + 1] +
//                      ((9 * A[x+1] + 3 * A[x] + 3 * B[x+1] + B[x] + 8) >> 4))
// where clip() restricts the value to [0, (1 << bit_depth) - 1].
//
// 'A' and 'B' are read up to index 'len' (inclusive), so each must provide
// len + 1 samples. 'best_y' and 'out' are accessed on 2 * len samples.
static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  const __m128i rounding = _mm_set1_epi32(8);
  const __m128i upper = _mm_set1_epi16((short)max_y);
  const __m128i lower = _mm_setzero_si128();
  int x = 0;

  // Vector part: 4 input positions -> 8 output samples per iteration.
  while (x + 4 <= len) {
    // Load four 16-bit samples (64 bits) and widen them to 32 bits.
    const __m128i cur_a =
        s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + x + 0)));
    const __m128i next_a =
        s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + x + 1)));
    const __m128i cur_b =
        s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + x + 0)));
    const __m128i next_b =
        s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + x + 1)));

    const __m128i diag = _mm_add_epi32(cur_a, next_b);  // A0 + B1
    const __m128i anti = _mm_add_epi32(next_a, cur_b);  // A1 + B0
    // A0 + A1 + B0 + B1 + 8
    const __m128i total =
        _mm_add_epi32(_mm_add_epi32(diag, anti), rounding);

    // The >> 4 of the scalar formula is split in two steps (>> 3, then >> 1
    // after adding the dominant tap).
    // (2 * (A1 + B0) + total) >> 3, used for the even outputs.
    const __m128i part_even =
        _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(anti, anti), total), 3);
    // (2 * (A0 + B1) + total) >> 3, used for the odd outputs.
    const __m128i part_odd =
        _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(diag, diag), total), 3);
    const __m128i v_even = _mm_srai_epi32(_mm_add_epi32(part_even, cur_a), 1);
    const __m128i v_odd = _mm_srai_epi32(_mm_add_epi32(part_odd, next_a), 1);

    // Interleave even/odd results, then narrow back to 16 bits with signed
    // saturation.
    const __m128i mixed_lo = _mm_unpacklo_epi32(v_even, v_odd);
    const __m128i mixed_hi = _mm_unpackhi_epi32(v_even, v_odd);
    const __m128i narrowed = _mm_packs_epi32(mixed_lo, mixed_hi);

    const __m128i y = _mm_loadu_si128((const __m128i*)(best_y + 2 * x + 0));
    const __m128i sum = _mm_add_epi16(y, narrowed);
    // Clamp to [0, max_y] using signed 16-bit min/max.
    const __m128i clamped = _mm_max_epi16(_mm_min_epi16(sum, upper), lower);
    _mm_storeu_si128((__m128i*)(out + 2 * x + 0), clamped);
    x += 4;
  }

  // Scalar part for the remaining positions.
  // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 is evaluated as
  // (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 so that the
  // shared sub-expressions are computed only once.
  for (; x < len; ++x) {
    const int diag = A[x + 0] + B[x + 1];
    const int anti = A[x + 1] + B[x + 0];
    const int total = diag + anti + 8;
    const int v_even = (8 * A[x + 0] + 2 * anti + total) >> 4;
    const int v_odd = (8 * A[x + 1] + 2 * diag + total) >> 4;
    out[2 * x + 0] = clip_SSE2(best_y[2 * x + 0] + v_even, max_y);
    out[2 * x + 1] = clip_SSE2(best_y[2 * x + 1] + v_odd, max_y);
  }
}
// Entry point for the SSE2 row filter: picks the implementation according to
// 'bit_depth' and forwards all arguments unchanged.
//   bit_depth <= 10 : 16-bit intermediate arithmetic.
//   bit_depth  > 10 : 32-bit intermediate arithmetic.
// NOTE(review): the threshold is presumably chosen so that the 16-bit sums
// cannot overflow -- confirm against the value range of 'A' and 'B'.
static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out,
                                   int bit_depth) {
  if (bit_depth > 10) {
    SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
    return;
  }
  SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
}
//------------------------------------------------------------------------------
// Entry point from the dsp init.
extern void InitSharpYuvSSE2(void);

// Points the SharpYuv* hooks at the SSE2 implementations from this file.
// The three assignments are independent of each other.
WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
  SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
  SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
  SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
}
#else // !WEBP_USE_SSE2
// Stub used when WEBP_USE_SSE2 is not defined: the symbol still exists so
// callers can reference it, but it installs nothing.
extern void InitSharpYuvSSE2(void);
void InitSharpYuvSSE2(void) {}
#endif // WEBP_USE_SSE2