From 54a57896b65f69f3422fcda27f22980d73b65ac6 Mon Sep 17 00:00:00 2001
From: Green Sky <green@g-s.xyz>
Date: Wed, 2 Oct 2024 12:42:17 +0200
Subject: [PATCH] sdl video push conversion stream and toxav video sink

---
 src/CMakeLists.txt                            |   2 +
 .../sdl/video_push_converter.cpp              |  58 +++++++++
 .../sdl/video_push_converter.hpp              |  89 ++++++++++++++
 src/tox_av_voip_model.cpp                     | 113 +++++++++++++++++-
 4 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 src/frame_streams/sdl/video_push_converter.cpp
 create mode 100644 src/frame_streams/sdl/video_push_converter.hpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 036ab969..4b7d1603 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -115,6 +115,8 @@ target_sources(tomato PUBLIC
 	./frame_streams/sdl/sdl_audio2_frame_stream2.hpp
 	./frame_streams/sdl/sdl_audio2_frame_stream2.cpp
 	./frame_streams/sdl/video.hpp
+	./frame_streams/sdl/video_push_converter.hpp
+	./frame_streams/sdl/video_push_converter.cpp
 
 	./stream_manager_ui.hpp
 	./stream_manager_ui.cpp
diff --git a/src/frame_streams/sdl/video_push_converter.cpp b/src/frame_streams/sdl/video_push_converter.cpp
new file mode 100644
index 00000000..df5d71ac
--- /dev/null
+++ b/src/frame_streams/sdl/video_push_converter.cpp
@@ -0,0 +1,58 @@
+#include "./video_push_converter.hpp"
+
+// Converts a packed YUY2 (4:2:2) surface into a newly created planar IYUV
+// (4:2:0) surface: luma is copied, vertically paired chroma rows are averaged.
+// Returns nullptr on failure (null/non-YUY2 input, odd width, or surface
+// creation failure); otherwise the caller has to SDL_DestroySurface() the result.
+SDL_Surface* convertYUY2_IYUV(SDL_Surface* surf) {
+	if (surf == nullptr || surf->format != SDL_PIXELFORMAT_YUY2) {
+		return nullptr;
+	}
+	if ((surf->w % 2) != 0) {
+		// a YUY2 macropixel covers 2 pixels, odd widths are not handled
+		SDL_SetError("YUY2->IYUV does not support odd widths");
+		return nullptr;
+	}
+
+	SDL_Surface* conv_surf = SDL_CreateSurface(surf->w, surf->h, SDL_PIXELFORMAT_IYUV);
+	if (conv_surf == nullptr) {
+		return nullptr; // creation failed, there is nothing to lock or fill
+	}
+	SDL_LockSurface(surf);
+	SDL_LockSurface(conv_surf);
+
+	// source: one plane, every 2 pixels are packed as Y0 U0 Y1 V0
+	// destination: Y plane, then U plane, then V plane, written tightly packed
+	const size_t width = static_cast<size_t>(surf->w);
+	const size_t height = static_cast<size_t>(surf->h);
+	const size_t chroma_w = width / 2;
+	const size_t chroma_h = (height + 1) / 2; // rounded up, an odd last row gets its own chroma row
+
+	uint8_t* y_plane = static_cast<uint8_t*>(conv_surf->pixels);
+	uint8_t* u_plane = y_plane + width * height;
+	uint8_t* v_plane = u_plane + chroma_w * chroma_h;
+
+	const uint8_t* yuy2_data = static_cast<const uint8_t*>(surf->pixels);
+	const size_t src_pitch = static_cast<size_t>(surf->pitch); // source rows may be padded
+
+	for (size_t y = 0; y < height; y++) {
+		const uint8_t* src_row = yuy2_data + y * src_pitch;
+		uint8_t* y_row = y_plane + y * width;
+		uint8_t* u_row = u_plane + (y / 2) * chroma_w;
+		uint8_t* v_row = v_plane + (y / 2) * chroma_w;
+		const bool first_of_pair = (y % 2) == 0;
+		for (size_t x = 0; x < width; x += 2) {
+			const uint8_t* px = src_row + x * 2; // every pixel uses 2 bytes
+			y_row[x] = px[0];
+			y_row[x + 1] = px[2];
+			// even rows store the chroma, odd rows mix with the stored value
+			u_row[x / 2] = first_of_pair ? px[1] : static_cast<uint8_t>((int(u_row[x / 2]) + int(px[1])) / 2);
+			v_row[x / 2] = first_of_pair ? px[3] : static_cast<uint8_t>((int(v_row[x / 2]) + int(px[3])) / 2);
+		}
+	}
+
+	SDL_UnlockSurface(conv_surf);
+	SDL_UnlockSurface(surf);
+	return conv_surf;
+}
+
diff --git a/src/frame_streams/sdl/video_push_converter.hpp b/src/frame_streams/sdl/video_push_converter.hpp
new file mode 100644
index 00000000..4a26be95
--- /dev/null
+++ b/src/frame_streams/sdl/video_push_converter.hpp
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "./video.hpp"
+#include "../frame_stream2.hpp"
+
+#include <cassert>
+
+#include <iostream> // meh
+
+// Returns true if f is one of the YUV pixel formats listed below.
+// inline instead of static: this definition lives in a header, so all
+// including translation units share it instead of each getting its own copy.
+inline bool isFormatYUV(SDL_PixelFormat f) {
+	return
+		f == SDL_PIXELFORMAT_YV12 || f == SDL_PIXELFORMAT_IYUV ||
+		f == SDL_PIXELFORMAT_YUY2 || f == SDL_PIXELFORMAT_UYVY ||
+		f == SDL_PIXELFORMAT_YVYU || f == SDL_PIXELFORMAT_NV12 ||
+		f == SDL_PIXELFORMAT_NV21 ||
+		f == SDL_PIXELFORMAT_P010
+	;
+}
+
+SDL_Surface* convertYUY2_IYUV(SDL_Surface* surf); // defined in video_push_converter.cpp; returns nullptr for non-YUY2 input or odd widths
+
+// Stream wrapper that converts every pushed frame to _forced_format before
+// passing it on to RealStream::push(). Frames that already have that format
+// are forwarded untouched.
+template<typename RealStream>
+struct PushConversionVideoStream : public RealStream {
+	// pixel format every pushed frame gets converted to
+	SDL_PixelFormat _forced_format {SDL_PIXELFORMAT_IYUV};
+
+	// forced_format: target pixel format, args: forwarded to the RealStream ctor
+	template<typename... Args>
+	PushConversionVideoStream(SDL_PixelFormat forced_format, Args&&... args) : RealStream(std::forward<Args>(args)...), _forced_format(forced_format) {}
+
+	// Returns false if the frame carries no surface or the conversion failed,
+	// otherwise the result of RealStream::push().
+	bool push(const SDLVideoFrame& value) override {
+		SDL_Surface* const src = value.surface.get();
+		if (src == nullptr) {
+			// nothing to inspect or convert
+			std::cerr << "DTC error: pushed frame has no surface\n";
+			return false;
+		}
+		if (src->format == _forced_format) {
+			return RealStream::push(value); // already in the right format
+		}
+
+		// converted copy, owned here until it is wrapped into the new frame
+		SDL_Surface* conv = nullptr;
+		if (src->format == SDL_PIXELFORMAT_YUY2 && _forced_format == SDL_PIXELFORMAT_IYUV) {
+			// optimized custom impl (3ms), returns nullptr eg. for odd widths
+			conv = convertYUY2_IYUV(src);
+		}
+
+		// SDL conversion, also the fallback if the custom impl gave up
+		if (conv == nullptr) {
+			if (isFormatYUV(src->format)) {
+				// TODO: fix sdl rgb->yuv conversion resulting in too dark (colorspace) issues
+				// https://github.com/libsdl-org/SDL/issues/10877
+				// meh, need to convert to rgb as a stopgap (->RGB24 1ms, ->forced format 60ms)
+				SDL_Surface* tmp_conv_surf = SDL_ConvertSurfaceAndColorspace(src, SDL_PIXELFORMAT_RGB24, nullptr, SDL_COLORSPACE_RGB_DEFAULT, 0);
+				if (tmp_conv_surf != nullptr) {
+					conv = SDL_ConvertSurfaceAndColorspace(tmp_conv_surf, _forced_format, nullptr, SDL_COLORSPACE_YUV_DEFAULT, 0);
+					SDL_DestroySurface(tmp_conv_surf);
+				}
+			} else {
+				conv = SDL_ConvertSurface(src, _forced_format);
+			}
+		}
+
+		if (conv == nullptr) {
+			// oh god
+			std::cerr << "DTC error: failed to convert surface to " << SDL_GetPixelFormatName(_forced_format) << ": " << SDL_GetError() << "\n";
+			return false;
+		}
+
+		// TODO: add ctr with uptr
+		// the new frame takes over conv and destroys it with SDL_DestroySurface
+		SDLVideoFrame new_value{value.timestampUS, nullptr};
+		new_value.surface = {
+			conv,
+			&SDL_DestroySurface
+		};
+		return RealStream::push(std::move(new_value));
+	}
+};
+
diff --git a/src/tox_av_voip_model.cpp b/src/tox_av_voip_model.cpp
index 249f8b6c..48f460bd 100644
--- a/src/tox_av_voip_model.cpp
+++ b/src/tox_av_voip_model.cpp
@@ -10,6 +10,7 @@
 #include "./frame_streams/audio_stream_pop_reframer.hpp"
 
 #include "./frame_streams/sdl/video.hpp"
+#include "./frame_streams/sdl/video_push_converter.hpp"
 
 #include <cstring>
 
@@ -96,6 +97,63 @@ struct ToxAVCallAudioSink : public FrameStream2SinkI<AudioFrame2> {
 	}
 };
 
+// exclusive: hands out at most one writer stream at a time
+struct ToxAVCallVideoSink : public FrameStream2SinkI<SDLVideoFrame> {
+	using stream_type = PushConversionVideoStream<LockedFrameStream2<SDLVideoFrame>>;
+	ToxAVI& _toxav;
+
+	// bitrate requested while a subscriber exists
+	uint32_t _video_bitrate {2}; // HACK: hardcode to 2mbits (toxav wrongly multiplies internally by 1000)
+
+	uint32_t _fid;
+	std::shared_ptr<stream_type> _writer;
+
+	ToxAVCallVideoSink(ToxAVI& toxav, uint32_t fid) : _toxav(toxav), _fid(fid) {}
+
+	// drops a still active writer and sets the video bitrate back to 0
+	~ToxAVCallVideoSink(void) {
+		if (_writer == nullptr) {
+			return;
+		}
+		_writer.reset();
+		_toxav.toxavVideoSetBitRate(_fid, 0);
+	}
+
+	// sink
+	// returns nullptr if a writer already exists or the bitrate could not be set
+	std::shared_ptr<FrameStream2I<SDLVideoFrame>> subscribe(void) override {
+		// max 1 (exclusive, composite video somewhere else)
+		if (_writer != nullptr) {
+			return nullptr;
+		}
+
+		if (_toxav.toxavVideoSetBitRate(_fid, _video_bitrate) != TOXAV_ERR_BIT_RATE_SET_OK) {
+			return nullptr;
+		}
+
+		// toxav needs I420
+		_writer = std::make_shared<stream_type>(SDL_PIXELFORMAT_IYUV);
+		return _writer;
+	}
+
+	// returns true only if sub is the writer handed out by subscribe()
+	bool unsubscribe(const std::shared_ptr<FrameStream2I<SDLVideoFrame>>& sub) override {
+		if (sub == nullptr || _writer == nullptr) {
+			return false; // nah
+		}
+		if (sub != _writer) {
+			return false; // what
+		}
+
+		_writer.reset();
+
+		// print warning? on error?
+		_toxav.toxavVideoSetBitRate(_fid, 0);
+
+		return true;
+	}
+};
+
 void ToxAVVoIPModel::addAudioSource(ObjectHandle session, uint32_t friend_number) {
 	auto& stream_source = session.get_or_emplace<Components::VoIP::StreamSources>().streams;
 
@@ -157,7 +215,7 @@ void ToxAVVoIPModel::addVideoSource(ObjectHandle session, uint32_t friend_number
 
 	if (
 		const auto* defaults = session.try_get<Components::VoIP::DefaultConfig>();
-		defaults != nullptr && defaults->incoming_audio
+		defaults != nullptr && defaults->incoming_video
 	) {
 		incoming_video.emplace<Components::TagConnectToDefault>(); // depends on what was specified in enter()
 	}
@@ -172,6 +230,26 @@ void ToxAVVoIPModel::addVideoSource(ObjectHandle session, uint32_t friend_number
 }
 
 void ToxAVVoIPModel::addVideoSink(ObjectHandle session, uint32_t friend_number) {
+	// registers a new object carrying the outgoing video sink for friend_number
+	auto& session_sinks = session.get_or_emplace<Components::VoIP::StreamSinks>().streams;
+	ObjectHandle sink_obj {_os.registry(), _os.registry().create()};
+
+	auto sink_impl = std::make_unique<ToxAVCallVideoSink>(_av, friend_number);
+	ToxAVCallVideoSink* const sink_raw = sink_impl.get(); // grab before ownership moves
+	sink_obj.emplace<ToxAVCallVideoSink*>(sink_raw);
+	sink_obj.emplace<Components::FrameStream2Sink<SDLVideoFrame>>(std::move(sink_impl));
+	sink_obj.emplace<Components::StreamSink>(Components::StreamSink::create<SDLVideoFrame>("ToxAV Friend Call Outgoing Video"));
+
+	// depends on what was specified in enter()
+	const auto* cfg = session.try_get<Components::VoIP::DefaultConfig>();
+	if (cfg != nullptr && cfg->outgoing_video) {
+		sink_obj.emplace<Components::TagConnectToDefault>();
+	}
+
+	session_sinks.push_back(sink_obj);
+	session.emplace<Components::ToxAVVideoSink>(sink_obj);
+	// TODO: tie session to stream
+	_os.throwEventConstruct(sink_obj);
 }
 
 void ToxAVVoIPModel::destroySession(ObjectHandle session) {
@@ -288,6 +366,39 @@ void ToxAVVoIPModel::tick(void) {
 			}
 		}
 	}
+
+	for (const auto& [oc, vsink] : _os.registry().view<ToxAVCallVideoSink*>().each()) {
+		if (!vsink->_writer) {
+			continue;
+		}
+
+		for (size_t i = 0; i < 10; i++) {
+			auto new_frame_opt = vsink->_writer->pop();
+			if (!new_frame_opt.has_value()) {
+				break;
+			}
+			const auto& new_frame = new_frame_opt.value();
+
+			if (!new_frame.surface) {
+				// wtf?
+				continue;
+			}
+
+			// conversion is done in the sink's stream
+			SDL_Surface* surf = new_frame.surface.get();
+			assert(surf != nullptr);
+
+			SDL_LockSurface(surf);
+			_av.toxavVideoSendFrame(
+				vsink->_fid,
+				surf->w, surf->h,
+				static_cast<const uint8_t*>(surf->pixels),
+				static_cast<const uint8_t*>(surf->pixels) + surf->w * surf->h,
+				static_cast<const uint8_t*>(surf->pixels) + surf->w * surf->h + (surf->w/2) * (surf->h/2)
+			);
+			SDL_UnlockSurface(surf);
+		}
+	}
 }
 
 ObjectHandle ToxAVVoIPModel::enter(const Contact3 c, const Components::VoIP::DefaultConfig& defaults) {