diff --git a/CMakeLists.txt b/CMakeLists.txt
index 60c98f2..deb61cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,9 @@ add_library(solanaceae_sha1_ngcft1
 	./solanaceae/ngc_ft1_sha1/file_constructor.hpp
 	./solanaceae/ngc_ft1_sha1/file_constructor.cpp
 
+	./solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.hpp
+	./solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.cpp
+
 	./solanaceae/ngc_ft1_sha1/hash_utils.hpp
 	./solanaceae/ngc_ft1_sha1/hash_utils.cpp
 
diff --git a/solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.cpp b/solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.cpp
new file mode 100644
index 0000000..b652921
--- /dev/null
+++ b/solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.cpp
@@ -0,0 +1,253 @@
+#include "./sha1_mapped_filesystem.hpp"
+
+#include <solanaceae/object_store/meta_components.hpp>
+
+#include "../file_constructor.hpp"
+#include "../ft1_sha1_info.hpp"
+#include "../hash_utils.hpp"
+#include "../components.hpp"
+
+#include <solanaceae/util/utils.hpp>
+
+#include <atomic>
+#include <mutex>
+#include <list>
+#include <thread>
+
+#include <iostream>
+
+namespace Backends {
+
+struct SHA1MappedFilesystem_InfoBuilderState {
+	std::atomic_bool info_builder_dirty {false};
+	std::mutex info_builder_queue_mutex;
+	using InfoBuilderEntry = std::function<void(void)>;
+	std::list<InfoBuilderEntry> info_builder_queue;
+};
+
+SHA1MappedFilesystem::SHA1MappedFilesystem(
+	ObjectStore2& os
+) : StorageBackendI::StorageBackendI(os), _ibs(std::make_unique<SHA1MappedFilesystem_InfoBuilderState>()) {
+}
+
+SHA1MappedFilesystem::~SHA1MappedFilesystem(void) {
+}
+
+void SHA1MappedFilesystem::tick(void) {
+	if (_ibs->info_builder_dirty) {
+		std::lock_guard l{_ibs->info_builder_queue_mutex};
+		_ibs->info_builder_dirty = false; // set while holding lock
+
+		for (auto& it : _ibs->info_builder_queue) {
+			it();
+		}
+		_ibs->info_builder_queue.clear();
+	}
+}
+
+ObjectHandle SHA1MappedFilesystem::newObject(ByteSpan id) {
+	ObjectHandle o{_os.registry(), _os.registry().create()};
+
+	o.emplace<ObjComp::Ephemeral::Backend>(this);
+	o.emplace<ObjComp::ID>(std::vector<uint8_t>{id});
+	//o.emplace<ObjComp::Ephemeral::FilePath>(object_file_path.generic_u8string());
+
+	_os.throwEventConstruct(o);
+
+	return o;
+}
+
+void SHA1MappedFilesystem::newFromFile(std::string_view file_name, std::string_view file_path, std::function<void(ObjectHandle o)>&& cb) {
+	std::thread(std::move([
+		this,
+		ibs = _ibs.get(),
+		cb = std::move(cb),
+		file_name_ = std::string(file_name),
+		file_path_ = std::string(file_path)
+	]() mutable {
+		// 0. open and fail
+		std::unique_ptr<File2I> file_impl = construct_file2_rw_mapped(file_path_, -1);
+		if (!file_impl->isGood()) {
+			{
+				std::lock_guard l{ibs->info_builder_queue_mutex};
+				ibs->info_builder_queue.push_back([file_path_](){
+					// back on iterate thread
+
+					std::cerr << "SHA1MF error: failed opening file '" << file_path_ << "'!\n";
+				});
+				ibs->info_builder_dirty = true; // still in scope, set before mutex unlock
+			}
+			return;
+		}
+
+		// 1. build info by hashing all chunks
+		FT1InfoSHA1 sha1_info;
+		// build info
+		sha1_info.file_name = file_name_;
+		sha1_info.file_size = file_impl->_file_size; // TODO: remove the reliance on implementation details
+
+		{ // build chunks
+			// HACK: load file fully
+			// ... it's only a hack if it's not memory mapped, but reading in chunk_sized chunks is probably a good idea anyway
+			const auto file_data = file_impl->read(file_impl->_file_size, 0);
+			size_t i = 0;
+			for (; i + sha1_info.chunk_size < file_data.size; i += sha1_info.chunk_size) {
+				sha1_info.chunks.push_back(hash_sha1(file_data.ptr+i, sha1_info.chunk_size));
+			}
+
+			if (i < file_data.size) {
+				sha1_info.chunks.push_back(hash_sha1(file_data.ptr+i, file_data.size-i));
+			}
+		}
+
+		file_impl.reset();
+
+		std::lock_guard l{ibs->info_builder_queue_mutex};
+		ibs->info_builder_queue.push_back(std::move([
+			this,
+			file_name_,
+			file_path_,
+			sha1_info = std::move(sha1_info),
+			cb = std::move(cb)
+		]() mutable { //
+			// executed on iterate thread
+
+			// reopen, can't move, since std::function needs to be copy constructible (meh)
+			std::unique_ptr<File2I> file_impl = construct_file2_rw_mapped(file_path_, sha1_info.file_size);
+			if (!file_impl->isGood()) {
+				std::cerr << "SHA1MF error: failed opening file '" << file_path_ << "'!\n";
+				return;
+			}
+
+			// 2. hash info
+			std::vector<uint8_t> sha1_info_data;
+			std::vector<uint8_t> sha1_info_hash;
+
+			std::cout << "SHA1MF info is: \n" << sha1_info;
+			sha1_info_data = sha1_info.toBuffer();
+			std::cout << "SHA1MF sha1_info size: " << sha1_info_data.size() << "\n";
+			sha1_info_hash = hash_sha1(sha1_info_data.data(), sha1_info_data.size());
+			std::cout << "SHA1MF sha1_info_hash: " << bin2hex(sha1_info_hash) << "\n";
+
+			ObjectHandle o;
+			// check if content exists
+			// TODO: store "info_to_content" in reg/backend, for better lookup speed
+			// fine for now, because this is rare
+			for (const auto& [it_ov, it_ih] : _os.registry().view<Components::FT1InfoSHA1Hash>().each()) {
+				if (it_ih.hash == sha1_info_hash) {
+					o = {_os.registry(), it_ov};
+				}
+			}
+			//if (self->info_to_content.count(sha1_info_hash)) {
+			if (static_cast<bool>(o)) {
+				//ce = self->info_to_content.at(sha1_info_hash);
+
+				// TODO: check if content is incomplete and use file instead
+				if (!o.all_of<Components::FT1InfoSHA1>()) {
+					o.emplace<Components::FT1InfoSHA1>(sha1_info);
+				}
+				if (!o.all_of<Components::FT1InfoSHA1Data>()) {
+					o.emplace<Components::FT1InfoSHA1Data>(sha1_info_data);
+				}
+
+				// hash has to be set already
+				// Components::FT1InfoSHA1Hash
+
+				{ // lookup tables and have
+					auto& cc = o.get_or_emplace<Components::FT1ChunkSHA1Cache>();
+					cc.have_all = true;
+					// skip have vec, since all
+					//cc.have_chunk
+					cc.have_count = sha1_info.chunks.size(); // need?
+
+					//self->_info_to_content[sha1_info_hash] = ce;
+					cc.chunk_hash_to_index.clear(); // for copy paste
+					for (size_t i = 0; i < sha1_info.chunks.size(); i++) {
+						//self->_chunks[sha1_info.chunks[i]] = ce;
+						cc.chunk_hash_to_index[sha1_info.chunks[i]].push_back(i);
+					}
+				}
+
+				{ // file info
+					// TODO: not overwrite fi? since same?
+					auto& file_info = o.emplace_or_replace<Message::Components::Transfer::FileInfo>();
+					file_info.file_list.emplace_back() = {std::string{file_name_}, file_impl->_file_size};
+					file_info.total_size = file_impl->_file_size;
+
+					o.emplace_or_replace<Message::Components::Transfer::FileInfoLocal>(std::vector{std::string{file_path_}});
+				}
+
+				// hmmm
+				o.remove<Message::Components::Transfer::TagPaused>();
+
+				// we don't want the info anymore
+				o.remove<Components::ReRequestInfoTimer>();
+			} else {
+				o = newObject(ByteSpan{sha1_info_hash});
+
+				o.emplace<Components::FT1InfoSHA1>(sha1_info);
+				o.emplace<Components::FT1InfoSHA1Data>(sha1_info_data); // keep around? or file?
+				o.emplace<Components::FT1InfoSHA1Hash>(sha1_info_hash);
+				{ // lookup tables and have
+					auto& cc = o.emplace<Components::FT1ChunkSHA1Cache>();
+					cc.have_all = true;
+					// skip have vec, since all
+					cc.have_count = sha1_info.chunks.size(); // need?
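+
+					// chunk_hash_to_index maps a chunk hash to every index it occurs at,
+					// so duplicate chunks (e.g. repeated zero blocks) share a single entry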
+
+					cc.chunk_hash_to_index.clear(); // for copy paste
+					for (size_t i = 0; i < sha1_info.chunks.size(); i++) {
+						cc.chunk_hash_to_index[sha1_info.chunks[i]].push_back(i);
+					}
+				}
+
+				{ // file info
+					auto& file_info = o.emplace<Message::Components::Transfer::FileInfo>();
+					file_info.file_list.emplace_back() = {std::string{file_name_}, file_impl->_file_size};
+					file_info.total_size = file_impl->_file_size;
+
+					o.emplace<Message::Components::Transfer::FileInfoLocal>(std::vector{std::string{file_path_}});
+				}
+			}
+
+			o.emplace_or_replace<Message::Components::Transfer::File>(std::move(file_impl));
+
+			// TODO: replace with transfers stats
+			if (!o.all_of<Message::Components::Transfer::BytesSent>()) {
+				o.emplace<Message::Components::Transfer::BytesSent>(0u);
+			}
+
+			cb(o);
+
+			// TODO: earlier?
+			_os.throwEventUpdate(o);
+		}));
+		ibs->info_builder_dirty = true; // still in scope, set before mutex unlock
+	})).detach();
+}
+
+std::unique_ptr<File2I> SHA1MappedFilesystem::file2(Object ov, FILE2_FLAGS flags) {
+	ObjectHandle o{_os.registry(), ov};
+
+	if (!static_cast<bool>(o)) {
+		return nullptr;
+	}
+
+	if (!o.all_of<Message::Components::Transfer::FileInfoLocal>()) {
+		return nullptr;
+	}
+
+	const auto& file_list = o.get<Message::Components::Transfer::FileInfoLocal>().file_list;
+	if (file_list.empty()) {
+		return nullptr;
+	}
+
+	auto res = construct_file2_rw_mapped(file_list.front(), -1);
+	if (!res || !res->isGood()) {
+		return nullptr;
+	}
+
+	return res;
+}
+
+} // Backends
+
diff --git a/solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.hpp b/solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.hpp
new file mode 100644
index 0000000..dc3fd56
--- /dev/null
+++ b/solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <solanaceae/object_store/object_store2.hpp>
+
+#include <string_view>
+#include <functional>
+#include <memory>
+
+namespace Backends {
+
+// fwd to hide the threading headers
+struct SHA1MappedFilesystem_InfoBuilderState;
+
+struct SHA1MappedFilesystem : public StorageBackendI {
+	std::unique_ptr<SHA1MappedFilesystem_InfoBuilderState> _ibs;
+
+	SHA1MappedFilesystem(
+		ObjectStore2& os
+	);
+	~SHA1MappedFilesystem(void);
+
+	// pull from info builder queue
+	// call from main thread (os thread?) often
+	void tick(void);
+
+	ObjectHandle newObject(ByteSpan id) override;
+
+	// performs async file hashing
+	// create message in cb
+	void newFromFile(std::string_view file_name, std::string_view file_path, std::function<void(ObjectHandle o)>&& cb/*, bool merge_preexisting = false*/);
+
+	// might return a pre-existing object?
+	ObjectHandle newFromInfoHash(ByteSpan info_hash);
+
+	std::unique_ptr<File2I> file2(Object o, FILE2_FLAGS flags); // default does nothing
+};
+
+} // Backends
+
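
Usage sketch (not part of the patch): the header suggests the intended call pattern is to construct the backend against an ObjectStore2, start hashing with newFromFile(), and drain the info-builder queue by calling tick() from the main/os thread, where the callback then runs. The sketch below is an assumption-laden illustration, not confirmed by this diff: the include paths, default-constructing ObjectStore2 without further wiring, and the example file path are all hypothetical.

    // minimal sketch, assuming the includes and ObjectStore2 construction shown here
    #include <solanaceae/object_store/object_store2.hpp>
    #include <solanaceae/ngc_ft1_sha1/backends/sha1_mapped_filesystem.hpp>

    #include <chrono>
    #include <iostream>
    #include <thread>

    int main(void) {
        ObjectStore2 os; // assumption: no extra dependencies needed for this sketch
        Backends::SHA1MappedFilesystem sha1_mf{os};

        // hashing runs on a background thread; the callback is queued and only
        // executes once tick() drains the queue on this thread
        bool done = false;
        sha1_mf.newFromFile("example.bin", "/tmp/example.bin", [&done](ObjectHandle o) {
            std::cout << "new object, valid: " << static_cast<bool>(o) << "\n";
            done = true;
        });

        while (!done) {
            sha1_mf.tick(); // runs queued work (including the error path) on the calling thread
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }

        return 0;
    }

Because the queue plus dirty flag hand the finished work back to whichever thread calls tick(), the callback never races the registry: all object/component mutation happens on the ticking thread, and the background thread only hashes.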