change binary meta format and add zstd to metadata

This commit is contained in:
Green Sky 2024-02-19 12:32:09 +01:00
parent 4fb2b51b7d
commit 6aac44cda9
No known key found for this signature in database
6 changed files with 160 additions and 67 deletions

View File

@ -40,25 +40,26 @@ A Metadata json object can have arbitrary keys, some are predefined:
### Split Metadata ### Split Metadata
file magic bytes `SOLMET` (6 bytes) msgpack array:
1 byte encryption type (`0x00` is none) - `[0]`: file magic string `SOLMET` (6 bytes)
- `[1]`: uint8 encryption type (`0x00` is none)
1 byte compression type (`0x00` is none) - `[2]`: uint8 compression type (`0x00` is none, `0x01` is zstd)
- `[3]`: binary metadata (optionally compressed and encrypted)
...metadata here...
note that the encryption and compression are for the metadata only. note that the encryption and compression are for the metadata only.
The metadata itself contains encryption and compression info about the data. The metadata itself contains encryption and compression info about the data.
### Split Data ### Split Data
(none) all the data is in the metadata file. All the metadata is in the metadata file. (like encryption and compression)
This is mostly to allow direct storage for files in the Fragment store without excessive duplication. This is mostly to allow direct storage for files in the Fragment store without excessive duplication.
Keep in mind to not use the actual file name as the data/meta file name. Keep in mind to not use the actual file name as the data/meta file name.
### Single fragment ### Single fragment
Note: this format is unused for now
file magic bytes `SOLFIL` (6 bytes) file magic bytes `SOLFIL` (6 bytes)
1 byte encryption type (`0x00` is none) 1 byte encryption type (`0x00` is none)

View File

@ -224,15 +224,6 @@ FragmentID FragmentStore::getFragmentCustomMatcher(
return entt::null; return entt::null;
} }
template<typename F>
static void writeBinaryMetafileHeader(F& file, const Encryption enc, const Compression comp) {
file.write("SOLMET", 6);
file.put(static_cast<std::underlying_type_t<Encryption>>(enc));
// TODO: is compressiontype encrypted?
file.put(static_cast<std::underlying_type_t<Compression>>(comp));
}
bool FragmentStore::syncToStorage(FragmentID fid, std::function<write_to_storage_fetch_data_cb>& data_cb) { bool FragmentStore::syncToStorage(FragmentID fid, std::function<write_to_storage_fetch_data_cb>& data_cb) {
if (!_reg.valid(fid)) { if (!_reg.valid(fid)) {
return false; return false;
@ -298,13 +289,8 @@ bool FragmentStore::syncToStorage(FragmentID fid, std::function<write_to_storage
return false; return false;
} }
// metadata type
if (meta_type == MetaFileType::BINARY_MSGPACK) { // binary metadata file
writeBinaryMetafileHeader(meta_file, meta_enc, meta_comp);
}
// sharing code between binary msgpack and text json for now // sharing code between binary msgpack and text json for now
nlohmann::json meta_data = nlohmann::json::object(); // metadata needs to be an object, null not allowed nlohmann::json meta_data_j = nlohmann::json::object(); // metadata needs to be an object, null not allowed
// metadata file // metadata file
for (const auto& [type_id, storage] : _reg.storage()) { for (const auto& [type_id, storage] : _reg.storage()) {
@ -325,33 +311,52 @@ bool FragmentStore::syncToStorage(FragmentID fid, std::function<write_to_storage
//if (meta_type == MetaFileType::BINARY_MSGPACK) { // msgpack uses the hash id instead //if (meta_type == MetaFileType::BINARY_MSGPACK) { // msgpack uses the hash id instead
//s_cb_it->second(storage.value(fid), meta_data[storage.type().hash()]); //s_cb_it->second(storage.value(fid), meta_data[storage.type().hash()]);
//} else if (meta_type == MetaFileType::TEXT_JSON) { //} else if (meta_type == MetaFileType::TEXT_JSON) {
s_cb_it->second({_reg, fid}, meta_data[storage.type().name()]); s_cb_it->second({_reg, fid}, meta_data_j[storage.type().name()]);
//} //}
} }
if (meta_type == MetaFileType::BINARY_MSGPACK) { // binary metadata file if (meta_type == MetaFileType::BINARY_MSGPACK) { // binary metadata file
const auto res = nlohmann::json::to_msgpack(meta_data); const std::vector<uint8_t> meta_data = nlohmann::json::to_msgpack(meta_data_j);
std::vector<uint8_t> meta_data_compressed; // empty if none
//std::vector<uint8_t> meta_data_encrypted; // empty if none
// TODO: refactor if (meta_comp == Compression::ZSTD) {
if (meta_comp == Compression::NONE) { meta_data_compressed.resize(ZSTD_compressBound(meta_data.size()));
meta_file.write(reinterpret_cast<const char*>(res.data()), res.size());
} else if (meta_comp == Compression::ZSTD) {
std::vector<uint8_t> compressed_buffer;
compressed_buffer.resize(ZSTD_compressBound(res.size()));
size_t const cSize = ZSTD_compress(compressed_buffer.data(), compressed_buffer.size(), res.data(), res.size(), 0); // 0 is default is probably 3 size_t const cSize = ZSTD_compress(meta_data_compressed.data(), meta_data_compressed.size(), meta_data.data(), meta_data.size(), 0); // 0 is default is probably 3
if (ZSTD_isError(cSize)) { if (ZSTD_isError(cSize)) {
std::cerr << "FS error: compressing meta failed\n"; std::cerr << "FS error: compressing meta failed\n";
return false; // HACK meta_data_compressed.clear();
meta_comp = Compression::NONE;
} else {
meta_data_compressed.resize(cSize);
} }
} else if (meta_comp == Compression::NONE) {
compressed_buffer.resize(cSize); // maybe skip this resize // do nothing
} else {
meta_file.write(reinterpret_cast<const char*>(compressed_buffer.data()), compressed_buffer.size()); assert(false && "implement me");
} }
// TODO: encryption
// the meta file is itself msgpack data
nlohmann::json meta_header_j = nlohmann::json::array();
meta_header_j.emplace_back() = "SOLMET";
meta_header_j.push_back(meta_enc);
meta_header_j.push_back(meta_comp);
if (false) { // TODO: encryption
} else if (!meta_data_compressed.empty()) {
meta_header_j.push_back(nlohmann::json::binary(meta_data_compressed));
} else {
meta_header_j.push_back(nlohmann::json::binary(meta_data));
}
const auto meta_header_data = nlohmann::json::to_msgpack(meta_header_j);
meta_file.write(reinterpret_cast<const char*>(meta_header_data.data()), meta_header_data.size());
} else if (meta_type == MetaFileType::TEXT_JSON) { } else if (meta_type == MetaFileType::TEXT_JSON) {
// cant be compressed or encrypted // cant be compressed or encrypted
meta_file << meta_data.dump(2, ' ', true); meta_file << meta_data_j.dump(2, ' ', true);
} }
// now data // now data
@ -409,6 +414,8 @@ bool FragmentStore::syncToStorage(FragmentID fid, std::function<write_to_storage
} }
// same as if lastChunk break; // same as if lastChunk break;
} while (buffer_actual_size == buffer.size()); } while (buffer_actual_size == buffer.size());
} else {
assert(false && "implement me");
} }
meta_file.flush(); meta_file.flush();
@ -511,6 +518,8 @@ bool FragmentStore::loadFromStorage(FragmentID fid, std::function<read_from_stor
} while (buffer_actual_size == in_buffer.size() && !data_file.eof()); } while (buffer_actual_size == in_buffer.size() && !data_file.eof());
ZSTD_freeDCtx(dctx); ZSTD_freeDCtx(dctx);
} else {
assert(false && "implement me");
} }
return true; return true;
@ -660,47 +669,127 @@ size_t FragmentStore::scanStoragePath(std::string_view path) {
for (const auto& it : file_frag_list) { for (const auto& it : file_frag_list) {
nlohmann::json j; nlohmann::json j;
if (it.meta_ext == ".meta.msgpack") { if (it.meta_ext == ".meta.msgpack") {
// uh
// read binary header
assert(false);
} else if (it.meta_ext == ".meta.json") {
std::ifstream file(it.frag_path.generic_u8string() + it.meta_ext, std::ios::in | std::ios::binary); std::ifstream file(it.frag_path.generic_u8string() + it.meta_ext, std::ios::in | std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
std::cout << "FS error: failed opening meta " << it.frag_path << "\n"; std::cout << "FS error: failed opening meta " << it.frag_path << "\n";
continue; continue;
} }
file >> j; // file is a msgpack within a msgpack
if (!j.is_object()) { std::vector<uint8_t> full_meta_data;
std::cerr << "FS error: json in meta is broken " << it.id_str << "\n"; { // read meta file
// figure out size
file.seekg(0, file.end);
uint64_t file_size = file.tellg();
file.seekg(0, file.beg);
full_meta_data.resize(file_size);
file.read(reinterpret_cast<char*>(full_meta_data.data()), full_meta_data.size());
}
const auto meta_header_j = nlohmann::json::from_msgpack(full_meta_data);
if (!meta_header_j.is_array() || meta_header_j.size() < 4) {
std::cerr << "FS error: broken binary meta " << it.frag_path << "\n";
continue; continue;
} }
// TODO: existing fragment file if (meta_header_j.at(0) != "SOLMET") {
//newFragmentFile(); std::cerr << "FS error: wrong magic '" << meta_header_j.at(0) << "' in meta " << it.frag_path << "\n";
FragmentHandle fh{_reg, _reg.create()}; continue;
fh.emplace<FragComp::ID>(hex2bin(it.id_str));
fh.emplace<FragComp::Ephemeral::FilePath>(it.frag_path.generic_u8string());
for (const auto& [k, v] : j.items()) {
// type id from string hash
const auto type_id = entt::hashed_string(k.data(), k.size());
const auto deserl_fn_it = _sc._deserl_json.find(type_id);
if (deserl_fn_it != _sc._deserl_json.cend()) {
// TODO: check return value
deserl_fn_it->second(fh, v);
} else {
std::cerr << "FS warning: missing deserializer for meta key '" << k << "'\n";
}
} }
// throw new frag event here
throwEventConstruct(fh); Encryption meta_enc = meta_header_j.at(1);
count++; if (meta_enc != Encryption::NONE) {
std::cerr << "FS error: unknown encryption " << it.frag_path << "\n";
continue;
}
Compression meta_comp = meta_header_j.at(2);
if (meta_comp != Compression::NONE && meta_comp != Compression::ZSTD) {
std::cerr << "FS error: unknown compression " << it.frag_path << "\n";
continue;
}
//const auto& meta_data_ref = meta_header_j.at(3).is_binary()?meta_header_j.at(3):meta_header_j.at(3).at("data");
if (!meta_header_j.at(3).is_binary()) {
std::cerr << "FS error: meta data not binary " << it.frag_path << "\n";
continue;
}
const nlohmann::json::binary_t& meta_data_ref = meta_header_j.at(3);
std::vector<uint8_t> meta_data_decomp;
if (meta_comp == Compression::NONE) {
// do nothing
} else if (meta_comp == Compression::ZSTD) {
meta_data_decomp.resize(ZSTD_DStreamOutSize());
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
ZSTD_inBuffer input {meta_data_ref.data(), meta_data_ref.size(), 0 };
ZSTD_outBuffer output = { meta_data_decomp.data(), meta_data_decomp.size(), 0 };
do {
size_t const ret = ZSTD_decompressStream(dctx, &output , &input);
if (ZSTD_isError(ret)) {
// error <.<
std::cerr << "FS error: decompression error\n";
meta_data_decomp.clear();
break;
}
} while (input.pos < input.size);
meta_data_decomp.resize(output.pos);
ZSTD_freeDCtx(dctx);
} else {
assert(false && "implement me");
}
// TODO: enc
if (!meta_data_decomp.empty()) {
j = nlohmann::json::from_msgpack(meta_data_decomp);
} else {
j = nlohmann::json::from_msgpack(meta_data_ref);
}
} else if (it.meta_ext == ".meta.json") {
std::ifstream file(it.frag_path.generic_u8string() + it.meta_ext, std::ios::in | std::ios::binary);
if (!file.is_open()) {
std::cerr << "FS error: failed opening meta " << it.frag_path << "\n";
continue;
}
file >> j;
} else { } else {
assert(false); assert(false);
} }
if (!j.is_object()) {
std::cerr << "FS error: json in meta is broken " << it.id_str << "\n";
continue;
}
// TODO: existing fragment file
//newFragmentFile();
FragmentHandle fh{_reg, _reg.create()};
fh.emplace<FragComp::ID>(hex2bin(it.id_str));
fh.emplace<FragComp::Ephemeral::FilePath>(it.frag_path.generic_u8string());
for (const auto& [k, v] : j.items()) {
// type id from string hash
const auto type_id = entt::hashed_string(k.data(), k.size());
const auto deserl_fn_it = _sc._deserl_json.find(type_id);
if (deserl_fn_it != _sc._deserl_json.cend()) {
// TODO: check return value
deserl_fn_it->second(fh, v);
} else {
std::cerr << "FS warning: missing deserializer for meta key '" << k << "'\n";
}
}
// throw new frag event here
throwEventConstruct(fh);
count++;
} }
return count; return count;

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

View File

@ -145,7 +145,7 @@ void MessageFragmentStore::handleMessage(const Message3Handle& m) {
// if its still not found, we need a new fragment // if its still not found, we need a new fragment
if (fragment_uid.empty()) { if (fragment_uid.empty()) {
const auto new_fid = _fs.newFragmentFile("test_message_store/", MetaFileType::TEXT_JSON); const auto new_fid = _fs.newFragmentFile("test_message_store/", MetaFileType::BINARY_MSGPACK);
auto fh = _fs.fragmentHandle(new_fid); auto fh = _fs.fragmentHandle(new_fid);
if (!static_cast<bool>(fh)) { if (!static_cast<bool>(fh)) {
std::cout << "MFS error: failed to create new fragment for message\n"; std::cout << "MFS error: failed to create new fragment for message\n";
@ -154,6 +154,7 @@ void MessageFragmentStore::handleMessage(const Message3Handle& m) {
fragment_uid = fh.get<FragComp::ID>().v; fragment_uid = fh.get<FragComp::ID>().v;
fh.emplace_or_replace<FragComp::Ephemeral::MetaCompressionType>().comp = Compression::ZSTD;
fh.emplace_or_replace<FragComp::DataCompressionType>().comp = Compression::ZSTD; fh.emplace_or_replace<FragComp::DataCompressionType>().comp = Compression::ZSTD;
auto& new_ts_range = fh.emplace<FragComp::MessagesTSRange>(); auto& new_ts_range = fh.emplace<FragComp::MessagesTSRange>();

View File

@ -8,10 +8,12 @@ enum class Encryption : uint8_t {
enum class Compression : uint8_t { enum class Compression : uint8_t {
NONE = 0x00, NONE = 0x00,
ZSTD = 0x01, ZSTD = 0x01,
// TODO: zstd without magic
// TODO: zstd meta dict
// TODO: zstd data(message) dict
}; };
enum class MetaFileType : uint8_t { enum class MetaFileType : uint8_t {
TEXT_JSON, TEXT_JSON,
//BINARY_ARB, BINARY_MSGPACK, // msgpacked msgpack
BINARY_MSGPACK,
}; };