diff --git a/CMakeLists.txt b/CMakeLists.txt
index e70f9d3..6b34bbb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,5 +39,7 @@
 add_subdirectory(./prototyping EXCLUDE_FROM_ALL)
 
 add_subdirectory(./version0)
 
+add_subdirectory(./bench)
+
 add_subdirectory(./vim_research)
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
new file mode 100644
index 0000000..8062fa8
--- /dev/null
+++ b/bench/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
+
+project(crdt_bench CXX C)
+
+########################################
+
+add_executable(crdt_bench_jpaper_v0
+    ./v0_jpaper.cpp
+)
+
+target_link_libraries(crdt_bench_jpaper_v0 PUBLIC
+    crdt_version0
+    nlohmann_json::nlohmann_json
+)
+
+########################################
+
+if (FALSE)
+add_executable(crdt_bench_jpaper_v1
+    ./test1.cpp
+)
+
+target_link_libraries(crdt_bench_jpaper_v1 PUBLIC
+    crdt_version1
+    nlohmann_json::nlohmann_json
+)
+endif()
+
diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 0000000..cfe6d3e
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,38 @@
+# Timings
+
+All benches read the uncompressed .json from disk.
+
+All runs were done with the "cool and breezy copy" preset.
+
+Note: the .json is not a perfect fit, because it was not designed for the Yjs-style algorithm (parent_right is missing and the IDs are not a perfect match).
+
+The json contains:
+  doc size (with tombstones): 182315
+  doc size: 104852
+  total inserts: 182315
+  total deletes: 77463
+  total ops: 259778
+
+## baseline (just walking through the json, no insertions)
+
+- g++9 -g :
+  - 23.0s
+  - 22.6s
+  - 23.0s
+
+- g++9 -O3 -DNDEBUG :
+  - 9.6s
+  - 9.7s
+  - 9.7s
+
+
+## version0
+
+- g++9 -g -O2 :
+  - 10m35s
+
+- g++9 -O3 -DNDEBUG :
+  - 8m7s
+
+## version1
+
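The baseline rows in bench/README.md above come from a binary that only walks the trace and applies nothing to the CRDT; that binary is not part of this patch. A minimal sketch of such a walker, assuming the same nlohmann::json dependency and the same `../res/paper.json` location used by bench/v0_jpaper.cpp below:

```cpp
// Sketch of a baseline walker (not in this patch): parse every line of the trace,
// count the ops, apply nothing. Path and dependency assumed to match v0_jpaper.cpp.
#include <nlohmann/json.hpp>

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

int main(void) {
    std::ifstream file {"../res/paper.json"};

    uint64_t total_ops {0};
    for (std::string line; std::getline(file, line); ) {
        nlohmann::json j_entry = nlohmann::json::parse(line);
        total_ops += j_entry["ops"].size();
    }

    std::cout << "total ops: " << total_ops << "\n";
    return 0;
}
```

Comparing this against the version0 numbers separates the JSON parsing cost from the cost of the list insertion algorithm itself.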
diff --git a/bench/v0_jpaper.cpp b/bench/v0_jpaper.cpp
new file mode 100644
index 0000000..c7f7f60
--- /dev/null
+++ b/bench/v0_jpaper.cpp
@@ -0,0 +1,198 @@
+#define EXTRA_ASSERTS 0
+
+//#include <green_crdt/v0/text_document.hpp>
+#include <green_crdt/v0/list.hpp>
+#include <nlohmann/json.hpp>
+
+#include <array>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string_view>
+#include <unordered_map>
+
+using ActorID = std::array<uint8_t, 32>;
+//using Doc = GreenCRDT::V0::TextDocument;
+using List = GreenCRDT::V0::List<char, ActorID>;
+
+template<>
+struct std::hash<ActorID> {
+    std::size_t operator()(ActorID const& s) const noexcept {
+        static_assert(sizeof(size_t) == 8);
+        // only mixes the first 8 bytes of the actor id
+        // TODO: maybe shuffle the indices a bit
+        return
+            (static_cast<size_t>(s[0]) << 8*0) |
+            (static_cast<size_t>(s[1]) << 8*1) |
+            (static_cast<size_t>(s[2]) << 8*2) |
+            (static_cast<size_t>(s[3]) << 8*3) |
+            (static_cast<size_t>(s[4]) << 8*4) |
+            (static_cast<size_t>(s[5]) << 8*5) |
+            (static_cast<size_t>(s[6]) << 8*6) |
+            (static_cast<size_t>(s[7]) << 8*7)
+        ;
+    }
+};
+
+// for dev, benching in debug is useful, but only if the amount of asserts is reasonable
+#if !defined(extra_assert)
+    #if defined(EXTRA_ASSERTS) && EXTRA_ASSERTS == 1
+        #define extra_assert(...) assert(__VA_ARGS__)
+    #else
+        #define extra_assert(...) void(0)
+    #endif
+#endif
+
+namespace detail {
+    uint8_t nib_from_hex(char c) {
+        extra_assert((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'));
+
+        if (c >= '0' && c <= '9') {
+            return static_cast<uint8_t>(c) - '0';
+        } else if (c >= 'a' && c <= 'f') {
+            return (static_cast<uint8_t>(c) - 'a') + 10u;
+        } else {
+            return 0u;
+        }
+    }
+} // detail
+
+static ActorID ActorIDFromStr(std::string_view str) {
+    extra_assert(str.size() == 32*2);
+    ActorID tmp;
+
+    for (size_t i = 0; i < tmp.size(); i++) {
+        tmp[i] = detail::nib_from_hex(str[i*2]) << 4 | detail::nib_from_hex(str[i*2+1]);
+    }
+
+    return tmp;
+}
+
+// seq@ID type format used in the json
+struct JObj {
+    ActorID id;
+    uint64_t seq {0};
+};
+
+static JObj JObjFromStr(std::string_view str) {
+    extra_assert(str.size() > 32*2 + 1);
+
+    size_t at_pos = str.find_first_of('@');
+    auto seq_sv = str.substr(0, at_pos);
+    auto id_sv = str.substr(at_pos+1);
+
+    assert(seq_sv.size() != 0);
+    assert(id_sv.size() == 32*2);
+
+    uint64_t tmp_seq {0};
+    for (size_t i = 0; i < seq_sv.size(); i++) {
+        assert(seq_sv[i] >= '0' && seq_sv[i] <= '9');
+        tmp_seq *= 10;
+        tmp_seq += seq_sv[i] - '0';
+    }
+
+    return {ActorIDFromStr(id_sv), tmp_seq};
+}
+
+int main(void) {
+    List list;
+
+    std::ifstream file {"../res/paper.json"};
+    std::cout << "start reading...\n";
+
+    uint64_t g_total_inserts {0};
+    uint64_t g_total_deletes {0};
+    //uint64_t g_seq_inserts {0};
+    std::unordered_map<ActorID, uint64_t> g_seq_inserts; // the op seqs are not sequentially growing for inserts, so we sidestep with per-actor counters
+    std::unordered_map<ActorID, std::unordered_map<uint64_t, uint64_t>> map_seq; // maps json op_seq -> list id seq
+
+    for (std::string line; std::getline(file, line); ) {
+        nlohmann::json j_entry = nlohmann::json::parse(line);
+        const ActorID actor = ActorIDFromStr(static_cast<std::string>(j_entry["actor"]));
+        uint64_t op_seq = j_entry["startOp"];
+
+        for (const auto& j_op : j_entry["ops"]) {
+            if (j_op["action"] == "set") {
+                const auto obj = JObjFromStr(static_cast<std::string>(j_op["obj"]));
+                if (obj.seq != 1) {
+                    // skip all non text edits (create text doc, cursor etc.)
+                    continue;
+                }
+
+                if (j_op["insert"]) {
+                    const auto& j_parent = j_op["key"];
+                    extra_assert(!j_parent.is_null());
+
+                    if (j_parent == "_head") {
+                        uint64_t tmp_seq {g_seq_inserts[actor]++};
+                        bool r = list.add(
+                            {actor, tmp_seq},
+                            static_cast<std::string>(j_op["value"]).front(),
+                            std::nullopt,
+                            std::nullopt
+                        );
+                        assert(r);
+                        map_seq[actor][op_seq] = tmp_seq;
+                        g_total_inserts++;
+                    } else { // we have a parent
+                        extra_assert(static_cast<std::string>(j_op["value"]).size() == 1);
+
+                        // split parent into seq and actor
+                        const auto parent_left = JObjFromStr(static_cast<std::string>(j_parent));
+                        auto idx_opt = list.findIdx({parent_left.id, map_seq[parent_left.id][parent_left.seq]});
+                        assert(idx_opt.has_value());
+
+                        const auto parent_left_id = list.list.at(idx_opt.value()).id;
+                        std::optional<List::ListID> parent_right_id;
+                        if (idx_opt.value()+1 < list.list.size()) {
+                            parent_right_id = list.list.at(idx_opt.value()+1).id;
+                        }
+
+                        uint64_t tmp_seq {g_seq_inserts[actor]++};
+                        bool r = list.add(
+                            {actor, tmp_seq},
+                            static_cast<std::string>(j_op["value"]).front(),
+                            parent_left_id,
+                            parent_right_id
+                        );
+                        assert(r);
+                        map_seq[actor][op_seq] = tmp_seq;
+                        g_total_inserts++;
+                    }
+                } else {
+                    // I think this is cursor movement
+                }
+            } else if (j_op["action"] == "del") {
+                const auto list_id = JObjFromStr(static_cast<std::string>(j_op["key"]));
+                bool r = list.del({list_id.id, map_seq[list_id.id][list_id.seq]});
+                assert(r);
+                g_total_deletes++;
+            } else if (j_op["action"] == "makeText") {
+                // doc.clear();
+            } else if (j_op["action"] == "makeMap") {
+                // no idea
+            } else {
+                std::cout << "op: " << j_op << "\n";
+            }
+
+            op_seq++;
+        }
+    }
+
+    std::cout << "\ndoc size (with tombstones): " << list.list.size() << "\n";
+    std::cout << "doc size: " << list.doc_size << "\n";
+    std::cout << "total inserts: " << g_total_inserts << "\n";
+    std::cout << "total deletes: " << g_total_deletes << "\n";
+    std::cout << "total ops: " << g_total_inserts + g_total_deletes << "\n";
+
+    // checked, looks correct
+#if 0
+    std::cout << "doc text:\n";
+    // simple print
+    for (const auto& it : list.list) {
+        if (it.value) {
+            std::cout << it.value.value();
+        }
+    }
+    std::cout << "\n";
+#endif
+
+    return 0;
+}
+
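The subtlest part of bench/v0_jpaper.cpp above is the bookkeeping around `g_seq_inserts` and `map_seq`: the op counters in the trace are not dense per actor for inserts (other op kinds, e.g. deletes, consume counters too), so the bench hands out its own dense per-actor insert sequence and records `json op_seq -> list seq` so later ops can still refer back to earlier inserts. An isolated illustration of that remapping, with made-up counter values for a single actor:

```cpp
// Isolated sketch of the op-seq remapping (hypothetical values, single actor).
#include <cassert>
#include <cstdint>
#include <unordered_map>

int main(void) {
    std::unordered_map<uint64_t, uint64_t> map_seq; // json op_seq -> dense list seq
    uint64_t g_seq_inserts {0};                     // next dense insert seq for this actor

    // insert ops arrive with gappy trace counters (other op kinds used the rest)
    for (uint64_t op_seq : {3, 7, 8, 12}) {
        map_seq[op_seq] = g_seq_inserts++;
    }

    // a later op naming "7@actor" resolves to the dense list seq 1
    assert(map_seq.at(7) == 1);
    return 0;
}
```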
diff --git a/version0/green_crdt/v0/list.hpp b/version0/green_crdt/v0/list.hpp
index e038133..0738be8 100644
--- a/version0/green_crdt/v0/list.hpp
+++ b/version0/green_crdt/v0/list.hpp
@@ -8,6 +8,14 @@
 
 #include <cassert>
 
+#if !defined(extra_assert)
+    #if defined(EXTRA_ASSERTS) && EXTRA_ASSERTS == 1
+        #define extra_assert(...) assert(__VA_ARGS__)
+    #else
+        #define extra_assert(...) void(0)
+    #endif
+#endif
+
 namespace GreenCRDT::V0 {
 
 template<typename ValueType, typename ActorType>
@@ -71,7 +79,9 @@ struct List {
     std::map<ActorType, uint64_t> last_seen_seq;
 
     std::optional<size_t> findIdx(const ListID& list_id) const {
+//#if defined(EXTRA_ASSERTS) && EXTRA_ASSERTS == 1
 //verify(); // too expensive
+//#endif
         for (size_t i = 0; i < list.size(); i++) {
             if (list[i].id == list_id) {
                 return i;
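Finally, for orientation when reading the parser in bench/v0_jpaper.cpp: it expects newline-delimited JSON, one change per line, and only touches the fields `actor`, `startOp` and `ops[]` (each op carrying `action`, `obj`, `insert`, `key`, `value`). A hypothetical line with a made-up 32-byte actor ID, shaped the way those accesses require (the real paper.json is authoritative):

```cpp
// Hypothetical trace line (made-up IDs); only demonstrates the field layout the bench reads.
#include <nlohmann/json.hpp>

#include <cassert>
#include <string>

int main(void) {
    const std::string line =
        R"({"actor":"0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",)"
        R"("startOp":2,"ops":[{"action":"set",)"
        R"("obj":"1@0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",)"
        R"("insert":true,"key":"_head","value":"a"}]})";

    nlohmann::json j_entry = nlohmann::json::parse(line);
    assert(static_cast<std::string>(j_entry["actor"]).size() == 32*2); // 32-byte actor as hex
    assert(j_entry["ops"][0]["key"] == "_head");                       // first insert goes to the head
    return 0;
}
```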