simple llama.cpp server api usage works

2024-01-22 21:14:33 +01:00
commit c497b19b20
11 changed files with 537 additions and 0 deletions

31
src/CMakeLists.txt Normal file

@@ -0,0 +1,31 @@
cmake_minimum_required(VERSION 3.9...3.24 FATAL_ERROR)
project(solanaceae)
add_library(solanaceae_llama-cpp-web
./solanaceae/llama-cpp-web/llama_cpp_web_interface.hpp
./solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp
./solanaceae/llama-cpp-web/llama_cpp_web_impl.cpp
)
target_include_directories(solanaceae_llama-cpp-web PUBLIC .)
target_compile_features(solanaceae_llama-cpp-web PRIVATE cxx_std_20)
target_compile_features(solanaceae_llama-cpp-web INTERFACE cxx_std_17)
target_link_libraries(solanaceae_llama-cpp-web PUBLIC
httplib::httplib
nlohmann_json::nlohmann_json
solanaceae_util
solanaceae_message3
)
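# note: httplib::httplib, nlohmann_json::nlohmann_json and the solanaceae_*
# targets are assumed to be provided by the parent project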
########################################
add_executable(test1
test1.cpp
)
target_link_libraries(test1 PUBLIC
solanaceae_llama-cpp-web
)

152
src/solanaceae/llama-cpp-web/llama_cpp_web_impl.cpp Normal file

@@ -0,0 +1,152 @@
#include "./llama_cpp_web_impl.hpp"
#include <solanaceae/util/utils.hpp>
#include <nlohmann/json.hpp>
#include <chrono>
// TODO: variant that strips unicode?
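// escapes every byte as a \xNN hex literal, so arbitrary strings can be
// embedded in a GBNF quoted terminal, e.g. " die" -> "\x20\x64\x69\x65"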
static std::string convertToSafeGrammarString(std::string_view input) {
std::string res;
for (const char c : input) {
res += "\\x";
res += bin2hex({static_cast<uint8_t>(c)});
}
return res;
}
LlamaCppWeb::~LlamaCppWeb(void) {
}
bool LlamaCppWeb::isHealthy(void) {
auto res = _cli.Get("/health");
if (
res.error() != httplib::Error::Success ||
res->status != 200 ||
res->body.empty() ||
res->get_header_value("Content-Type") != "application/json"
) {
return false;
}
//std::cout << "/health code: " << res->status << " body: " << res->body << "\n";
//std::cout << "Content-Type: " << res->get_header_value("Content-Type") << "\n";
const auto response_body_j = nlohmann::json::parse(res->body, nullptr, false);
if (response_body_j.is_discarded()) {
return false; // body was not valid json
}
const std::string status = response_body_j.value("status", std::string{"value-not-found"});
if (status != "ok") {
std::cerr << "status not ok: " << status << "\n";
return false;
}
return true; // healthy endpoint
}
int64_t LlamaCppWeb::completeSelect(const std::string_view prompt, const std::vector<std::string_view>& possible) {
if (possible.empty()) {
return -1;
}
if (possible.size() == 1) {
return 0;
}
// see
// https://github.com/ggerganov/llama.cpp/tree/master/grammars#example
std::string grammar {"root ::= "};
bool first = true;
for (const auto& it : possible) {
if (first) {
first = false;
} else {
grammar += "| ";
}
grammar += "\"";
//grammar += it;
grammar += convertToSafeGrammarString(it);
grammar += "\" ";
}
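// e.g. for {" die", " live"} the resulting grammar is:
// root ::= "\x20\x64\x69\x65" | "\x20\x6c\x69\x76\x65"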
//grammar += ")";
//std::cout << "generated grammar:\n" << grammar << "\n";
auto ret = complete(nlohmann::json{
{"prompt", prompt},
{"grammar", grammar},
{"min_p", 0.1}, // model dependent
{"repeat_penalty", 1.0}, // deactivate
{"temperature", 0.9}, // depends 1.0 for chat models
{"top_k", 60},
{"top_p", 1.0}, // disable
{"n_predict", 256}, // unlikely to ever be so high
{"seed", _rng()},
});
if (ret.empty()) {
return -2;
}
if (!ret.count("content")) {
return -3;
}
std::string selected = ret.at("content");
if (selected.empty()) {
return -4;
}
for (int64_t i = 0; i < (int64_t)possible.size(); i++) {
if (selected == possible[i]) {
return i;
}
}
std::cerr << "complete failed j:'" << ret.dump() << "'\n";
return -5;
}
std::string LlamaCppWeb::completeLine(const std::string_view prompt) {
auto ret = complete(nlohmann::json{
{"prompt", prompt},
{"min_p", 0.1}, // model dependent
{"repeat_penalty", 1.0}, // deactivate
{"temperature", 0.9}, // depends 1.0 for chat models
{"top_k", 60},
{"top_p", 1.0}, // disable
{"n_predict", 1000},
{"seed", _rng()},
{"stop", {"\n"}},
});
// return only the generated text, not the whole response json
if (ret.contains("content")) {
return ret.at("content");
}
return {};
}
nlohmann::json LlamaCppWeb::complete(const nlohmann::json& request_j) {
if (!isHealthy()) {
return {};
}
// completions can take very long
// streaming the response instead would be better
_cli.set_read_timeout(std::chrono::minutes(10));
//std::cout << "j dump: '" << request_j.dump(-1, ' ', true) << "'\n";
auto res = _cli.Post("/completion", request_j.dump(-1, ' ', true), "application/json");
//std::cerr << "res.error():" << res.error() << "\n";
if (
res.error() != httplib::Error::Success ||
res->status != 200
//res->body.empty() ||
//res->get_header_value("Content-Type") != "application/json"
) {
std::cerr << "error posting\n";
return {};
}
auto response_j = nlohmann::json::parse(res->body, nullptr, false);
if (response_j.is_discarded()) {
std::cerr << "error parsing response\n";
return {};
}
return response_j;
}

23
src/solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp Normal file

@@ -0,0 +1,23 @@
#pragma once
#include "./llama_cpp_web_interface.hpp"
#include <httplib.h>
#include <nlohmann/json_fwd.hpp>
#include <random>
struct LlamaCppWeb : public LlamaCppWebI {
httplib::Client _cli{"http://localhost:8080"};
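// rng used to generate a fresh sampling seed per request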
std::minstd_rand _rng{std::random_device{}()};
~LlamaCppWeb(void);
bool isHealthy(void) override;
int64_t completeSelect(const std::string_view prompt, const std::vector<std::string_view>& possible) override;
std::string completeLine(const std::string_view prompt) override;
// TODO: expose?
nlohmann::json complete(const nlohmann::json& request_j);
};

20
src/solanaceae/llama-cpp-web/llama_cpp_web_interface.hpp Normal file

@@ -0,0 +1,20 @@
#pragma once
#include <string>
#include <string_view>
#include <vector>
struct LlamaCppWebI {
virtual ~LlamaCppWebI(void) {}
virtual bool isHealthy(void) = 0;
// TODO: add more complex api
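// returns the index of the chosen entry in possible, or a negative value on error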
virtual int64_t completeSelect(const std::string_view prompt, const std::vector<std::string_view>& possible) = 0;
// stops at newlines
// (and limit of 1000 and eos)
virtual std::string completeLine(const std::string_view prompt) = 0;
};

56
src/test1.cpp Normal file

@@ -0,0 +1,56 @@
#include <solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp>
#include <nlohmann/json.hpp>
#include <iostream>
#include <random>
#include <vector>
#include <chrono>
#include <cstdint>
int main(void) {
LlamaCppWeb lcw;
if (!lcw.isHealthy()) {
std::cerr << lcw._cli.host() << " " << lcw._cli.port() << " endpoint not healthy\n";
return 1;
}
std::cerr << lcw._cli.host() << " " << lcw._cli.port() << " endpoint healthy\n";
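// ask the server for a raw completion; complete() returns the full response
// json, whose "content" field holds the generated text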
std::cout << "The meaning of life is to"
<< lcw.complete(nlohmann::json{
{"prompt", "The meaning of life is to"},
{"min_p", 0.1}, // model dependent
{"repeat_penalty", 1.0}, // deactivate
{"temperature", 0.9}, // depends 1.0 for chat models
{"top_k", 60},
{"top_p", 1.0}, // disable
{"n_predict", 16},
{"stop", {".", "\n"}},
{"gramar", ""}
})
<< "\n";
std::cout << "-------------------------\n";
std::cout << "complete from select:\n";
std::vector<std::string_view> possible {
" die",
" die.",
" live",
" love",
" excersize",
" Hi",
};
for (size_t i = 0; i < 10; i++) {
std::cout << "The meaning of life is to";
auto res = lcw.completeSelect("The meaning of life is to", possible);
if (res < 0) {
std::cout << " error--\n";
} else {
std::cout << possible[res] << "\n";
}
}
return 0;
}