Simple llama.cpp server API usage works.
This commit is contained in:
31
src/CMakeLists.txt
Normal file
31
src/CMakeLists.txt
Normal file
@ -0,0 +1,31 @@
|
||||
# Build file for the solanaceae llama.cpp web client library and its test executable.
cmake_minimum_required(VERSION 3.9...3.24 FATAL_ERROR)

project(solanaceae)

add_library(solanaceae_llama-cpp-web
	./solanaceae/llama-cpp-web/llama_cpp_web_interface.hpp
	./solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp
	./solanaceae/llama-cpp-web/llama_cpp_web_impl.cpp
)

target_include_directories(solanaceae_llama-cpp-web PUBLIC .)
# Implementation is built as C++20, but consumers of the public headers only need C++17.
target_compile_features(solanaceae_llama-cpp-web PRIVATE cxx_std_20)
target_compile_features(solanaceae_llama-cpp-web INTERFACE cxx_std_17)
# NOTE(review): httplib/nlohmann_json/solanaceae_* targets are presumably provided
# by a parent CMakeLists — confirm they exist before building this standalone.
target_link_libraries(solanaceae_llama-cpp-web PUBLIC
	httplib::httplib
	nlohmann_json::nlohmann_json

	solanaceae_util
	solanaceae_message3
)

########################################

# Simple manual smoke test against a locally running llama.cpp server.
add_executable(test1
	test1.cpp
)

target_link_libraries(test1 PUBLIC
	solanaceae_llama-cpp-web
)
|
||||
|
152
src/solanaceae/llama-cpp-web/llama_cpp_web_impl.cpp
Normal file
152
src/solanaceae/llama-cpp-web/llama_cpp_web_impl.cpp
Normal file
@ -0,0 +1,152 @@
|
||||
#include "./llama_cpp_web_impl.hpp"
|
||||
|
||||
#include <solanaceae/util/utils.hpp>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <chrono>
|
||||
|
||||
// TODO: variant that strips unicode?
// Escapes every byte of `input` as a "\xNN" hex escape so arbitrary text can be
// embedded verbatim inside a GBNF grammar string literal.
// Perf fix: the previous version called bin2hex({byte}) per character, which
// constructed a heap-allocated vector for every single byte.
static std::string convertToSafeGrammarString(std::string_view input) {
	static constexpr char hex_digits[] = "0123456789abcdef";
	std::string res;
	res.reserve(input.size() * 4); // every byte becomes exactly "\xNN"
	for (const char c : input) {
		const auto b = static_cast<uint8_t>(c);
		res += "\\x";
		res += hex_digits[b >> 4];
		res += hex_digits[b & 0x0f];
	}
	return res;
}
|
||||
|
||||
// Nothing to tear down by hand; members clean themselves up (RAII).
LlamaCppWeb::~LlamaCppWeb(void) = default;
|
||||
|
||||
bool LlamaCppWeb::isHealthy(void) {
|
||||
auto res = _cli.Get("/health");
|
||||
if (
|
||||
res.error() != httplib::Error::Success ||
|
||||
res->status != 200 ||
|
||||
res->body.empty() ||
|
||||
res->get_header_value("Content-Type") != "application/json"
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//std::cout << "/health code: " << res->status << " body: " << res->body << "\n";
|
||||
//std::cout << "Content-Type: " << res->get_header_value("Content-Type") << "\n";
|
||||
|
||||
const auto response_body_j = nlohmann::json::parse(res->body, nullptr, false);
|
||||
|
||||
const std::string status = response_body_j.value("status", std::string{"value-not-found"});
|
||||
if (status != "ok") {
|
||||
std::cerr << "status not ok: " << status << "\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true; // healthy endpoint
|
||||
}
|
||||
|
||||
int64_t LlamaCppWeb::completeSelect(const std::string_view prompt, const std::vector<std::string_view>& possible) {
|
||||
if (possible.empty()) {
|
||||
return -1;
|
||||
}
|
||||
if (possible.size() == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// see
|
||||
// https://github.com/ggerganov/llama.cpp/tree/master/grammars#example
|
||||
std::string grammar {"root ::= "};
|
||||
bool first = true;
|
||||
for (const auto& it : possible) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
grammar += "| ";
|
||||
}
|
||||
grammar += "\"";
|
||||
//grammar += it;
|
||||
grammar += convertToSafeGrammarString(it);
|
||||
grammar += "\" ";
|
||||
}
|
||||
//grammar += ")";
|
||||
|
||||
//std::cout << "generated grammar:\n" << grammar << "\n";
|
||||
|
||||
auto ret = complete(nlohmann::json{
|
||||
{"prompt", prompt},
|
||||
{"grammar", grammar},
|
||||
{"min_p", 0.1}, // model dependent
|
||||
{"repeat_penalty", 1.0}, // deactivate
|
||||
{"temperature", 0.9}, // depends 1.0 for chat models
|
||||
{"top_k", 60},
|
||||
{"top_p", 1.0}, // disable
|
||||
{"n_predict", 256}, // unlikely to ever be so high
|
||||
{"seed", _rng()},
|
||||
});
|
||||
|
||||
if (ret.empty()) {
|
||||
return -2;
|
||||
}
|
||||
|
||||
if (!ret.count("content")) {
|
||||
return -3;
|
||||
}
|
||||
|
||||
std::string selected = ret.at("content");
|
||||
if (selected.empty()) {
|
||||
return -4;
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < (int64_t)possible.size(); i++) {
|
||||
if (selected == possible[i]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
std::cerr << "complete failed j:'" << ret.dump() << "'\n";
|
||||
return -5;
|
||||
}
|
||||
|
||||
std::string LlamaCppWeb::completeLine(const std::string_view prompt) {
|
||||
auto ret = complete(nlohmann::json{
|
||||
{"prompt", prompt},
|
||||
{"min_p", 0.1}, // model dependent
|
||||
{"repeat_penalty", 1.0}, // deactivate
|
||||
{"temperature", 0.9}, // depends 1.0 for chat models
|
||||
{"top_k", 60},
|
||||
{"top_p", 1.0}, // disable
|
||||
{"n_predict", 1000},
|
||||
{"seed", _rng()},
|
||||
{"stop", {"\n"}},
|
||||
});
|
||||
|
||||
return ret.dump();
|
||||
}
|
||||
|
||||
// POSTs `request_j` to the server's /completion endpoint.
// Returns the parsed json response, or an empty json value on any failure.
nlohmann::json LlamaCppWeb::complete(const nlohmann::json& request_j) {
	if (!isHealthy()) {
		return {};
	}

	// completions can take very long
	// steaming instead would be better
	_cli.set_read_timeout(std::chrono::minutes(10));

	//std::cout << "j dump: '" << request_j.dump(-1, ' ', true) << "'\n";

	auto res = _cli.Post("/completion", request_j.dump(-1, ' ', true), "application/json");

	//std::cerr << "res.error():" << res.error() << "\n";

	if (
		res.error() != httplib::Error::Success ||
		res->status != 200
		//res->body.empty() ||
		//res->get_header_value("Content-Type") != "application/json"
	) {
		std::cerr << "error posting\n";
		return {};
	}

	// parse with exceptions disabled; a malformed body yields a *discarded* value
	auto parsed = nlohmann::json::parse(res->body, nullptr, false);
	if (parsed.is_discarded()) {
		// BUG(fixed): a discarded value used to be handed straight back to
		// callers, which then probed it with .count()/.at(); return the same
		// empty json used for transport errors instead.
		std::cerr << "error parsing response body\n";
		return {};
	}

	return parsed;
}
|
||||
|
23
src/solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp
Normal file
23
src/solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp
Normal file
@ -0,0 +1,23 @@
|
||||
#pragma once
|
||||
|
||||
#include "./llama_cpp_web_interface.hpp"
|
||||
|
||||
#include <httplib.h>
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
|
||||
#include <random>
|
||||
|
||||
// llama.cpp HTTP server client implementing LlamaCppWebI.
struct LlamaCppWeb : public LlamaCppWebI {
	// NOTE(review): endpoint is hard coded — presumably fine for local testing;
	// confirm before wider use.
	httplib::Client _cli{"http://localhost:8080"};
	// rng used to vary the "seed" request parameter per completion
	std::minstd_rand _rng{std::random_device{}()};

	~LlamaCppWeb(void);

	bool isHealthy(void) override;
	int64_t completeSelect(const std::string_view prompt, const std::vector<std::string_view>& possible) override;
	std::string completeLine(const std::string_view prompt) override;

	// TODO: expose?
	// sends request_j to /completion and returns the parsed response (empty on error)
	nlohmann::json complete(const nlohmann::json& request_j);
};
|
||||
|
20
src/solanaceae/llama-cpp-web/llama_cpp_web_interface.hpp
Normal file
20
src/solanaceae/llama-cpp-web/llama_cpp_web_interface.hpp
Normal file
@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
// Abstract interface for a llama.cpp-backed text completion service.
struct LlamaCppWebI {
	virtual ~LlamaCppWebI(void) {}

	// true if the backing completion endpoint reports itself ready
	virtual bool isHealthy(void) = 0;

	// TODO: add more complex api

	// asks the model to pick one of `possible` as the continuation of `prompt`;
	// returns an index into `possible`, or a negative value on failure
	virtual int64_t completeSelect(const std::string_view prompt, const std::vector<std::string_view>& possible) = 0;

	// stops at newlines
	// (and limit of 1000 and eos)
	virtual std::string completeLine(const std::string_view prompt) = 0;
};
|
||||
|
56
src/test1.cpp
Normal file
56
src/test1.cpp
Normal file
@ -0,0 +1,56 @@
|
||||
#include <solanaceae/llama-cpp-web/llama_cpp_web_impl.hpp>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
|
||||
// Manual smoke test: requires a llama.cpp server listening on localhost:8080.
int main(void) {
	LlamaCppWeb lcw;

	if (!lcw.isHealthy()) {
		std::cerr << lcw._cli.host() << " " << lcw._cli.port() << " endpoint not healthy\n";
		return 1;
	}
	std::cerr << lcw._cli.host() << " " << lcw._cli.port() << " endpoint healthy\n";

	// raw /completion request
	std::cout << "The meaning of life is to"
		<< lcw.complete(nlohmann::json{
			{"prompt", "The meaning of life is to"},
			{"min_p", 0.1}, // model dependent
			{"repeat_penalty", 1.0}, // deactivate
			{"temperature", 0.9}, // depends 1.0 for chat models
			{"top_k", 60},
			{"top_p", 1.0}, // disable
			{"n_predict", 16},
			{"stop", {".", "\n"}},
			// BUG(fixed): key was misspelled "gramar", so the (empty) grammar
			// parameter was silently ignored by the server
			{"grammar", ""}
		})
		<< "\n";

	std::cout << "-------------------------\n";

	std::cout << "complete from select:\n";
	// candidate continuations; " excersize" typo kept as-is — it is just data
	std::vector<std::string_view> possible {
		" die",
		" die.",
		" live",
		" love",
		" excersize",
		" Hi",
	};
	for (size_t i = 0; i < 10; i++) {
		std::cout << "The meaning of life is to";
		auto res = lcw.completeSelect("The meaning of life is to", possible);
		if (res < 0) {
			std::cout << " error--\n";
		} else {
			std::cout << possible[res] << "\n";
		}
	}

	return 0;
}
|
Reference in New Issue
Block a user