2 changes: 1 addition & 1 deletion Makefile
@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

CPPLLAMA_VERSION?=9350a1cf21b1492c69b20175b73a419b897d6a3a
CPPLLAMA_VERSION?=88c46cbdac05cebd936511b1d3c74112e721615f

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
10 changes: 7 additions & 3 deletions backend/cpp/llama/CMakeLists.txt
@@ -2,16 +2,20 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
target_include_directories(myclip PUBLIC ../../common)
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
# END CLIP hack


set(TARGET grpc-server)
# END CLIP hack
set(CMAKE_CXX_STANDARD 17)
cmake_minimum_required(VERSION 3.15)
set(TARGET grpc-server)
3 changes: 3 additions & 0 deletions backend/cpp/llama/Makefile
@@ -45,6 +45,9 @@ llama.cpp/examples/grpc-server:
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp

rebuild:
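Note that llava.h is not copied verbatim like clip.h: the recipe above generates it by prepending an include of llama.h and then appending the upstream header, so the llava declarations see the llama types they need when compiled inside grpc-server. A rough sketch of what the generated header ends up containing (the trailing declarations are illustrative, based loosely on upstream llava.h, and not guaranteed to match it line for line):

```cpp
// llama.cpp/examples/grpc-server/llava.h, as produced by the Makefile recipe above
#include "llama.h"   // prepended so llama_* types are declared before the llava API

// ...followed by the verbatim contents of llama.cpp/examples/llava/llava.h,
// which declares helpers such as (illustrative):
struct clip_ctx;
struct llava_image_embed {
    float * embed;        // image embedding data
    int     n_image_pos;  // number of image token positions
};
```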
135 changes: 73 additions & 62 deletions backend/cpp/llama/grpc-server.cpp
@@ -11,7 +11,8 @@
#include <memory>
#include <string>
#include <getopt.h>
#include "../llava/clip.h"
#include "clip.h"
#include "llava.h"
#include "stb_image.h"
#include "common.h"
#include "json.hpp"
@@ -32,6 +33,7 @@
#include <grpcpp/grpcpp.h>
#include <grpcpp/health_check_service_interface.h>
#include <atomic>
#include <signal.h>

using grpc::Server;
using grpc::ServerBuilder;
@@ -51,10 +53,11 @@ struct server_params
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
std::string chat_template = "chatml";
std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool slots_endpoint = true;
};

bool server_verbose = false;
@@ -173,6 +176,7 @@ struct llama_client_slot
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
int32_t n_predict = -1;

int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0;
@@ -424,6 +428,7 @@ struct llama_server_context

slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;

LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);

@@ -451,10 +456,6 @@ struct llama_server_context
default_generation_settings_for_props["seed"] = -1;

batch = llama_batch_init(n_ctx, 0, params.n_parallel);

// empty system prompt
system_prompt = "";
system_tokens.clear();
}

std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -531,7 +532,7 @@ struct llama_server_context
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params;
llama_sampling_params default_sparams;

slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -555,6 +556,16 @@ struct llama_server_context
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);

if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
{"params.n_predict", slot->params.n_predict},
{"slot.n_predict", slot->n_predict},
});
slot->params.n_predict = slot->n_predict;
}

// infill
if (data.count("input_prefix") != 0)
@@ -683,6 +694,24 @@ struct llama_server_context
}
}

const auto &samplers_sequence = data.find("samplers");
if (samplers_sequence != data.end() && samplers_sequence->is_array())
{
std::vector<std::string> sampler_names;
for (const auto &sampler_name : *samplers_sequence)
{
if (sampler_name.is_string())
{
sampler_names.emplace_back(sampler_name);
}
}
slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
}
else
{
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
}
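As a point of reference, the new "samplers" field is read directly from the request JSON: non-string entries are skipped, and a missing or non-array value falls back to the default sequence. A minimal sketch of the kind of payload this block accepts (the sampler names are the usual llama.cpp ones and are shown only as an illustration):

```cpp
#include "json.hpp"
using json = nlohmann::json;

// Hypothetical request body; only "samplers" matters for the block above.
json data = {
    {"prompt",    "Hello"},
    {"n_predict", 64},
    {"samplers",  json::array({"top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"})}
};
// Every string entry is collected into sampler_names and mapped to sampler types
// via sampler_types_from_names(sampler_names, false); anything else is ignored.
```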

if (multimodal)
{
const auto &images_data = data.find("image_data");
@@ -772,27 +801,30 @@ struct llama_server_context
}

void update_system_prompt() {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
kv_cache_clear();
system_tokens.clear();

llama_batch_clear(batch);
if (!system_prompt.empty()) {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);

kv_cache_clear();
llama_batch_clear(batch);

for (int i = 0; i < (int) system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
for (int i = 0; i < (int)system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}

if (llama_decode(ctx, batch) != 0)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
if (llama_decode(ctx, batch) != 0)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}

// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i < params.n_parallel; ++i)
{
llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i < params.n_parallel; ++i)
{
llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
}
}

LOG_TEE("system prompt updated\n");
Expand All @@ -814,10 +846,8 @@ struct llama_server_context
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");

if (slots.size() > 0)
{
notify_system_prompt_changed();
}

notify_system_prompt_changed();
}

static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -975,44 +1005,12 @@ struct llama_server_context
{
continue;
}
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
{
LOG_TEE("Error processing the given image");
clip_free(clp_ctx);
clip_image_f32_batch_free(img_res_v);
return false;
}
if (img_res_v.size == 0)
{
LOG_TEE("Error processing the given image");
return false;
}

// note: assumes only one image was returned by clip_image_preprocess
clip_image_f32 * img_res = img_res_v.data;

img.image_tokens = clip_n_patches(clp_ctx);
img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
if (!img.image_embedding)
{
LOG_TEE("Unable to allocate memory for image embeddings\n");
clip_image_f32_batch_free(img_res_v);
clip_free(clp_ctx);
return false;
}
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
{
LOG_TEE("Unable to encode image\n");
clip_image_f32_batch_free(img_res_v);
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG_TEE("Error processing the given image");
return false;
}

clip_image_f32_batch_free(img_res_v);

img.request_encode_image = false;
}

@@ -1036,8 +1034,15 @@ struct llama_server_context
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
{
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
}

return json {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"temperature", slot.sparams.temp},
@@ -1065,7 +1070,9 @@ struct llama_server_context
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
};
}

@@ -1877,6 +1884,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}

std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
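These two lines add the shutdown plumbing used by the upstream server example: shutdown_handler is a std::function that the gRPC entry point can later point at its own cleanup code, while signal_handler is the plain-function trampoline that the C signal API requires. A minimal sketch of how it could be wired up (the lambda body and the choice of SIGINT are assumptions for illustration, not part of this patch):

```cpp
#include <csignal>
#include <functional>

std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }

int main() {
    // Point the trampoline at whatever teardown the server needs.
    shutdown_handler = [](int /*signal*/) {
        // e.g. stop the gRPC server and free the llama.cpp backend
    };
    std::signal(SIGINT, signal_handler);  // std::signal only takes a plain function pointer
    // ... run the server loop ...
    return 0;
}
```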

/////////////////////////////////
////////////////////////////////
//////// LOCALAI code starts below here
@@ -2147,7 +2157,8 @@ class BackendServiceImpl final : public backend::Backend::Service {
gpt_params params;
params_parse(request, params);

llama_backend_init(params.numa);
llama_backend_init();
llama_numa_init(params.numa);

// load the model
if (!llama.load_model(params))
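The final hunk tracks an upstream llama.cpp API change: llama_backend_init() no longer takes the NUMA flag, which is now passed to a separate llama_numa_init() call. A minimal sketch of the resulting initialization order (model loading and teardown shown only as comments, assuming the surrounding LoadModel flow):

```cpp
#include "common.h"  // gpt_params
#include "llama.h"

static void init_llama_backend(const gpt_params &params) {
    llama_backend_init();           // global init, no longer takes a NUMA argument
    llama_numa_init(params.numa);   // NUMA strategy is configured separately now
    // ... load the model and create the context as before ...
}
// Teardown is unchanged: call llama_backend_free() once everything else is released.
```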