
Commit 8292781

Authored by Ettore Di Giacinto
deps(llama.cpp): update, support Gemma models (#1734)
deps(llama.cpp): update

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 54ec634 commit 8292781

File tree

4 files changed (+84, -66 lines)

Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
 
-CPPLLAMA_VERSION?=9350a1cf21b1492c69b20175b73a419b897d6a3a
+CPPLLAMA_VERSION?=88c46cbdac05cebd936511b1d3c74112e721615f
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all

backend/cpp/llama/CMakeLists.txt

Lines changed: 7 additions & 3 deletions
@@ -2,16 +2,20 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../../common)
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
+# END CLIP hack
+
 
 set(TARGET grpc-server)
-# END CLIP hack
 set(CMAKE_CXX_STANDARD 17)
 cmake_minimum_required(VERSION 3.15)
 set(TARGET grpc-server)

backend/cpp/llama/Makefile

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,9 @@ llama.cpp/examples/grpc-server:
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+	cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+	echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+	cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
 	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
 
 rebuild:
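Note on the two new llava.h lines above: the rule does not copy the upstream header verbatim, it synthesizes a local one, presumably so that llama.h's declarations are already in scope when the header is used from the grpc-server sources. The generated file is therefore expected to look roughly like this (a sketch; only the prepended include is dictated by the rule, the rest is whatever the upstream header contains):

    // llama.cpp/examples/grpc-server/llava.h, as produced by the echo/cat lines above
    #include "llama.h"   // prepended by the Makefile rule

    // ... the verbatim contents of llama.cpp/examples/llava/llava.h follow here ...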

backend/cpp/llama/grpc-server.cpp

Lines changed: 73 additions & 62 deletions
@@ -11,7 +11,8 @@
 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "../llava/clip.h"
+#include "clip.h"
+#include "llava.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -32,6 +33,7 @@
 #include <grpcpp/grpcpp.h>
 #include <grpcpp/health_check_service_interface.h>
 #include <atomic>
+#include <signal.h>
 
 using grpc::Server;
 using grpc::ServerBuilder;
@@ -51,10 +53,11 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "chatml";
+    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -173,6 +176,7 @@ struct llama_client_slot
     int32_t n_decoded = 0;
     int32_t n_remaining = -1;
     int32_t i_batch = -1;
+    int32_t n_predict = -1;
 
     int32_t num_prompt_tokens = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -424,6 +428,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -451,10 +456,6 @@ struct llama_server_context
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -531,7 +532,7 @@ struct llama_server_context
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
-
+
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -555,6 +556,16 @@ struct llama_server_context
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
 
         // infill
         if (data.count("input_prefix") != 0)
@@ -683,6 +694,24 @@ struct llama_server_context
             }
         }
 
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
+            std::vector<std::string> sampler_names;
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
+                }
+            }
+            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+        }
+        else
+        {
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+        }
+
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -772,27 +801,30 @@ struct llama_server_context
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+        kv_cache_clear();
+        system_tokens.clear();
 
-        llama_batch_clear(batch);
+        if (!system_prompt.empty()) {
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-        kv_cache_clear();
+            llama_batch_clear(batch);
 
-        for (int i = 0; i < (int) system_tokens.size(); ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
+            for (int i = 0; i < (int)system_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+            }
 
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
+            if (llama_decode(ctx, batch) != 0)
+            {
+                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                return;
+            }
 
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            // assign the system KV cache to all parallel sequences
+            for (int32_t i = 1; i < params.n_parallel; ++i)
+            {
+                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            }
         }
 
         LOG_TEE("system prompt updated\n");
@@ -814,10 +846,8 @@ struct llama_server_context
         name_user = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
+
+        notify_system_prompt_changed();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -975,44 +1005,12 @@ struct llama_server_context
                 {
                     continue;
                 }
-                clip_image_f32_batch img_res_v;
-                img_res_v.size = 0;
-                img_res_v.data = nullptr;
-                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
-                {
-                    LOG_TEE("Error processing the given image");
-                    clip_free(clp_ctx);
-                    clip_image_f32_batch_free(img_res_v);
-                    return false;
-                }
-                if (img_res_v.size == 0)
-                {
-                    LOG_TEE("Error processing the given image");
-                    return false;
-                }
 
-                // note: assumes only one image was returned by clip_image_preprocess
-                clip_image_f32 * img_res = img_res_v.data;
-
-                img.image_tokens = clip_n_patches(clp_ctx);
-                img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-                if (!img.image_embedding)
-                {
-                    LOG_TEE("Unable to allocate memory for image embeddings\n");
-                    clip_image_f32_batch_free(img_res_v);
-                    clip_free(clp_ctx);
-                    return false;
-                }
-                LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-                if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
-                {
-                    LOG_TEE("Unable to encode image\n");
-                    clip_image_f32_batch_free(img_res_v);
+                if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+                    LOG_TEE("Error processing the given image");
                     return false;
                 }
 
-                clip_image_f32_batch_free(img_res_v);
-
                 img.request_encode_image = false;
             }
 
@@ -1036,8 +1034,15 @@ struct llama_server_context
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        {
+            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+        }
+
         return json {
             {"n_ctx", slot.n_ctx},
+            {"n_predict", slot.n_predict},
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
@@ -1065,7 +1070,9 @@ struct llama_server_context
             {"stream", slot.params.stream},
            {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
+            {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
+            {"samplers", samplers_sequence}
         };
     }
 
@@ -1877,6 +1884,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
 
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 /////////////////////////////////
 ////////////////////////////////
 //////// LOCALAI code starts below here
@@ -2147,7 +2157,8 @@ class BackendServiceImpl final : public backend::Backend::Service {
         gpt_params params;
         params_parse(request, params);
 
-        llama_backend_init(params.numa);
+        llama_backend_init();
+        llama_numa_init(params.numa);
 
         // load the model
         if (!llama.load_model(params))
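The largest functional change above is in the image path: the hand-rolled clip preprocessing and encoding (clip_image_preprocess, clip_n_patches, clip_image_encode and the manually malloc'd embedding buffer) is replaced by a single call into the llava helper. A minimal sketch of that call pattern follows, using the signature visible in the hunk; encode_image is a hypothetical wrapper, not a function from this file, and freeing with free() mirrors how the old code managed the embedding buffer:

    #include "clip.h"
    #include "llava.h"

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical wrapper mirroring what the updated per-image loop does.
    static bool encode_image(clip_ctx * clp_ctx, int n_threads, const clip_image_u8 * img_data) {
        float * image_embedding = nullptr; // filled in by the llava helper on success
        int     image_tokens    = 0;       // number of positions the embedding occupies

        if (!llava_image_embed_make_with_clip_img(clp_ctx, n_threads, img_data,
                                                  &image_embedding, &image_tokens)) {
            fprintf(stderr, "Error processing the given image\n");
            return false;
        }

        fprintf(stderr, "image encoded as %d embedding positions\n", image_tokens);

        // ... hand image_embedding to the llama batch, then release it when the slot is done ...
        free(image_embedding); // assumption: the helper allocates with malloc, as the old code did
        return true;
    }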

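The final hunk tracks an upstream llama.cpp API change: llama_backend_init() no longer takes the NUMA flag, and NUMA configuration moved into a separate llama_numa_init() call (at this revision gpt_params::numa is a ggml_numa_strategy rather than a bool). A minimal sketch of the new startup sequence, assuming the llama.cpp headers at the pinned revision:

    #include "common.h"   // gpt_params
    #include "llama.h"

    int main() {
        gpt_params params;

        // previously: llama_backend_init(params.numa);
        llama_backend_init();
        llama_numa_init(params.numa);

        // ... parse options, load the model, serve requests ...

        llama_backend_free();
        return 0;
    }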