 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "../llava/clip.h"
+#include "clip.h"
+#include "llava.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
 #include <grpcpp/grpcpp.h>
 #include <grpcpp/health_check_service_interface.h>
 #include <atomic>
+#include <signal.h>
 
 using grpc::Server;
 using grpc::ServerBuilder;
@@ -51,10 +53,11 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "chatml";
+    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -173,6 +176,7 @@ struct llama_client_slot
     int32_t n_decoded = 0;
     int32_t n_remaining = -1;
     int32_t i_batch = -1;
+    int32_t n_predict = -1;
 
     int32_t num_prompt_tokens = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -424,6 +428,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -451,10 +456,6 @@ struct llama_server_context
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -531,7 +532,7 @@ struct llama_server_context
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
-
+
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -555,6 +556,16 @@ struct llama_server_context
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
 
         // infill
         if (data.count("input_prefix") != 0)
@@ -683,6 +694,24 @@ struct llama_server_context
             }
         }
 
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
+            std::vector<std::string> sampler_names;
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
+                }
+            }
+            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+        }
+        else
+        {
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+        }
+
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -772,27 +801,30 @@ struct llama_server_context
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+        kv_cache_clear();
+        system_tokens.clear();
 
-        llama_batch_clear(batch);
+        if (!system_prompt.empty()) {
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-        kv_cache_clear();
+            llama_batch_clear(batch);
 
-        for (int i = 0; i < (int) system_tokens.size(); ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
+            for (int i = 0; i < (int)system_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+            }
 
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
+            if (llama_decode(ctx, batch) != 0)
+            {
+                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                return;
+            }
 
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            // assign the system KV cache to all parallel sequences
+            for (int32_t i = 1; i < params.n_parallel; ++i)
+            {
+                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            }
         }
 
         LOG_TEE("system prompt updated\n");
@@ -814,10 +846,8 @@ struct llama_server_context
         name_user = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
+
+        notify_system_prompt_changed();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -975,44 +1005,12 @@ struct llama_server_context
             {
                 continue;
             }
-            clip_image_f32_batch img_res_v;
-            img_res_v.size = 0;
-            img_res_v.data = nullptr;
-            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
-            {
-                LOG_TEE("Error processing the given image");
-                clip_free(clp_ctx);
-                clip_image_f32_batch_free(img_res_v);
-                return false;
-            }
-            if (img_res_v.size == 0)
-            {
-                LOG_TEE("Error processing the given image");
-                return false;
-            }
 
-            // note: assumes only one image was returned by clip_image_preprocess
-            clip_image_f32 * img_res = img_res_v.data;
-
-            img.image_tokens = clip_n_patches(clp_ctx);
-            img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-            if (!img.image_embedding)
-            {
-                LOG_TEE("Unable to allocate memory for image embeddings\n");
-                clip_image_f32_batch_free(img_res_v);
-                clip_free(clp_ctx);
-                return false;
-            }
-            LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-            if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
-            {
-                LOG_TEE("Unable to encode image\n");
-                clip_image_f32_batch_free(img_res_v);
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+                LOG_TEE("Error processing the given image");
                 return false;
             }
 
-            clip_image_f32_batch_free(img_res_v);
-
             img.request_encode_image = false;
         }
 
@@ -1036,8 +1034,15 @@ struct llama_server_context
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        {
+            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+        }
+
         return json {
             {"n_ctx", slot.n_ctx},
+            {"n_predict", slot.n_predict},
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
@@ -1065,7 +1070,9 @@ struct llama_server_context
10651070 {" stream" , slot.params .stream },
10661071 {" logit_bias" , slot.sparams .logit_bias },
10671072 {" n_probs" , slot.sparams .n_probs },
1073+ {" min_keep" , slot.sparams .min_keep },
10681074 {" grammar" , slot.sparams .grammar },
1075+ {" samplers" , samplers_sequence}
10691076 };
10701077 }
10711078
@@ -1877,6 +1884,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
 
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 /////////////////////////////////
 ////////////////////////////////
 //////// LOCALAI code starts below here
@@ -2147,7 +2157,8 @@ class BackendServiceImpl final : public backend::Backend::Service {
         gpt_params params;
         params_parse(request, params);
 
-        llama_backend_init(params.numa);
+        llama_backend_init();
+        llama_numa_init(params.numa);
 
         // load the model
         if (!llama.load_model(params))
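The last hunk tracks an upstream llama.cpp API change: backend initialization no longer takes the NUMA flag, and NUMA setup moved into the separate llama_numa_init() call. Below is a minimal sketch of the new call order, assuming the llama.h API as used in this diff; GGML_NUMA_STRATEGY_DISABLED stands in for whatever strategy params.numa carries, and model/context loading is elided.

#include "llama.h"

int main() {
    // Backend-wide setup is now argument-free.
    llama_backend_init();

    // NUMA policy is configured in a separate call; the server passes params.numa here.
    // GGML_NUMA_STRATEGY_DISABLED is used only as a placeholder strategy.
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

    // ... load the model and create a context here, as load_model(params) does ...

    llama_backend_free();
    return 0;
}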