@@ -391,18 +391,19 @@ struct llama_client_slot
     double t_token_generation; // ms
 
     void reset() {
-        num_prompt_tokens = 0;
-        generated_text = "";
-        truncated = false;
-        stopped_eos = false;
-        stopped_word = false;
-        stopped_limit = false;
-        stopping_word = "";
-        multibyte_pending = 0;
-        n_past = 0;
-        sent_count = 0;
+        num_prompt_tokens      = 0;
+        generated_text         = "";
+        truncated              = false;
+        stopped_eos            = false;
+        stopped_word           = false;
+        stopped_limit          = false;
+        stopping_word          = "";
+        multibyte_pending      = 0;
+        n_past                 = 0;
+        sent_count             = 0;
         sent_token_probs_index = 0;
-        infill = false;
+        infill                 = false;
+
         generated_token_probs.clear();
 
         for (slot_image &img : images)
@@ -882,7 +883,8 @@ struct llama_server_context
 
         // wait until system prompt load
         system_need_update = true;
-        while (system_need_update) {
+        while (system_need_update)
+        {
             std::this_thread::sleep_for(std::chrono::milliseconds(5));
         }
         // system prompt loaded, continue
@@ -997,26 +999,31 @@ struct llama_server_context
             const std::string str_test = slot.generated_text.substr(pos);
             bool is_stop_full = false;
             size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos) {
+            if (stop_pos != std::string::npos)
+            {
                 is_stop_full = true;
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.sent_count, slot.generated_text.size());
-            } else {
+            }
+            else
+            {
                 is_stop_full = false;
                 stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
             }
 
             // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+            {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
                 slot.sent_count += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
             slot.add_token_string(result);
-            if (slot.params.stream) {
+            if (slot.params.stream)
+            {
                 send_partial_response(slot, result);
             }
         }
@@ -1051,6 +1058,7 @@ struct llama_server_context
             {"stopped_limit", slot.stopped_limit},
             {"stopping_word", slot.stopping_word},
         });
+
         return slot.has_next_token; // continue
     }
 
@@ -1089,7 +1097,8 @@ struct llama_server_context
         return slot.images.size() > 0;
     }
 
-    void send_error(int id, std::string error) {
+    void send_error(int id, std::string error)
+    {
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = id;
@@ -1098,11 +1107,13 @@ struct llama_server_context
         queue_results.push_back(res);
     }
 
-    json get_model_props() {
+    json get_model_props()
+    {
         return get_formated_generation(slots[0]);
     }
 
-    json get_formated_generation(llama_client_slot &slot) {
+    json get_formated_generation(llama_client_slot &slot)
+    {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@@ -1134,19 +1145,22 @@ struct llama_server_context
         };
     }
 
-    void send_partial_response(llama_client_slot & slot, completion_token_output tkn) {
+    void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
+    {
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.error = false;
         res.stop = false;
+
         res.result_json = json
         {
             {"content", tkn.text_to_send},
             {"stop", false},
             {"slot_id", slot.id},
             {"multimodal", multimodal}
         };
+
         if (slot.sparams.n_probs > 0)
         {
             std::vector<completion_token_output> probs_output = {};
@@ -1160,15 +1174,18 @@ struct llama_server_context
             slot.sent_token_probs_index = probs_stop_pos;
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }
+
         queue_results.push_back(res);
     }
 
-    void send_final_response(llama_client_slot & slot) {
+    void send_final_response(llama_client_slot &slot)
+    {
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.error = false;
         res.stop = true;
+
         res.result_json = json
         {
             {"content", !slot.params.stream ? slot.generated_text : ""},
@@ -1191,20 +1208,25 @@ struct llama_server_context
         if (slot.sparams.n_probs > 0)
         {
             std::vector<completion_token_output> probs = {};
-            if (!slot.params.stream && slot.stopped_word) {
+            if (!slot.params.stream && slot.stopped_word)
+            {
                 const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
                 probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
-            } else {
+            }
+            else
+            {
                 probs = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
                     slot.generated_token_probs.begin() + slot.sent_token_probs_index);
             }
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
         }
+
         queue_results.push_back(res);
     }
 
-    void send_embedding(llama_client_slot & slot) {
+    void send_embedding(llama_client_slot &slot)
+    {
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
@@ -1234,7 +1256,8 @@ struct llama_server_context
         queue_results.push_back(res);
     }
 
-    int request_completion(json data, bool infill) {
+    int request_completion(json data, bool infill)
+    {
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
@@ -1245,17 +1268,22 @@ struct llama_server_context
         return task.id;
     }
 
-    task_result next_result(int task_id) {
-        while (true) {
+    task_result next_result(int task_id)
+    {
+        while (true)
+        {
             std::this_thread::sleep_for(std::chrono::microseconds(5));
             std::lock_guard<std::mutex> lock(mutex_results);
 
-            if (queue_results.empty()) {
+            if (queue_results.empty())
+            {
                 continue;
             }
 
-            for (int i = 0; i < (int) queue_results.size(); i++) {
-                if (queue_results[i].id == task_id) {
+            for (int i = 0; i < (int) queue_results.size(); i++)
+            {
+                if (queue_results[i].id == task_id)
+                {
                     task_result res = queue_results[i];
                     queue_results.erase(queue_results.begin() + i);
                     return res;
@@ -1335,7 +1363,8 @@ struct llama_server_context
         return true;
     }
 
-    void request_cancel(int task_id) {
+    void request_cancel(int task_id)
+    {
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
@@ -1344,9 +1373,11 @@ struct llama_server_context
         queue_tasks.push_back(task);
     }
 
-    void process_tasks() {
+    void process_tasks()
+    {
         std::lock_guard<std::mutex> lock(mutex_tasks);
-        while (!queue_tasks.empty()) {
+        while (!queue_tasks.empty())
+        {
             task_server task = queue_tasks.front();
             queue_tasks.erase(queue_tasks.begin());
             switch (task.type)
@@ -1379,8 +1410,10 @@ struct llama_server_context
                     }
                 } break;
                 case CANCEL_TASK: { // release slot linked with the task id
-                    for (auto & slot : slots) {
-                        if (slot.task_id == task.target_id) {
+                    for (auto & slot : slots)
+                    {
+                        if (slot.task_id == task.target_id)
+                        {
                             slot.release();
                             break;
                         }
@@ -2006,7 +2039,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         else if (arg == "--embedding")
         {
             params.embedding = true;
-        } else if (arg == "-cb" || arg == "--cont-batching")
+        }
+        else if (arg == "-cb" || arg == "--cont-batching")
         {
             params.cont_batching = true;
         }
@@ -2047,7 +2081,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             );
             llama.process_system_prompt_data(json::parse(systm_content));
         }
-        else if (arg == "--mmproj") {
+        else if (arg == "--mmproj")
+        {
             if (++i >= argc)
             {
                 invalid_param = true;
@@ -2163,6 +2198,7 @@ int main(int argc, char **argv)
 
     LOG_INFO("build info", {{"build", BUILD_NUMBER},
                             {"commit", BUILD_COMMIT}});
+
     LOG_INFO("system info", {
                                 {"n_threads", params.n_threads},
                                 {"n_threads_batch", params.n_threads_batch},
@@ -2239,10 +2275,12 @@ int main(int argc, char **argv)
                 return;
             }
         } else {
-            const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
-                while (true) {
+            const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
+            {
+                while (true)
+                {
                     task_result result = llama.next_result(task_id);
-                    if (!result.error) {
+                    if (!result.error) {
                         const std::string str =
                             "data: " +
                             result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
@@ -2264,10 +2302,13 @@ int main(int argc, char **argv)
                 sink.done();
                 return true;
             };
-            auto on_complete = [task_id, &llama] (bool) {
+
+            auto on_complete = [task_id, &llama] (bool)
+            {
                 // cancel
                 llama.request_cancel(task_id);
             };
+
             res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
     });
@@ -2279,7 +2320,8 @@ int main(int argc, char **argv)
        if (!json_value(data, "stream", false)) {
            std::string completion_text;
            task_result result = llama.next_result(task_id);
-            if (!result.error && result.stop) {
+            if (!result.error && result.stop)
+            {
                res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
            }
            else
@@ -2290,9 +2332,10 @@ int main(int argc, char **argv)
            }
        } else {
            const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
-                while (true) {
+                while (true)
+                {
                    task_result result = llama.next_result(task_id);
-                    if (!result.error) {
+                    if (!result.error) {
                        const std::string str =
                            "data: " +
                            result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
@@ -2304,20 +2347,28 @@ int main(int argc, char **argv)
                        {
                            return false;
                        }
-                        if (result.stop) {
+                        if (result.stop)
+                        {
                            break;
                        }
-                    } else {
+                    }
+                    else
+                    {
                        break;
                    }
                }
+
                sink.done();
+
                return true;
            };
-            auto on_complete = [task_id, &llama] (bool) {
+
+            auto on_complete = [task_id, &llama] (bool)
+            {
                // cancel
                llama.request_cancel(task_id);
            };
+
            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
        }
    });