2525#include < thread>
2626#include < mutex>
2727#include < chrono>
28+ #include < condition_variable>
2829
2930#ifndef SERVER_VERBOSE
3031#define SERVER_VERBOSE 1
@@ -541,7 +542,9 @@ struct llama_server_context
541542 std::vector<task_result> queue_results;
542543 std::vector<task_multi> queue_multitasks;
543544 std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
545+ std::condition_variable condition_tasks;
544546 std::mutex mutex_results;
547+ std::condition_variable condition_results;
545548
546549 ~llama_server_context ()
547550 {
@@ -1169,14 +1172,15 @@ struct llama_server_context
11691172
11701173 void send_error (task_server& task, std::string error)
11711174 {
1172- std::lock_guard <std::mutex> lock (mutex_results);
1175+ std::unique_lock <std::mutex> lock (mutex_results);
11731176 task_result res;
11741177 res.id = task.id ;
11751178 res.multitask_id = task.multitask_id ;
11761179 res.stop = false ;
11771180 res.error = true ;
11781181 res.result_json = { { " content" , error } };
11791182 queue_results.push_back (res);
1183+ condition_results.notify_all ();
11801184 }
11811185
11821186 void add_multi_task (int id, std::vector<int >& sub_ids)
@@ -1186,6 +1190,7 @@ struct llama_server_context
11861190 multi.id = id;
11871191 std::copy (sub_ids.begin (), sub_ids.end (), std::inserter (multi.subtasks_remaining , multi.subtasks_remaining .end ()));
11881192 queue_multitasks.push_back (multi);
1193+ condition_tasks.notify_one ();
11891194 }
11901195
11911196 void update_multi_task (int multitask_id, int subtask_id, task_result& result)
@@ -1197,6 +1202,7 @@ struct llama_server_context
11971202 {
11981203 multitask.subtasks_remaining .erase (subtask_id);
11991204 multitask.results .push_back (result);
1205+ condition_tasks.notify_one ();
12001206 }
12011207 }
12021208 }
@@ -1244,7 +1250,7 @@ struct llama_server_context
12441250
12451251 void send_partial_response (llama_client_slot &slot, completion_token_output tkn)
12461252 {
1247- std::lock_guard <std::mutex> lock (mutex_results);
1253+ std::unique_lock <std::mutex> lock (mutex_results);
12481254 task_result res;
12491255 res.id = slot.task_id ;
12501256 res.multitask_id = slot.multitask_id ;
@@ -1280,11 +1286,12 @@ struct llama_server_context
12801286 }
12811287
12821288 queue_results.push_back (res);
1289+ condition_results.notify_all ();
12831290 }
12841291
12851292 void send_final_response (llama_client_slot &slot)
12861293 {
1287- std::lock_guard <std::mutex> lock (mutex_results);
1294+ std::unique_lock <std::mutex> lock (mutex_results);
12881295 task_result res;
12891296 res.id = slot.task_id ;
12901297 res.multitask_id = slot.multitask_id ;
@@ -1340,11 +1347,12 @@ struct llama_server_context
13401347 }
13411348
13421349 queue_results.push_back (res);
1350+ condition_results.notify_all ();
13431351 }
13441352
13451353 void send_embedding (llama_client_slot &slot)
13461354 {
1347- std::lock_guard <std::mutex> lock (mutex_results);
1355+ std::unique_lock <std::mutex> lock (mutex_results);
13481356 task_result res;
13491357 res.id = slot.task_id ;
13501358 res.multitask_id = slot.multitask_id ;
@@ -1372,6 +1380,7 @@ struct llama_server_context
13721380 };
13731381 }
13741382 queue_results.push_back (res);
1383+ condition_results.notify_all ();
13751384 }
13761385
13771386 int request_completion (json data, bool infill, bool embedding, int multitask_id)
@@ -1395,20 +1404,18 @@ struct llama_server_context
13951404
13961405 // otherwise, it's a single-prompt task, we actually queue it
13971406 queue_tasks.push_back (task);
1407+ condition_tasks.notify_one ();
13981408 return task.id ;
13991409 }
14001410
14011411 task_result next_result (int task_id)
14021412 {
14031413 while (true )
14041414 {
1405- std::this_thread::sleep_for (std::chrono::microseconds (5 ));
1406- std::lock_guard<std::mutex> lock (mutex_results);
1407-
1408- if (queue_results.empty ())
1409- {
1410- continue ;
1411- }
1415+ std::unique_lock<std::mutex> lock (mutex_results);
1416+ condition_results.wait (lock, [&]{
1417+ return !queue_results.empty ();
1418+ });
14121419
14131420 for (int i = 0 ; i < (int ) queue_results.size (); i++)
14141421 {
@@ -1504,12 +1511,13 @@ struct llama_server_context
15041511
15051512 void request_cancel (int task_id)
15061513 {
1507- std::lock_guard <std::mutex> lock (mutex_tasks);
1514+ std::unique_lock <std::mutex> lock (mutex_tasks);
15081515 task_server task;
15091516 task.id = id_gen++;
15101517 task.type = CANCEL_TASK;
15111518 task.target_id = task_id;
15121519 queue_tasks.push_back (task);
1520+ condition_tasks.notify_one ();
15131521 }
15141522
15151523 int split_multiprompt_task (task_server& multiprompt_task)
@@ -1535,7 +1543,7 @@ struct llama_server_context
15351543
15361544 void process_tasks ()
15371545 {
1538- std::lock_guard <std::mutex> lock (mutex_tasks);
1546+ std::unique_lock <std::mutex> lock (mutex_tasks);
15391547 while (!queue_tasks.empty ())
15401548 {
15411549 task_server task = queue_tasks.front ();
@@ -1607,6 +1615,7 @@ struct llama_server_context
16071615
16081616 std::lock_guard<std::mutex> lock (mutex_results);
16091617 queue_results.push_back (aggregate_result);
1618+ condition_results.notify_all ();
16101619
16111620 queue_iterator = queue_multitasks.erase (queue_iterator);
16121621 }
@@ -1637,8 +1646,10 @@ struct llama_server_context
16371646 LOG_TEE (" all slots are idle and system prompt is empty, clear the KV cache\n " );
16381647 kv_cache_clear ();
16391648 }
1640- // avoid 100% usage of cpu all time
1641- std::this_thread::sleep_for (std::chrono::milliseconds (5 ));
1649+ std::unique_lock<std::mutex> lock (mutex_tasks);
1650+ condition_tasks.wait (lock, [&]{
1651+ return !queue_tasks.empty ();
1652+ });
16421653 }
16431654
16441655 for (llama_client_slot &slot : slots)
0 commit comments