 
 start_time = time.time()
 SERVER_ENDPOINT = "http://localhost:3928"
-TOTAL_REQUESTS = 16
-N_PARALLEL = 4
-MAX_CTX_FOR_ONE_SEQUENCE = 512
-N_CTX = MAX_CTX_FOR_ONE_SEQUENCE * N_PARALLEL  # this number relates to the GPU memory reserved for the KV cache
+TOTAL_USERS = 40
+NUM_ROUNDS = 10
+MAX_TOKENS = 500
+N_PARALLEL = 32
+MAX_CTX_FOR_ONE_SEQUENCE = 1000
+# this number relates to the GPU memory reserved for the KV cache
+N_CTX = MAX_CTX_FOR_ONE_SEQUENCE * N_PARALLEL
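+# with these settings the benchmark sends TOTAL_USERS * NUM_ROUNDS = 400
+# requests in total, while the server decodes at most N_PARALLEL = 32
+# sequences concurrently, each with its own MAX_CTX_FOR_ONE_SEQUENCE-token slot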
+
 
 def start_server():
     import subprocess
@@ -21,80 +25,108 @@ def start_server():
 
 def load_model():
     headers = {"Content-Type": "application/json"}
-    data = {"llama_model_path": "/mnt/nas/gguf-models/meta-llama3.1-8b-instruct-q4km.gguf", "model_alias": "meta-llama3.1-8b-instruct",
-            "model": "meta-llama3.1-8b-instruct", "ctx_len": N_CTX, "n_batch": 2048, "ngl": 300, "model_type": "llm", "n_parallel": N_PARALLEL}
+    data = {"llama_model_path": "/mnt/nas/gguf-models/meta-llama3.1-8b-instruct-q4km.gguf", "model_alias": "meta-llama3.1-8b-instruct", "engine": "cortex.llamacpp",
+            "model": "meta-llama3.1-8b-instruct", "ctx_len": N_CTX, "ngl": 300, "model_type": "llm", "n_parallel": N_PARALLEL}
+
     result = requests.post(SERVER_ENDPOINT + "/loadmodel",
                            headers=headers, json=data)
+    # result = requests.post(SERVER_ENDPOINT + "/inferences/server/loadmodel",
+    #                        headers=headers, json=data)
     print(result.json())
 
 
-async def send_request(session, prompt):
+async def send_request(session, prompt, sleep=0):
+    await asyncio.sleep(sleep)
     headers = {"Content-Type": "application/json"}
-    data = {"model": "meta-llama3.1-8b-instruct",
+    data = {"model": "meta-llama3.1-8b-instruct", "max_tokens": MAX_TOKENS, "stop": ["<|eom_id|>", "<|end_of_text|>", "<|eot_id|>"], "engine": "cortex.llamacpp",
             "messages": [{"role": "user", "content": prompt}]}
     async with session.post(SERVER_ENDPOINT + "/v1/chat/completions", headers=headers, json=data) as resp:
         result = await resp.json()
         return result
 
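+# each simulated user sends the same prompt NUM_ROUNDS times; round i is
+# delayed by i seconds plus up to 0.2 s of random jitter, so one user's
+# requests are spread out instead of arriving as a single burst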
+async def one_user(session, prompt):
+    tasks = [send_request(session, prompt, random.random() * 0.2 + i) for i in range(NUM_ROUNDS)]
+    results = await asyncio.gather(*tasks)
+    return results
+
 
 async def send_request_sequence():
     # warm up
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout()) as session:
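+        # an empty aiohttp.ClientTimeout() clears aiohttp's default
+        # 5-minute total timeout, so long generations are not cut off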
         res = await send_request(session, "What is GPU?")
 
     start = time.time()
     total_token_processed = 0
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout()) as session:
 
         tasks = []
         prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad jokes",
                    "Write a quick sort function", "What is the price of Nvidia H100?", "Who won the world series in 2020?"]
-        for number in range(TOTAL_REQUESTS):
+        for number in range(TOTAL_USERS):
             res = await send_request(session, random.choice(prompts))
             if res.get("usage"):
                 total_token_processed += res["usage"]["total_tokens"]
             else:
                 print(res)
-
+
     end = time.time()
     print("Finished in", end - start, "s")
     print("Total token:", total_token_processed)
-    print("Throughput when run in sequence:", total_token_processed / (end - start), "tokens/s")
+    print("Throughput when run in sequence:",
+          total_token_processed / (end - start), "tokens/s")
     print("------------------------------------------------------------------------")
 
 
 async def main():
     # warm up
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout()) as session:
         res = await send_request(session, "What is GPU?")
 
     start = time.time()
     total_token_processed = 0
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout()) as session:
 
         tasks = []
-        prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad jokes",
-                   "Write a quick sort function", "What is the price of Nvidia H100?", "Who won the world series in 2020?"]
-        for number in range(TOTAL_REQUESTS):
+        prompts = [
+            "What is GPU?",
+            "Who won the world cup 2022?",
+            "Tell me many dad jokes,",
+            "Write a quick sort function,",
+            "What is the price of Nvidia H100?",
+            "Who won the world series in 2020?",
+            "Tell me a very long story,",
+            "Who is the best football player in the world?",
+            "Tell me about compilers,",
+            "Tell me about AI,"]
+        for number in range(TOTAL_USERS):
             tasks.append(asyncio.ensure_future(
-                send_request(session, random.choice(prompts))))
-
-        results = await asyncio.gather(*tasks)
-        for res in results:
-            # print(res)
-            if res.get("usage"):
-                total_token_processed += res["usage"]["total_tokens"]
-            else:
-                print(res)
+                one_user(session, random.choice(prompts))))
+
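+        # run all users concurrently; gather returns one list of responses
+        # per user, in task-creation order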
+        list_results = await asyncio.gather(*tasks)
+        for results in list_results:
+            for res in results:
+                # print(res)
+                if res.get("usage"):
+                    total_token_processed += res["usage"]["total_tokens"]
+                else:
+                    print(res)
     end = time.time()
     print("Finished in", end - start, "s")
     print("Total token:", total_token_processed)
-    print("Throughput when run parallel:", total_token_processed / (end - start), "tokens/s")
+    print("Throughput when run in parallel:",
+          total_token_processed / (end - start), "tokens/s")
     print("------------------------------------------------------------------------")
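+    # dump every completion to result.log for a quick manual check of the
+    # generated text; responses without choices are skipped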
+    with open("result.log", "w") as writer:
+        for results in list_results:
+            for res in results:
+                try:
+                    writer.write(res["choices"][0]["message"]["content"] + "\n\n")
+                except (KeyError, IndexError, TypeError):
+                    continue
 # start_server()
 load_model()
 
 asyncio.run(main())
 
-asyncio.run(send_request_sequence())
+# asyncio.run(send_request_sequence())
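+# note: start_time was taken before load_model(), so the figure printed
+# below includes model load and warm-up time as well as the benchmark itself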
 print("--- %s seconds ---" % (time.time() - start_time))