
Commit ebf4639

Performance testing tool based on the PyTest testing framework.
1 parent de63b7c commit ebf4639

19 files changed, +1509 −15 lines changed

test/README.md

Lines changed: 1 addition & 1 deletion
@@ -176,4 +176,4 @@ api_config = config_utils.get_nested_config("easyPerf.api")
 2. Apply appropriate tags
 3. Naming: `test_*.py`
 4. Use fixtures & marks for data management
-5. Keep custom marks concise and aligned with overall goals
+5. Keep custom marks concise and aligned with overall goals
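The conventions above are easiest to see in a small test module. Below is a minimal sketch, assuming a hypothetical `perf` mark registered in `pytest.ini` and a hypothetical `api_config` fixture; the names and values are illustrative and not part of this commit:

```python
import pytest


@pytest.fixture
def api_config():
    # Hypothetical fixture: centralizes test data instead of hard-coding it in each test.
    return {"server_url": "http://localhost:8000", "timeout_s": 60}


@pytest.mark.perf  # one coarse, goal-aligned mark rather than many fine-grained ones
def test_example_latency(api_config):
    # Placeholder assertion; a real test would call the service and check latency.
    assert api_config["timeout_s"] > 0
```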

test/README_zh.md

Lines changed: 1 addition & 1 deletion

@@ -179,4 +179,4 @@ api_config = config_utils.get_nested_config("easyPerf.api")
 2. Apply appropriate test marks
 3. Follow the naming convention: `test_*.py`
 4. Use fixtures and marks for test data management
-5. Custom marks should not be overly fine-grained and should align with the overall functional goals
+5. Custom marks should not be overly fine-grained and should align with the overall functional goals

test/common/llmperf/__init__.py

Whitespace-only changes.
Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List

import yaml
from common.llmperf.utils.token_benchmark import run_token_benchmark
from common.llmperf.utils.utils import reset_prefill_cache


def run_test_cases(test_cases, timestamp_dir, model, server_url, tokenizer_path):
    """
    Execute all test cases and return the collected summaries together with the
    indices of the failed cases.

    Parameters:
        test_cases    — List of test cases read from the configuration file
        timestamp_dir — Directory path for saving results
        model         — Model name
        server_url    — Base URL of the service
        tokenizer_path— Path to the tokenizer
    Returns:
        all_summaries — List of per-case benchmark summaries
        failed_case   — List of failed case indices
    """
    print(f"[INFO] Total {len(test_cases)} test cases to be executed")
    all_summaries = []
    failed_case = []

    # Clear proxy environment variables
    env = os.environ.copy()
    env.pop("http_proxy", None)
    env.pop("https_proxy", None)

    for i, case in enumerate(test_cases):
        print(f"\n>>> Executing test case {i + 1} <<<")
        reset_prefill_cache(env, server_url)
        # Use the same random_seed for both phases of a case to control the PC hit_rate
        random_seed = random.randint(1, 100000)
        summary = {}

        # Read parameters from the configuration file
        mean_input = case.get("mean_input_tokens", 5000)
        stddev_input = case.get("stddev_input_tokens", 0)
        mean_output = case.get("mean_output_tokens", 1000)
        stddev_output = case.get("stddev_output_tokens", 0)
        max_completed = case.get("max_num_completed_requests", 1)
        concurrent = case.get("concurrent_requests", 1)
        llm_api = case.get("llm_api", "openai")
        additional_sampling_params = case.get("additional_sampling_params", "{}")
        timeout = case.get("timeout", 60000)
        hit_rate = case.get("hit_rate", 0)

        try:
            # Determine whether two runs are needed (PC hit_rate test)
            if hit_rate == 0:
                summary = run_token_benchmark(
                    llm_api=llm_api,
                    model=model,
                    test_timeout_s=timeout,
                    max_num_completed_requests=max_completed,
                    concurrent_requests=concurrent,
                    mean_input_tokens=mean_input,
                    stddev_input_tokens=stddev_input,
                    mean_output_tokens=mean_output,
                    stddev_output_tokens=stddev_output,
                    additional_sampling_params=additional_sampling_params,
                    results_dir=str(timestamp_dir),
                    random_seed=random_seed,
                    openai_api_base=server_url + "/v1",
                    tokenizer_path=tokenizer_path,
                    user_metadata={"case_idx": i},
                )
            else:
                print(
                    f"[INFO] hit_rate > 0 detected, entering prefill mode, PC hit rate: {hit_rate} %"
                )
                # hit_rate > 0: first run in prefill mode
                prefill_mean_input = int(mean_input * hit_rate / 100)
                print(
                    f"[INFO] Prefill execution: mean_input_tokens={prefill_mean_input}"
                )
                run_token_benchmark(
                    llm_api=llm_api,
                    model=model,
                    test_timeout_s=timeout,
                    max_num_completed_requests=max_completed,
                    concurrent_requests=concurrent,
                    mean_input_tokens=prefill_mean_input,
                    stddev_input_tokens=stddev_input,
                    mean_output_tokens=2,
                    stddev_output_tokens=stddev_output,
                    additional_sampling_params=additional_sampling_params,
                    results_dir=str(timestamp_dir),
                    random_seed=random_seed,
                    openai_api_base=server_url + "/v1",
                    tokenizer_path=tokenizer_path,
                    user_metadata={"case_idx": i, "phase": "prefill"},
                )
                reset_prefill_cache(env, server_url)
                # Then run in normal mode
                print("[INFO] Prefill completed, switching to normal mode execution")
                summary = run_token_benchmark(
                    llm_api=llm_api,
                    model=model,
                    test_timeout_s=timeout,
                    max_num_completed_requests=max_completed,
                    concurrent_requests=concurrent,
                    mean_input_tokens=mean_input,
                    stddev_input_tokens=stddev_input,
                    mean_output_tokens=mean_output,
                    stddev_output_tokens=stddev_output,
                    additional_sampling_params=additional_sampling_params,
                    results_dir=str(timestamp_dir),
                    random_seed=random_seed,
                    openai_api_base=server_url + "/v1",
                    tokenizer_path=tokenizer_path,
                    user_metadata={"case_idx": i, "phase": "normal"},
                )
            all_summaries.append(summary)
        except Exception as e:
            print(f"[ERROR] Test case {i + 1} failed: {e}")
            failed_case.append(i)

    return all_summaries, failed_case


def inference_results():
    config_file = Path(__file__).parent.parent.parent / "config.yaml"
    print("[INFO] Initialization complete, starting main process")
    print(f"[INFO] Reading configuration file: {config_file}")
    with open(config_file, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    model = config.get("llm_connection", {}).get("model", "")
    server_url = config.get("llm_connection", {}).get("server_url", "")
    tokenizer_path = config.get("llm_connection", {}).get("tokenizer_path", "")
    test_cases = config.get("llmperf_test_cases", [])
    timestamp_dir = Path("results")
    timestamp_dir.mkdir(parents=True, exist_ok=True)
    print(f"[INFO] Created results directory: {timestamp_dir}")

    all_summaries, failed_cases = run_test_cases(
        test_cases, timestamp_dir, model, server_url, tokenizer_path
    )
    total = len(test_cases)
    print(
        f"\n[INFO] All tests completed! Success: {total - len(failed_cases)}/{total}"
    )
    if failed_cases:
        print(f"[WARN] Failed case indices: {failed_cases}")
    return all_summaries
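
The module above drives the whole benchmark from `config.yaml`: an `llm_connection` section plus an `llmperf_test_cases` list. A minimal sketch of that structure is shown below as the Python dict that `yaml.safe_load` would return; only the keys are taken from the code above, the concrete values are illustrative assumptions:

```python
# Illustrative only: mirrors the keys read by run_test_cases() / inference_results().
example_config = {
    "llm_connection": {
        "model": "my-model",                     # assumed value
        "server_url": "http://localhost:8000",   # assumed value
        "tokenizer_path": "/path/to/tokenizer",  # assumed value
    },
    "llmperf_test_cases": [
        {
            "mean_input_tokens": 5000,
            "stddev_input_tokens": 0,
            "mean_output_tokens": 1000,
            "stddev_output_tokens": 0,
            "max_num_completed_requests": 1,
            "concurrent_requests": 1,
            "llm_api": "openai",
            "additional_sampling_params": "{}",
            "timeout": 60000,
            "hit_rate": 0,  # > 0 triggers the two-phase prefill run
        }
    ],
}
```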

test/common/llmperf/utils/__init__.py

Whitespace-only changes.
test/common/llmperf/utils/common_metrics.py

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
# TODO (Avnishn): compute metrics in class
INTER_TOKEN_LAT = "inter_token_latency_s"
TTFT = "ttft_s"
E2E_LAT = "end_to_end_latency_s"
NUM_INPUT_TOKENS = "number_input_tokens"
NUM_OUTPUT_TOKENS = "number_output_tokens"
NUM_TOTAL_TOKENS = "number_total_tokens"
REQ_OUTPUT_THROUGHPUT = "request_output_throughput_token_per_s"
ERROR_MSG = "error_msg"
ERROR_CODE = "error_code"
ERROR_CODE_FREQ = "error_code_frequency"
NUM_ERRORS = "number_errors"
OUTPUT_THROUGHPUT = "mean_output_throughput_token_per_s"
NUM_COMPLETED_REQUESTS = "num_completed_requests"
COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min"
ERROR_RATE = "error_rate"
NUM_REQ_STARTED = "num_requests_started"
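
These constants are the dictionary keys used for the per-request metrics produced by the client further down. A minimal sketch of how a list of such per-request metric dicts could be aggregated with them; the aggregation itself is an assumption for illustration, not code from this commit:

```python
from statistics import mean, quantiles

from common.llmperf.utils import common_metrics


def summarize(per_request_metrics):
    """Aggregate per-request metric dicts keyed by the constants above (illustrative)."""
    ttfts = [
        m[common_metrics.TTFT]
        for m in per_request_metrics
        if not m[common_metrics.ERROR_CODE]  # keep only successful requests
    ]
    if not ttfts:
        return {}
    return {
        "mean_ttft_s": mean(ttfts),
        # quantiles() with n=100 returns percentile cut points; index 98 ~= p99
        "p99_ttft_s": quantiles(ttfts, n=100)[98] if len(ttfts) > 1 else ttfts[0],
    }
```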
test/common/llmperf/utils/models.py

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
from typing import Any, Dict, Optional, Tuple

from pydantic import BaseModel


class RequestConfig(BaseModel):
    """The configuration for a request to the LLM API.

    Args:
        model: The model to use.
        prompt: The prompt to provide to the LLM API, as a (text, token count) tuple.
        sampling_params: Additional sampling parameters to send with the request.
            For more information, see the Router app's documentation for the
            completions endpoint.
        llm_api: The name of the LLM API to send the request to.
        metadata: Additional metadata to attach to the request for logging or
            validation purposes.
        openai_api_base: Base URL of the OpenAI-compatible API to send the request to.
    """

    model: str
    prompt: Tuple[str, int]
    sampling_params: Optional[Dict[str, Any]] = None
    llm_api: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    openai_api_base: Optional[str] = ""
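
RequestConfig bundles everything one call needs. A minimal construction sketch; the model name, prompt text, token count, and endpoint URL are assumptions for illustration:

```python
from common.llmperf.utils.models import RequestConfig

request_config = RequestConfig(
    model="my-model",                                    # assumed model name
    prompt=("Explain KV caching in one line.", 8),       # (text, prompt token count)
    sampling_params={"max_tokens": 128, "temperature": 0.0},
    llm_api="openai",
    metadata={"case_idx": 0},
    openai_api_base="http://localhost:8000/v1",          # assumed endpoint
)
```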
Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
import json
import os
import time
from typing import Any, Dict, Tuple

import requests
from common.llmperf.utils import common_metrics
from common.llmperf.utils.models import RequestConfig


class OpenAIChatCompletionsClient:
    """
    Client used for sending HTTP requests, receiving the token stream, and
    measuring latency.
    """

    def llm_request(
        self, request_config: RequestConfig
    ) -> Tuple[Dict[str, Any], str, RequestConfig]:
        prompt, prompt_len = request_config.prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        model = request_config.model
        body = {
            "model": model,
            "messages": message,
            "stream": True,
            "ignore_eos": True,
        }
        sampling_params = request_config.sampling_params
        body.update(sampling_params or {})

        time_to_next_token = []
        tokens_received = 0
        ttft = 0.0
        error_response_code = None
        generated_text = ""
        error_msg = ""
        output_throughput = 0.0
        total_request_time = 0.0
        flag = False  # True once the first content token has been received

        metrics: Dict[str, Any] = {}

        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        start_time = time.monotonic()
        most_recent_received_token_time = start_time

        address = request_config.openai_api_base

        if not address:
            raise ValueError("the environment variable OPENAI_API_BASE must be set.")
        key = os.environ.get("OPENAI_API_KEY", "secret_abcdefg")
        if not key:
            raise ValueError("the environment variable OPENAI_API_KEY must be set.")
        headers = {"Authorization": f"Bearer {key}"}
        if not address.endswith("/"):
            address = address + "/"
        address += "chat/completions"
        try:
            with requests.post(
                address,
                json=body,
                stream=True,
                timeout=180,
                headers=headers,
            ) as response:
                if response.status_code != 200:
                    error_msg = response.text
                    error_response_code = response.status_code
                    response.raise_for_status()

                for chunk in response.iter_lines(chunk_size=None):
                    if not chunk:
                        continue
                    stem = b"data: "
                    if chunk.startswith(stem):
                        chunk = chunk[len(stem) :]
                    # Data might already be bytes or str
                    if isinstance(chunk, bytes):
                        chunk = chunk.decode("utf-8", errors="ignore")
                    if chunk.strip() == "[DONE]":
                        continue
                    tokens_received += 1
                    data = json.loads(chunk)
                    if "error" in data:
                        error_msg = data["error"]["message"]
                        error_response_code = data["error"]["code"]
                        raise RuntimeError(error_msg)
                    delta = data["choices"][0]["delta"]
                    content = delta.get("content", None) or delta.get(
                        "reasoning_content", ""
                    )
                    if content:
                        if tokens_received != 0 and not flag:
                            # First content chunk: record time to first token
                            ttft = time.monotonic() - start_time
                            flag = True
                        else:
                            time_to_next_token.append(
                                time.monotonic() - most_recent_received_token_time
                            )
                        most_recent_received_token_time = time.monotonic()
                        generated_text += content

            total_request_time = time.monotonic() - start_time
            if total_request_time > 0:
                output_throughput = tokens_received / total_request_time

        except Exception as e:
            metrics[common_metrics.ERROR_MSG] = error_msg
            metrics[common_metrics.ERROR_CODE] = error_response_code
            print(f"Warning Or Error: {e}")
            print(error_response_code)

        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token)
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config
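
Putting the pieces together, a single streamed request could be issued as sketched below. The module path for the client, the endpoint URL, and the prompt are assumptions, and an OpenAI-compatible server must be running for the call to succeed:

```python
from common.llmperf.utils.models import RequestConfig
# Assumed module path for the client class shown above.
from common.llmperf.utils.openai_chat_completions_client import OpenAIChatCompletionsClient

client = OpenAIChatCompletionsClient()
metrics, text, _ = client.llm_request(
    RequestConfig(
        model="my-model",                            # assumed
        prompt=("Say hello in five words.", 6),      # (text, prompt token count)
        sampling_params={"max_tokens": 32},
        openai_api_base="http://localhost:8000/v1",  # assumed
    )
)
# Keys follow common_metrics: "ttft_s" and "end_to_end_latency_s".
print(metrics["ttft_s"], metrics["end_to_end_latency_s"], text)
```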
