2 changes: 1 addition & 1 deletion test/README.md
@@ -176,4 +176,4 @@ api_config = config_utils.get_nested_config("easyPerf.api")
2. Apply appropriate tags
3. Naming: `test_*.py`
4. Use fixtures & marks for data management
5. Keep custom marks concise and aligned with overall goals
5. Keep custom marks concise and aligned with overall goals
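To make guidelines 2-5 above concrete, a minimal pytest sketch follows; the `perf` mark and the `sample_prompt` fixture are hypothetical names used only for illustration and would need to be registered/defined in the project's own pytest configuration:

```python
import pytest


@pytest.fixture
def sample_prompt():
    # Test data managed through a fixture (guideline 4).
    return "Summarize the following text in one sentence."


@pytest.mark.perf  # hypothetical custom mark, kept coarse-grained (guideline 5)
def test_basic_prompt_fixture(sample_prompt):
    # Placeholder check; a real test would exercise the service under test.
    assert isinstance(sample_prompt, str) and sample_prompt
```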
2 changes: 1 addition & 1 deletion test/README_zh.md
@@ -179,4 +179,4 @@ api_config = config_utils.get_nested_config("easyPerf.api")
2. Apply appropriate test marks
3. Follow the naming convention: `test_*.py`
4. Use fixtures and marks for test data management
5. Custom marks should not be too fine-grained and should align with the overall feature goals
5. Custom marks should not be too fine-grained and should align with the overall feature goals
Empty file.
149 changes: 149 additions & 0 deletions test/common/llmperf/run_inference.py
@@ -0,0 +1,149 @@
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List

import yaml
from common.llmperf.utils.token_benchmark import run_token_benchmark
from common.llmperf.utils.utils import reset_prefill_cache


def run_test_cases(test_cases, timestamp_dir, model, server_url, tokenizer_path):

"""
Execute all test cases and return the benchmark summaries together with the indices of any failed cases.
Parameters:
test_cases — List of test cases read from the configuration file
timestamp_dir — Directory path in which to save results
model — Model name
server_url — Base URL of the service
tokenizer_path — Path to the tokenizer
Returns:
all_summaries — List of per-case benchmark summaries
failed_cases — List of indices of failed cases
"""
print(f"[INFO] Total {len(test_cases)} test cases to be executed")
all_summaries = []
failed_case = []

# Clear proxy environment variables
env = os.environ.copy()
env.pop("http_proxy", None)
env.pop("https_proxy", None)

for i, case in enumerate(test_cases):
print(f"\n>>> Executing test case {i + 1} <<<")
reset_prefill_cache(env, server_url)
# Use the same random_seed for both runs of a case so the prefill pass can warm the prefix cache (PC) for the hit_rate test
random_seed = random.randint(1, 100000)
summary = {}

# Read parameters from configuration file
mean_input = case.get("mean_input_tokens", 5000)
stddev_input = case.get("stddev_input_tokens", 0)
mean_output = case.get("mean_output_tokens", 1000)
stddev_output = case.get("stddev_output_tokens", 0)
max_completed = case.get("max_num_completed_requests", 1)
concurrent = case.get("concurrent_requests", 1)
llm_api = case.get("llm_api", "openai")
additional_sampling_params = case.get("additional_sampling_params", "{}")
timeout = case.get("timeout", 60000)
hit_rate = case.get("hit_rate", 0)

try:
# Determine if two runs are needed (PC hit_rate test)
if hit_rate == 0:
summary = run_token_benchmark(
llm_api=llm_api,
model=model,
test_timeout_s=timeout,
max_num_completed_requests=max_completed,
concurrent_requests=concurrent,
mean_input_tokens=mean_input,
stddev_input_tokens=stddev_input,
mean_output_tokens=mean_output,
stddev_output_tokens=stddev_output,
additional_sampling_params=additional_sampling_params,
results_dir=str(timestamp_dir),
random_seed=random_seed,
openai_api_base=server_url + "/v1",
tokenizer_path=tokenizer_path,
user_metadata={"case_idx": i},
)
else:
print(
f"[INFO] hit_rate > 0 detected, entering prefill mode, PC hit rate: {hit_rate} %"
)
# hit_rate > 0: first prefill mode
prefill_mean_input = int(mean_input * hit_rate / 100)
print(
f"[INFO] Prefill execution: mean_input_tokens={prefill_mean_input}"
)
run_token_benchmark(
llm_api=llm_api,
model=model,
test_timeout_s=timeout,
max_num_completed_requests=max_completed,
concurrent_requests=concurrent,
mean_input_tokens=prefill_mean_input,
stddev_input_tokens=stddev_input,
mean_output_tokens=2,
stddev_output_tokens=stddev_output,
additional_sampling_params=additional_sampling_params,
results_dir=str(timestamp_dir),
random_seed=random_seed,
openai_api_base=server_url + "/v1",
tokenizer_path=tokenizer_path,
user_metadata={"case_idx": i, "phase": "prefill"},
)
reset_prefill_cache(env, server_url)
# Then run normal mode
print("[INFO] Prefill completed, switching to normal mode execution")
summary = run_token_benchmark(
llm_api=llm_api,
model=model,
test_timeout_s=timeout,
max_num_completed_requests=max_completed,
concurrent_requests=concurrent,
mean_input_tokens=mean_input,
stddev_input_tokens=stddev_input,
mean_output_tokens=mean_output,
stddev_output_tokens=stddev_output,
additional_sampling_params=additional_sampling_params,
results_dir=str(timestamp_dir),
random_seed=random_seed,
openai_api_base=server_url + "/v1",
tokenizer_path=tokenizer_path,
user_metadata={"case_idx": i, "phase": "normal"},
)
all_summaries.append(summary)
except Exception as e:
print(f"[ERROR] Test case {i + 1} failed: {e}")
failed_case.append(i)

return all_summaries, failed_case


def inference_results():
config_file = Path(__file__).parent.parent.parent / "config.yaml"
print("[INFO] Initialization complete, starting main process")
print(f"[INFO] Reading configuration file: {config_file}")
with open(config_file, "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
model = config.get("llm_connection", {}).get("model", "")
server_url = config.get("llm_connection", {}).get("server_url", "")
tokenizer_path = config.get("llm_connection", {}).get("tokenizer_path", "")
test_cases = config.get("llmperf_test_cases", [])
timestamp_dir = Path("results")
timestamp_dir.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Created results directory: {timestamp_dir}")

all_summaries, failed_cases = run_test_cases(
test_cases, timestamp_dir, model, server_url, tokenizer_path
)
total = len(test_cases)
print(
f"\n[INFO] All tests completed! Success: {total - len(failed_cases)}/{total}"
)
if failed_cases:
print(f"[WARN] Failed case indices: {failed_cases}")
return all_summaries
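For orientation, `inference_results()` above expects `config.yaml` to provide an `llm_connection` section and an `llmperf_test_cases` list. A minimal sketch of what `yaml.safe_load` might return for such a file, with illustrative values only (each per-case key falls back to the default used in `run_test_cases` when omitted):

```python
config = {
    "llm_connection": {
        "model": "example-model",                # illustrative model name
        "server_url": "http://localhost:8000",   # "/v1" is appended by run_test_cases
        "tokenizer_path": "/path/to/tokenizer",  # illustrative path
    },
    "llmperf_test_cases": [
        {
            "mean_input_tokens": 5000,
            "stddev_input_tokens": 0,
            "mean_output_tokens": 1000,
            "stddev_output_tokens": 0,
            "max_num_completed_requests": 1,
            "concurrent_requests": 1,
            "llm_api": "openai",
            "additional_sampling_params": "{}",
            "timeout": 60000,
            # hit_rate > 0 triggers the two-phase prefill/normal run.
            "hit_rate": 50,
        },
    ],
}
```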
Empty file.
17 changes: 17 additions & 0 deletions test/common/llmperf/utils/common_metrics.py
@@ -0,0 +1,17 @@
# TODO (Avnishn): compute metrics in class
INTER_TOKEN_LAT = "inter_token_latency_s"
TTFT = "ttft_s"
E2E_LAT = "end_to_end_latency_s"
NUM_INPUT_TOKENS = "number_input_tokens"
NUM_OUTPUT_TOKENS = "number_output_tokens"
NUM_TOTAL_TOKENS = "number_total_tokens"
REQ_OUTPUT_THROUGHPUT = "request_output_throughput_token_per_s"
ERROR_MSG = "error_msg"
ERROR_CODE = "error_code"
ERROR_CODE_FREQ = "error_code_frequency"
NUM_ERRORS = "number_errors"
OUTPUT_THROUGHPUT = "mean_output_throughput_token_per_s"
NUM_COMPLETED_REQUESTS = "num_completed_requests"
COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min"
ERROR_RATE = "error_rate"
NUM_REQ_STARTED = "num_requests_started"
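For orientation, these constants are used as dictionary keys for per-request measurements (as in the chat completions client below). A minimal sketch with illustrative values:

```python
from common.llmperf.utils import common_metrics

# Per-request metrics record keyed by the constants above (illustrative values).
metrics = {
    common_metrics.TTFT: 0.12,                    # seconds to first token
    common_metrics.INTER_TOKEN_LAT: 1.50,         # summed inter-token latency in seconds
    common_metrics.E2E_LAT: 1.80,                 # end-to-end latency in seconds
    common_metrics.NUM_INPUT_TOKENS: 5000,
    common_metrics.NUM_OUTPUT_TOKENS: 1000,
    common_metrics.NUM_TOTAL_TOKENS: 6000,
    common_metrics.REQ_OUTPUT_THROUGHPUT: 555.6,  # output tokens per second
    common_metrics.ERROR_CODE: None,
    common_metrics.ERROR_MSG: "",
}
```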
23 changes: 23 additions & 0 deletions test/common/llmperf/utils/models.py
@@ -0,0 +1,23 @@
from typing import Any, Dict, Optional, Tuple

from pydantic import BaseModel


class RequestConfig(BaseModel):
"""The configuration for a request to the LLM API.

Args:
model: The model to use.
prompt: The prompt to provide to the LLM API.
sampling_params: Additional sampling parameters to send with the request.
For more information, see the Router app's documentation for the completions endpoint.
llm_api: The name of the LLM API to send the request to.
metadata: Additional metadata to attach to the request for logging or validation purposes.
"""

model: str
prompt: Tuple[str, int]
sampling_params: Optional[Dict[str, Any]] = None
llm_api: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
openai_api_base: Optional[str] = ""
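A minimal usage sketch: constructing a RequestConfig as the chat completions client below expects it. Note that `prompt` is a `(text, prompt_token_count)` tuple; the concrete values here are illustrative only:

```python
from common.llmperf.utils.models import RequestConfig

request_config = RequestConfig(
    model="example-model",                       # illustrative model name
    prompt=("Hello, how are you?", 6),           # (prompt text, assumed token count)
    sampling_params={"max_tokens": 128},         # optional extra sampling parameters
    llm_api="openai",
    metadata={"case_idx": 0},                    # free-form metadata for logging
    openai_api_base="http://localhost:8000/v1",  # illustrative endpoint base URL
)
```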
126 changes: 126 additions & 0 deletions test/common/llmperf/utils/openai_chat_completions_client.py
@@ -0,0 +1,126 @@
import json
import os
import time
from typing import Any, Dict, Tuple

import requests
from common.llmperf.utils import common_metrics
from common.llmperf.utils.models import RequestConfig


class OpenAIChatCompletionsClient:
"""
Client for sending HTTP requests to an OpenAI-compatible chat completions endpoint, receiving the streamed tokens, and measuring latency.
"""

def llm_request(
self, request_config: RequestConfig
) -> Tuple[Dict[str, Any], str, RequestConfig]:
prompt, prompt_len = request_config.prompt

message = [
{"role": "user", "content": prompt},
]
model = request_config.model
body = {
"model": model,
"messages": message,
"stream": True,
"ignore_eos": True,
Review comment: better to set this param in config files.

}
sampling_params = request_config.sampling_params
body.update(sampling_params or {})

time_to_next_token = []
tokens_received = 0
ttft = 0.0
error_response_code = None
generated_text = ""
error_msg = ""
output_throughput = 0.0
total_request_time = 0.0
flag = False

metrics: Dict[str, Any] = {}

metrics[common_metrics.ERROR_CODE] = None
metrics[common_metrics.ERROR_MSG] = ""

start_time = time.monotonic()
most_recent_received_token_time = start_time

address = request_config.openai_api_base

if not address:
raise ValueError("the environment variable OPENAI_API_BASE must be set.")
key = os.environ.get("OPENAI_API_KEY", "secret_abcdefg")
if not key:
raise ValueError("the environment variable OPENAI_API_KEY must be set.")
headers = {"Authorization": f"Bearer {key}"}
if not address.endswith("/"):
address = address + "/"
address += "chat/completions"
try:
with requests.post(
address,
json=body,
stream=True,
timeout=180,
Review comment: better to set the timeout in a config file, as some cases may take a while to finish.

headers=headers,
) as response:
if response.status_code != 200:
error_msg = response.text
error_response_code = response.status_code
response.raise_for_status()

for chunk in response.iter_lines(chunk_size=None):
if not chunk:
continue
stem = b"data: "
if chunk.startswith(stem):
chunk = chunk[len(stem) :]
# Data might already be bytes or str
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8", errors="ignore")
if chunk.strip() == "[DONE]":
continue
tokens_received += 1
data = json.loads(chunk)
if "error" in data:
error_msg = data["error"]["message"]
error_response_code = data["error"]["code"]
raise RuntimeError(error_msg)
delta = data["choices"][0]["delta"]
content = delta.get("content", None) or delta.get(
"reasoning_content", ""
)
if content:
if tokens_received != 0 and not flag:
ttft = time.monotonic() - start_time
flag = True
else:
time_to_next_token.append(
time.monotonic() - most_recent_received_token_time
)
most_recent_received_token_time = time.monotonic()
generated_text += content

total_request_time = time.monotonic() - start_time
if total_request_time > 0:
output_throughput = tokens_received / total_request_time

except Exception as e:
metrics[common_metrics.ERROR_MSG] = error_msg
metrics[common_metrics.ERROR_CODE] = error_response_code
print(f"Warning Or Error: {e}")
print(error_response_code)

metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token)
metrics[common_metrics.TTFT] = ttft
metrics[common_metrics.E2E_LAT] = total_request_time
metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

return metrics, generated_text, request_config
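A minimal usage sketch of the client, assuming a reachable OpenAI-compatible endpoint; the URL and token count below are illustrative, and `OPENAI_API_KEY` falls back to the placeholder default shown above if unset:

```python
from common.llmperf.utils.models import RequestConfig
from common.llmperf.utils.openai_chat_completions_client import (
    OpenAIChatCompletionsClient,
)

client = OpenAIChatCompletionsClient()
request_config = RequestConfig(
    model="example-model",                       # illustrative model name
    prompt=("Hello, how are you?", 6),           # (prompt text, assumed token count)
    openai_api_base="http://localhost:8000/v1",  # illustrative endpoint base URL
)

# Returns the metrics dict keyed by common_metrics constants, the generated
# text, and the original RequestConfig.
metrics, generated_text, _ = client.llm_request(request_config)
print(metrics, generated_text)
```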