3 changes: 3 additions & 0 deletions .gitignore
@@ -230,3 +230,6 @@ src/ui/next-env.d.ts
!src/ui/public/manifest.json
!src/ui/serve.json
.eslintcache

# vllm-sim
bin/
12 changes: 12 additions & 0 deletions tests/e2e/README.md
@@ -0,0 +1,12 @@
# E2E tests

The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). Before running them, build the simulator with the following command, which exports it into the local `bin/` directory (ignored via the new `.gitignore` entry):

```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
```

Then, to run the tests:

```shell
tox -e test-e2e
```
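
For quicker iteration on a single scenario, the tests can also be invoked directly with pytest (assuming the project and its test dependencies are installed in the active environment and the simulator has been built as above):

```shell
pytest tests/e2e/test_successful_benchmark.py -v
```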
72 changes: 72 additions & 0 deletions tests/e2e/test_max_error_benchmark.py
@@ -0,0 +1,72 @@
# E2E test for max error rate constraint functionality

from pathlib import Path

import pytest

from tests.e2e.utils import (
    GuidellmClient,
    assert_constraint_triggered,
    assert_no_python_exceptions,
    cleanup_report_file,
    load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
    """
    Pytest fixture to start and stop the vLLM simulator server for the entire
    module using the VllmSimServer class.
    """
    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b", mode="echo")
    try:
        server.start()
        yield server  # Yield the server instance for tests to use
    finally:
        server.stop()  # Teardown: stop the server after the tests are done


@pytest.mark.timeout(30)
def test_max_error_benchmark(server: VllmSimServer):
    """
    Test that the max error rate constraint is properly triggered when the
    server goes down.
    """
    report_path = Path("tests/e2e/max_error_benchmarks.json")
    rate = 10
    max_error_rate = 0.1

    # Create and configure the guidellm client
    client = GuidellmClient(target=server.get_url(), output_path=report_path)

    try:
        # Start the benchmark
        client.start_benchmark(
            rate=rate,
            max_seconds=25,
            max_error_rate=max_error_rate,
        )

        # Wait for the benchmark to complete (server will be stopped after 10 seconds)
        client.wait_for_completion(timeout=30, stop_server_after=10, server=server)

        # Assert no Python exceptions occurred
        assert_no_python_exceptions(client.stderr)

        # Load and validate the report
        report = load_benchmark_report(report_path)
        benchmark = report["benchmarks"][0]

        # Check that the max error rate constraint was triggered
        assert_constraint_triggered(
            benchmark,
            "max_error_rate",
            {
                "exceeded_error_rate": True,
                "current_error_rate": lambda rate: rate >= max_error_rate,
            },
        )

    finally:
        cleanup_report_file(report_path)
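
The error-rate condition above is induced by the test itself: the simulator is stopped ten seconds into a 25-second benchmark, so the remaining requests fail and push the error rate past `max_error_rate`. The `GuidellmClient` wrapper that sequences this lives in `tests/e2e/utils.py` and is not part of this diff; the sketch below only illustrates the assumed ordering, with every name and attribute hypothetical:

```python
import subprocess
import time
from typing import Optional


def wait_for_completion(
    process: subprocess.Popen,
    timeout: float,
    stop_server_after: Optional[float] = None,
    server=None,
) -> None:
    """Wait for the benchmark subprocess, optionally stopping the server first."""
    if stop_server_after is not None and server is not None:
        time.sleep(stop_server_after)  # let early requests succeed
        server.stop()  # remaining requests fail, driving the error rate up
        timeout -= stop_server_after
    # guidellm is expected to exit on its own once max_error_rate trips
    process.wait(timeout=timeout)
```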
6 changes: 0 additions & 6 deletions tests/e2e/test_placeholder.py

This file was deleted.

120 changes: 120 additions & 0 deletions tests/e2e/test_successful_benchmark.py
@@ -0,0 +1,120 @@
# E2E tests for successful benchmark scenarios with timing validation

from pathlib import Path

import pytest

from tests.e2e.utils import (
    GuidellmClient,
    assert_constraint_triggered,
    assert_no_python_exceptions,
    assert_successful_requests_fields,
    cleanup_report_file,
    load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
    """
    Pytest fixture to start and stop the vLLM simulator server for the entire
    module using the VllmSimServer class.
    """
    server = VllmSimServer(
        port=8000,
        model="databricks/dolly-v2-12b",
        mode="echo",
        time_to_first_token=1,  # 1ms TTFT
        inter_token_latency=1,  # 1ms ITL
    )
    try:
        server.start()
        yield server  # Yield the server instance for tests to use
    finally:
        server.stop()  # Teardown: stop the server after the tests are done


@pytest.mark.timeout(30)
def test_max_seconds_benchmark(server: VllmSimServer):
    """
    Test that the max seconds constraint is properly triggered.
    """
    report_path = Path("tests/e2e/max_duration_benchmarks.json")
    rate = 10

    # Create and configure the guidellm client
    client = GuidellmClient(target=server.get_url(), output_path=report_path)

    try:
        # Start the benchmark
        client.start_benchmark(
            rate=rate,
            max_seconds=1,
        )

        # Wait for the benchmark to complete
        client.wait_for_completion(timeout=30)

        # Assert no Python exceptions occurred
        assert_no_python_exceptions(client.stderr)

        # Load and validate the report
        report = load_benchmark_report(report_path)
        benchmark = report["benchmarks"][0]

        # Check that the max duration constraint was triggered
        assert_constraint_triggered(
            benchmark, "max_seconds", {"duration_exceeded": True}
        )

        # Validate successful requests have all expected fields
        successful_requests = benchmark["requests"]["successful"]
        assert_successful_requests_fields(successful_requests)

    finally:
        cleanup_report_file(report_path)


@pytest.mark.timeout(30)
def test_max_requests_benchmark(server: VllmSimServer):
    """
    Test that the max requests constraint is properly triggered.
    """
    report_path = Path("tests/e2e/max_number_benchmarks.json")
    rate = 10

    # Create and configure the guidellm client
    client = GuidellmClient(target=server.get_url(), output_path=report_path)

    try:
        # Start the benchmark
        client.start_benchmark(
            rate=rate,
            max_requests=rate,
        )

        # Wait for the benchmark to complete
        client.wait_for_completion(timeout=30)

        # Assert no Python exceptions occurred
        assert_no_python_exceptions(client.stderr)

        # Load and validate the report
        report = load_benchmark_report(report_path)
        benchmark = report["benchmarks"][0]

        # Check that the max requests constraint was triggered
        assert_constraint_triggered(
            benchmark, "max_requests", {"processed_exceeded": True}
        )

        # Validate successful requests have all expected fields
        successful_requests = benchmark["requests"]["successful"]
        assert len(successful_requests) == rate, (
            f"Expected {rate} successful requests, got {len(successful_requests)}"
        )
        assert_successful_requests_fields(successful_requests)

    finally:
        cleanup_report_file(report_path)
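
All three tests check termination through `assert_constraint_triggered`, passing a mapping whose values are either literals or callables used as predicates (for example `lambda rate: rate >= max_error_rate`). The real helper is defined in `tests/e2e/utils.py`, which is not included in this diff; the sketch below assumes the report stores triggered constraints in a per-benchmark dict keyed by constraint name:

```python
from typing import Any


def assert_constraint_triggered(
    benchmark: dict, name: str, expected: dict[str, Any]
) -> None:
    """Assert that a named constraint fired and that its fields match expectations."""
    constraints = benchmark.get("constraints", {})  # assumed report layout
    assert name in constraints, f"Constraint '{name}' was not triggered"

    actual = constraints[name]
    for field, expectation in expected.items():
        value = actual.get(field)
        if callable(expectation):
            # Callables act as predicates over the reported value
            assert expectation(value), f"Predicate failed for '{field}': {value!r}"
        else:
            assert value == expectation, (
                f"Expected {field}={expectation!r}, got {value!r}"
            )
```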