2 changes: 2 additions & 0 deletions tensorrt_llm/evaluate/interface.py
@@ -33,11 +33,13 @@ class Evaluator(ABC):
def __init__(self,
random_seed: int = 0,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None):
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
self.apply_chat_template = apply_chat_template
self.fewshot_as_multiturn = fewshot_as_multiturn
self.system_prompt = system_prompt

@abstractmethod
46 changes: 33 additions & 13 deletions tensorrt_llm/evaluate/lm_eval.py
@@ -133,6 +133,7 @@ def __init__(self,
num_samples: Optional[int] = None,
random_seed: int = 0,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None):
try:
import lm_eval
@@ -141,8 +142,10 @@ def __init__(self,
f"Evaluation task {self.__class__.__name__} requires `lm_eval`. "
"Please install the package first, e.g., `pip install lm_eval`."
) from e
import lm_eval.tasks
super().__init__(random_seed=random_seed,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
system_prompt=system_prompt)
self.task_name = task_name
self.dataset_path = dataset_path
@@ -190,14 +193,16 @@ def compute_score(self, outputs: List[RequestOutput], references: List[str],
def evaluate(self,
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
streaming: bool = False) -> float:
streaming: bool = False,
scores_filter: str = None) -> float:
import lm_eval
results = lm_eval.evaluate(lm=LmEvalWrapper(llm, sampling_params,
streaming),
task_dict=self.task_dict,
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
system_instruction=self.system_prompt)
results = lm_eval.evaluate(
lm=LmEvalWrapper(llm, sampling_params, streaming),
task_dict=self.task_dict,
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
fewshot_as_multiturn=self.fewshot_as_multiturn,
system_instruction=self.system_prompt)
# Normalize scores to range 0~100
scores = results["results"][self.task_name]
for metric in scores.keys():
@@ -206,12 +211,17 @@ def evaluate(self,
logger.info(
f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
)

average_acc = np.mean(
[acc for m, acc in scores.items() if "_stderr" not in m])
logger.info(
f"lm-eval {self.task_name} average accuracy: {average_acc:.2f}")
return average_acc
if scores_filter is not None:
result_acc = results["results"][self.task_name][scores_filter]
logger.info(
f"lm-eval {self.task_name} {scores_filter} accuracy: {result_acc:.2f}"
)
else:
result_acc = np.mean(
[acc for m, acc in scores.items() if "_stderr" not in m])
logger.info(
f"lm-eval {self.task_name} average accuracy: {result_acc:.2f}")
return result_acc

@classmethod
def command_harness(cls, ctx, **kwargs):
@@ -221,6 +231,8 @@ def command_harness(cls, ctx, **kwargs):
random_seed=kwargs.pop("random_seed", 0),
apply_chat_template=kwargs.pop("apply_chat_template",
False),
fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
False),
system_prompt=kwargs.pop("system_prompt", None))
sampling_params = SamplingParams(
max_tokens=kwargs.pop("max_output_length"),
@@ -254,6 +266,10 @@ def __init__(self, **kwargs):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option("--fewshot_as_multiturn",
is_flag=True,
default=False,
help="Apply fewshot as multiturn.")
@click.option("--system_prompt",
type=str,
default=None,
@@ -269,6 +285,10 @@ def __init__(self, **kwargs):
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
if kwargs.get("fewshot_as_multiturn", False):
assert kwargs.get(
"apply_chat_template", False
), "apply_chat_template must be True when fewshot_as_multiturn is True"
GSM8K.command_harness(ctx, **kwargs)


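Taken together, the lm_eval.py changes expose two new knobs: `fewshot_as_multiturn`, which is forwarded to `lm_eval.evaluate` so few-shot examples are rendered as multi-turn chat (and is only accepted together with `apply_chat_template`), and `scores_filter`, which makes `evaluate()` return a single named lm-eval metric instead of the average over all non-stderr metrics. A minimal Python sketch of the new path, assuming placeholder checkpoint and dataset paths and the `tensorrt_llm.evaluate.GSM8K` task built on this evaluator:

```python
from tensorrt_llm import LLM
from tensorrt_llm.evaluate import GSM8K

# Placeholder paths; substitute any checkpoint/dataset the LLM API can load.
llm = LLM("/models/gpt-oss-120b")

evaluator = GSM8K(
    dataset_path="/datasets/gsm8k",
    num_samples=200,
    random_seed=0,
    apply_chat_template=True,      # must be set when fewshot_as_multiturn is used
    fewshot_as_multiturn=True,     # new: few-shot examples become chat turns
)

# scores_filter picks one metric key from the lm-eval results table,
# e.g. "exact_match,flexible-extract", instead of averaging all metrics.
accuracy = evaluator.evaluate(llm, scores_filter="exact_match,flexible-extract")
print(f"GSM8K exact_match,flexible-extract: {accuracy:.2f}")
```

The matching `--fewshot_as_multiturn` click flag exposes the same switch on the evaluation CLI; the guard added to `command()` above rejects it unless `--apply_chat_template` is also passed.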
8 changes: 7 additions & 1 deletion tests/integration/defs/accuracy/accuracy_core.py
@@ -192,7 +192,11 @@ def evaluate(self,
evaluator_kwargs.update(extra_evaluator_kwargs)
evaluator = self.EVALUATOR_CLS(num_samples=num_samples,
**evaluator_kwargs)
accuracy = evaluator.evaluate(llm, sampling_params, streaming)
evaluate_kwargs = {}
if hasattr(self, 'EVALUATE_KWARGS'):
evaluate_kwargs.update(self.EVALUATE_KWARGS)
accuracy = evaluator.evaluate(llm, sampling_params, streaming,
**evaluate_kwargs)
if self.HIGHER_IS_BETTER:
assert accuracy >= threshold, f"Expected accuracy >= {threshold}, but got {accuracy}."
else:
@@ -298,6 +302,8 @@ class GSM8K(AccuracyTask):
EVALUATOR_CLS = tensorrt_llm.evaluate.GSM8K
EVALUATOR_KWARGS = dict(dataset_path=DATASET_DIR, random_seed=0)

EVALUATE_KWARGS = dict(scores_filter=None)


class GPQADiamond(AccuracyTask):
DATASET = "gpqa_diamond"
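With the harness now forwarding a per-task `EVALUATE_KWARGS` dict into `evaluator.evaluate()`, a task class can pin which lm-eval metric the threshold check uses. A hedged sketch of a hypothetical subclass (the GPT-OSS tests in the next file achieve the same effect by mutating `task.EVALUATE_KWARGS` at runtime):

```python
# Hypothetical variant of the GSM8K accuracy task defined in accuracy_core.py;
# assumes GSM8K is importable from that module within the test package.
class GSM8KFlexibleExtract(GSM8K):
    # Threshold on a single lm-eval metric rather than the average of all
    # non-stderr metrics reported for the task.
    EVALUATE_KWARGS = dict(scores_filter="exact_match,flexible-extract")
```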
8 changes: 4 additions & 4 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -159,12 +159,12 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
microsoft/Phi-4-mini-instruct:
- accuracy: 82.30
GPT-OSS/BF16:
- accuracy: 88.5
- accuracy: 90.3
GPT-OSS/MXFP4:
- accuracy: 88.5
- accuracy: 90.3
- quant_algo: W4A8_MXFP4_MXFP8
accuracy: 88.5
accuracy: 90.3
- quant_algo: W4A8_MXFP4_FP8
accuracy: 88.5
accuracy: 90.3
LGAI-EXAONE/EXAONE-4.0-32B:
- accuracy: 88.36
27 changes: 14 additions & 13 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2463,10 +2463,14 @@ def test_auto_dtype_long_rope(self):
class TestGPTOSS(LlmapiAccuracyTestHarness):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)

def get_gpt_oss_root(self):
gpt_oss_root = os.getenv("GPT_OSS_MODELS_ROOT")
assert gpt_oss_root, "GPT_OSS_MODELS_ROOT needs to be set as parent of checkpoints."
return gpt_oss_root
MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"

def update_task_kwargs(self, task):
task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True
task.EVALUATOR_KWARGS["apply_chat_template"] = True
task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract"
task.MAX_OUTPUT_LEN = 8192
return task

@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"],
ids=["cutlass", "trtllm", "triton"])
@@ -2481,7 +2485,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{self.get_gpt_oss_root()}/gpt-oss-120b",
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=1,
pipeline_parallel_size=1,
moe_expert_parallel_size=1,
@@ -2491,9 +2495,8 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):

with llm:
model_name = "GPT-OSS/MXFP4"
task = MMLU(model_name)
task.evaluate(llm)
task = GSM8K(model_name)
task = self.update_task_kwargs(task)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@@ -2519,7 +2522,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{self.get_gpt_oss_root()}/gpt-oss-120b",
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
@@ -2530,9 +2533,8 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,

with llm:
model_name = "GPT-OSS/MXFP4"
task = MMLU(model_name)
task.evaluate(llm)
task = GSM8K(model_name)
task = self.update_task_kwargs(task)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
Expand All @@ -2551,7 +2553,7 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{self.get_openai_root()}/gpt-oss-120b",
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
Expand All @@ -2561,9 +2563,8 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
moe_backend="TRITON")
with llm:
model_name = "GPT-OSS/BF16"
task = MMLU(model_name)
task.evaluate(llm)
task = GSM8K(model_name)
task = self.update_task_kwargs(task)
task.evaluate(llm)


13 changes: 13 additions & 0 deletions tests/integration/test_lists/qa/llm_function_full.txt
@@ -519,6 +519,19 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
3 changes: 3 additions & 0 deletions tests/integration/test_lists/test-db/l0_b200.yml
@@ -54,6 +54,9 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
- disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
10 changes: 10 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -69,3 +69,13 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
21 changes: 21 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -121,6 +121,27 @@ l0_dgx_h100:
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16]
- disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16]
- disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16]
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*h100*'
linux_distribution_name: ubuntu*
terms:
stage: pre_merge
backend: pytorch
auto_trigger: gpt_oss
tests:
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRITON]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-TRITON]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
- condition:
ranges:
system_gpu_count: