From e5c4865a54d812267b6d3cb80a4dd4c548e6f46d Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Mon, 17 Nov 2025 02:13:10 -0800
Subject: [PATCH 1/3] fix dsv3 debug mode

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../_torch/models/modeling_deepseekv3.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 2d6f0db8699..6a06be8c1a2 100755
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -640,11 +640,6 @@ def __init__(
     def get_scores(logits, e_score_correction_bias):
         scores = F.sigmoid(logits)
         scores_with_bias = scores + e_score_correction_bias
-        return scores, scores_with_bias
-
-    def noaux_tc(self, logits, e_score_correction_bias):
-        n_group = self.n_group
-
         if enable_llm_debug():
             has_nan = torch.isnan(scores_with_bias).any()
             if has_nan:
@@ -651,6 +646,11 @@ def noaux_tc(self, logits, e_score_correction_bias):
                 warnings.warn(
                     "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
                 )
+        return scores, scores_with_bias
+
+    def noaux_tc(self, logits, e_score_correction_bias):
+        n_group = self.n_group
+
         _, num_experts = logits.shape
         if self.n_group > 1:
             if self.top_k > 8 or (num_experts / n_group) > 32 or (
@@ -672,6 +672,13 @@ def noaux_tc(self, logits, e_score_correction_bias):
         if not self.is_fused:
             scores, scores_with_bias = Deepseekv3RoutingImpl.get_scores(
                 logits, e_score_correction_bias)
+            if enable_llm_debug():
+                has_nan = torch.isnan(scores_with_bias).any()
+                if has_nan:
+                    warnings.warn(
+                        "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
+                    )
+
             scores_shape = list(scores_with_bias.shape)
             group_scores = torch.sum(torch.topk(
                 scores_with_bias.view(scores_shape[:-1] +

From f37e5eb0d6a4c3d67555a1b4568a8e937ff0b0d1 Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Mon, 17 Nov 2025 23:51:08 -0800
Subject: [PATCH 2/3] debug h100 disagg perf test

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../_torch/models/modeling_deepseekv3.py      |  17 +--
 .../accuracy/test_disaggregated_serving.py    | 105 ++++++++++++++++--
 2 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 6a06be8c1a2..2d6f0db8699 100755
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -640,6 +640,11 @@ def __init__(
     def get_scores(logits, e_score_correction_bias):
         scores = F.sigmoid(logits)
         scores_with_bias = scores + e_score_correction_bias
+        return scores, scores_with_bias
+
+    def noaux_tc(self, logits, e_score_correction_bias):
+        n_group = self.n_group
+
         if enable_llm_debug():
             has_nan = torch.isnan(scores_with_bias).any()
             if has_nan:
@@ -646,11 +651,6 @@ def get_scores(logits, e_score_correction_bias):
                 warnings.warn(
                     "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
                 )
-        return scores, scores_with_bias
-
-    def noaux_tc(self, logits, e_score_correction_bias):
-        n_group = self.n_group
-
         _, num_experts = logits.shape
         if self.n_group > 1:
             if self.top_k > 8 or (num_experts / n_group) > 32 or (
@@ -672,13 +672,6 @@ def noaux_tc(self, logits, e_score_correction_bias):
         if not self.is_fused:
             scores, scores_with_bias = Deepseekv3RoutingImpl.get_scores(
                 logits, e_score_correction_bias)
-            if enable_llm_debug():
-                has_nan = torch.isnan(scores_with_bias).any()
-                if has_nan:
-                    warnings.warn(
-                        "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
-                    )
-
             scores_shape = list(scores_with_bias.shape)
             group_scores = torch.sum(torch.topk(
                 scores_with_bias.view(scores_shape[:-1] +
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 4e3d6523b55..d78f5dd324f 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -25,6 +25,69 @@
 from .accuracy_core import (GSM8K, MMLU, JsonModeEval,
                             LlmapiAccuracyTestHarness, get_accuracy_task)
 
+MAX_PERF_METRICS_REQUESTS = 100
+
+
+def get_worker_env_vars(kv_cache_perf_dir: str = None):
+    env = os.environ.copy()
+    if kv_cache_perf_dir:
+        env["TRTLLM_KVCACHE_TIME_OUTPUT_PATH"] = kv_cache_perf_dir
+    return env
+
+
+def show_debug_perf(thread_pool: ThreadPoolExecutor,
+                    kv_cache_perf_dir: str = None,
+                    perf_metrics_url: str = None):
+
+    def wait_for_all_tasks_to_complete():
+        #thread_pool.shutdown(wait=True)
+        try:
+            print("Waiting for all tasks to complete")
+            for future in getattr(thread_pool, "futures", []):
+                try:
+                    future.result(timeout=300)
+                except concurrent.futures.TimeoutError:
+                    print("Timeout waiting for a future to complete.")
+                except Exception as e:
+                    print(f"Future completed with error: {e}")
+        except Exception as e:
+            print(f"Error while waiting for futures: {e}")
+
+    def show_kvcache_time(kv_cache_perf_dir, max_lines=100):
+        for file in os.listdir(kv_cache_perf_dir):
+            print(f"{'-'*25} {file}:{max_lines} {'-'*25}")
+            with open(os.path.join(kv_cache_perf_dir, file), "r") as f:
+                for line in f.readlines()[-max_lines:]:
+                    print(line.strip())
+
+    def show_perf_metrics(url):
+        perf_url = f"{url}/perf_metrics"
+        try:
+            print(f"Fetching perf metrics from {perf_url}")
+            resp = requests.get(perf_url, timeout=10)
+            if resp.status_code == 200:
+                try:
+                    print("perf_metrics JSON:")
+                    metrics = resp.json()
+                    print(json.dumps(metrics, indent=2, ensure_ascii=False))
+                    print("-" * 100)
+                except ValueError:
+                    print("perf_metrics returned non-JSON response:", resp.text)
+            else:
+                print(
+                    f"perf_metrics returned status {resp.status_code}: {resp.text}"
+                )
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching {perf_url}: {e}")
+
+    wait_for_all_tasks_to_complete()
+    if kv_cache_perf_dir:
+        show_kvcache_time(kv_cache_perf_dir)
+    if perf_metrics_url:
+        show_perf_metrics(perf_metrics_url)
+    # force failure to see the logs
+    assert False
+
 
 class Result(GenerationResultBase):
 
@@ -76,15 +139,29 @@ def launch_disaggregated_llm(
         ctx_model: str = None,
         gen_model: str = None,
         server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
-        max_workers: int = 16):
+        max_workers: int = 16,
+        debug_perf: bool = False):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
-
+    if debug_perf:
+        kv_cache_perf_dir = os.path.join(temp_dir.name, "kvcache_perf")
+        os.makedirs(kv_cache_perf_dir, exist_ok=True)
+    else:
+        kv_cache_perf_dir = None
     if tensor_parallel_size > 1:
         print(
             f"Using unified tp parameter for testing is not recommended. Please use server configs instead."
         )
+    if debug_perf:
+        disaggregated_server_config[
+            "perf_metrics_max_requests"] = MAX_PERF_METRICS_REQUESTS
+        ctx_server_config["return_perf_metrics"] = True
+        ctx_server_config[
+            "perf_metrics_max_requests"] = MAX_PERF_METRICS_REQUESTS
+        gen_server_config["return_perf_metrics"] = True
+        gen_server_config[
+            "perf_metrics_max_requests"] = MAX_PERF_METRICS_REQUESTS
 
     with open(disaggregated_serving_config_path, "w") as f:
         yaml.dump(disaggregated_server_config, f)
@@ -144,7 +221,7 @@ def launch_disaggregated_llm(
     current_gpu_offset = 0
 
     for i, port in enumerate(ctx_ports):
-        env_ctx = os.environ.copy()
+        env_ctx = get_worker_env_vars(kv_cache_perf_dir=kv_cache_perf_dir)
         env_ctx["TRTLLM_USE_UCX_KVCACHE"] = "1"
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + ctx_total_gpus)
@@ -165,7 +242,7 @@ def launch_disaggregated_llm(
 
     gen_servers = []
     for i, port in enumerate(gen_ports):
-        env_gen = os.environ.copy()
+        env_gen = get_worker_env_vars(kv_cache_perf_dir=kv_cache_perf_dir)
        env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1"
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + gen_total_gpus)
@@ -289,6 +366,14 @@ def generate_async(prompt: str,
     tokenizer = load_hf_tokenizer(model_name)
 
     yield DuckLLM(args, tokenizer, generate_async)
+
+    if debug_perf:
+        show_debug_perf(
+            thread_pool,
+            kv_cache_perf_dir=kv_cache_perf_dir,
+            perf_metrics_url=f"http://localhost:8000"
+            if debug_perf else None,
+        )
 
 
 def run_parallel_test(model_name: str,
@@ -357,7 +442,7 @@ def run_parallel_test(model_name: str,
         task.evaluate(llm)
 
 
-@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT * 5)
 class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
@@ -510,9 +595,13 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
                 "urls": ["localhost:8002"]
             }
         }
-        with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
+        with launch_disaggregated_llm(
+                disaggregated_server_config,
+                ctx_server_config,
+                gen_server_config,
+                self.MODEL_PATH,
+                debug_perf=True,
+        ) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 

From 334569c3bd05aa50b844f1262c9dbe428e1421d6 Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Tue, 18 Nov 2025 00:06:34 -0800
Subject: [PATCH 3/3] unwaive test

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../defs/accuracy/test_disaggregated_serving.py | 8 ++++----
 tests/integration/test_lists/waives.txt         | 2 --
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index d78f5dd324f..d07090dd51a 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -1162,12 +1162,12 @@ def test_auto_dtype(self, overlap_scheduler):
             }
         }
         with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
+                                      ctx_server_config,
+                                      gen_server_config,
+                                      self.MODEL_PATH,
+                                      debug_perf=True) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
 
     def test_chunked_prefill(self):
         ctx_server_config = {
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 3fd599358f3..98b4abc59ca 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -393,9 +393,7 @@ unittest/_torch/modules SKIP (https://nvbugs/5637037)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill SKIP (https://nvbugs/5608930)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugspro.nvidia.com/bug/5651854)
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] SKIP (https://nvbugs/5568836)
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] SKIP (https://nvbugs/5568836)
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] SKIP (https://nvbugs/5568836)
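
A note for readers of the series: patches 1/3 and 2/3 move the same NaN check in opposite directions, which is easy to misread in diff form. Patch 1/3 moves the check into get_scores, the scope where scores_with_bias is actually defined; in the pre-patch layout the check sat at the top of noaux_tc and referenced scores_with_bias before any assignment, so enabling debug mode would fail. Patch 2/3 then temporarily restores the old layout while the H100 disagg perf test is being debugged. The sketch below is a minimal, self-contained illustration of the fixed placement; the enable_llm_debug stub and its TLLM_LLM_ENABLE_DEBUG environment variable are hypothetical stand-ins for the real tensorrt_llm helper, not its actual API.

import os
import warnings

import torch
import torch.nn.functional as F


def enable_llm_debug() -> bool:
    # Hypothetical stand-in for tensorrt_llm's debug switch; assumed to be
    # env-var driven here purely for the sake of a runnable sketch.
    return os.environ.get("TLLM_LLM_ENABLE_DEBUG", "0") == "1"


def get_scores(logits, e_score_correction_bias):
    # Post-patch-1/3 layout: the NaN check runs in the same scope that
    # defines scores_with_bias, so debug mode no longer breaks routing.
    scores = F.sigmoid(logits)
    scores_with_bias = scores + e_score_correction_bias
    if enable_llm_debug():
        if torch.isnan(scores_with_bias).any():
            warnings.warn(
                "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
            )
    return scores, scores_with_bias


if __name__ == "__main__":
    os.environ["TLLM_LLM_ENABLE_DEBUG"] = "1"
    logits = torch.randn(2, 8)
    # A NaN bias makes the debug warning fire without a real model.
    bias = torch.full((8,), float("nan"))
    get_scores(logits, bias)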