From e5c4865a54d812267b6d3cb80a4dd4c548e6f46d Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Mon, 17 Nov 2025 02:13:10 -0800
Subject: [PATCH 1/3] fix dsv3 debug mode

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../_torch/models/modeling_deepseekv3.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 2d6f0db8699..6a06be8c1a2 100755
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -640,11 +640,6 @@ def __init__(
     def get_scores(logits, e_score_correction_bias):
         scores = F.sigmoid(logits)
         scores_with_bias = scores + e_score_correction_bias
-        return scores, scores_with_bias
-
-    def noaux_tc(self, logits, e_score_correction_bias):
-        n_group = self.n_group
-
         if enable_llm_debug():
             has_nan = torch.isnan(scores_with_bias).any()
             if has_nan:
@@ -651,6 +646,11 @@ def noaux_tc(self, logits, e_score_correction_bias):
                 warnings.warn(
                     "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
                 )
+        return scores, scores_with_bias
+
+    def noaux_tc(self, logits, e_score_correction_bias):
+        n_group = self.n_group
+
         _, num_experts = logits.shape
         if self.n_group > 1:
             if self.top_k > 8 or (num_experts / n_group) > 32 or (
@@ -672,6 +672,13 @@ def noaux_tc(self, logits, e_score_correction_bias):
         if not self.is_fused:
             scores, scores_with_bias = Deepseekv3RoutingImpl.get_scores(
                 logits, e_score_correction_bias)
+            if enable_llm_debug():
+                has_nan = torch.isnan(scores_with_bias).any()
+                if has_nan:
+                    warnings.warn(
+                        "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
+                    )
+
             scores_shape = list(scores_with_bias.shape)
             group_scores = torch.sum(torch.topk(
                 scores_with_bias.view(scores_shape[:-1] +

From f37e5eb0d6a4c3d67555a1b4568a8e937ff0b0d1 Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Mon, 17 Nov 2025 23:51:08 -0800
Subject: [PATCH 2/3] debug h100 disagg perf test

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../_torch/models/modeling_deepseekv3.py      |  17 +--
 .../accuracy/test_disaggregated_serving.py    | 105 ++++++++++++++++--
 2 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 6a06be8c1a2..2d6f0db8699 100755
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -640,6 +640,11 @@ def __init__(
     def get_scores(logits, e_score_correction_bias):
         scores = F.sigmoid(logits)
         scores_with_bias = scores + e_score_correction_bias
+        return scores, scores_with_bias
+
+    def noaux_tc(self, logits, e_score_correction_bias):
+        n_group = self.n_group
+
         if enable_llm_debug():
             has_nan = torch.isnan(scores_with_bias).any()
             if has_nan:
@@ -646,11 +651,6 @@ def get_scores(logits, e_score_correction_bias):
                 warnings.warn(
                     "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
                 )
-        return scores, scores_with_bias
-
-    def noaux_tc(self, logits, e_score_correction_bias):
-        n_group = self.n_group
-
         _, num_experts = logits.shape
         if self.n_group > 1:
             if self.top_k > 8 or (num_experts / n_group) > 32 or (
@@ -672,13 +672,6 @@ def noaux_tc(self, logits, e_score_correction_bias):
         if not self.is_fused:
             scores, scores_with_bias = Deepseekv3RoutingImpl.get_scores(
                 logits, e_score_correction_bias)
-            if enable_llm_debug():
-                has_nan = torch.isnan(scores_with_bias).any()
-                if has_nan:
-                    warnings.warn(
-                        "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
-                    )
-
             scores_shape = list(scores_with_bias.shape)
             group_scores = torch.sum(torch.topk(
                 scores_with_bias.view(scores_shape[:-1] +
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 4e3d6523b55..d78f5dd324f 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -25,6 +25,69 @@
 from .accuracy_core import (GSM8K, MMLU, JsonModeEval,
                             LlmapiAccuracyTestHarness, get_accuracy_task)
 
+MAX_PERF_METRICS_REQUESTS = 100
+
+
+def get_worker_env_vars(kv_cache_perf_dir: str = None):
+    env = os.environ.copy()
+    if kv_cache_perf_dir:
+        env["TRTLLM_KVCACHE_TIME_OUTPUT_PATH"] = kv_cache_perf_dir
+    return env
+
+
+def show_debug_perf(thread_pool: ThreadPoolExecutor,
+                    kv_cache_perf_dir: str = None,
+                    perf_metrics_url: str = None):
+
+    def wait_for_all_tasks_to_complete():
+        #thread_pool.shutdown(wait=True)
+        try:
+            print("Waiting for all tasks to complete")
+            for future in getattr(thread_pool, "futures", []):
+                try:
+                    future.result(timeout=300)
+                except concurrent.futures.TimeoutError:
+                    print("Timeout waiting for a future to complete.")
+                except Exception as e:
+                    print(f"Future completed with error: {e}")
+        except Exception as e:
+            print(f"Error while waiting for futures: {e}")
+
+    def show_kvcache_time(kv_cache_perf_dir, max_lines=100):
+        for file in os.listdir(kv_cache_perf_dir):
+            print(f"{'-'*25} {file}:{max_lines} {'-'*25}")
+            with open(os.path.join(kv_cache_perf_dir, file), "r") as f:
+                for line in f.readlines()[-max_lines:]:
+                    print(line.strip())
+
+    def show_perf_metrics(url):
+        perf_url = f"{url}/perf_metrics"
+        try:
+            print(f"Fetching perf metrics from {perf_url}")
+            resp = requests.get(perf_url, timeout=10)
+            if resp.status_code == 200:
+                try:
+                    print("perf_metrics JSON:")
+                    metrics = resp.json()
+                    print(json.dumps(metrics, indent=2, ensure_ascii=False))
+                    print("-" * 100)
+                except ValueError:
+                    print("perf_metrics returned non-JSON response:", resp.text)
+            else:
+                print(
+                    f"perf_metrics returned status {resp.status_code}: {resp.text}"
+                )
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching {perf_url}: {e}")
+
+    wait_for_all_tasks_to_complete()
+    if kv_cache_perf_dir:
+        show_kvcache_time(kv_cache_perf_dir)
+    if perf_metrics_url:
+        show_perf_metrics(perf_metrics_url)
+    # force failure to see the logs
+    assert False
+
 
 class Result(GenerationResultBase):
 
@@ -76,15 +139,29 @@ def launch_disaggregated_llm(
         ctx_model: str = None,
         gen_model: str = None,
         server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
-        max_workers: int = 16):
+        max_workers: int = 16,
+        debug_perf: bool = False):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
-
+    if debug_perf:
+        kv_cache_perf_dir = os.path.join(temp_dir.name, "kvcache_perf")
+        os.makedirs(kv_cache_perf_dir, exist_ok=True)
+    else:
+        kv_cache_perf_dir = None
     if tensor_parallel_size > 1:
         print(
             f"Using unified tp parameter for testing is not recommended. Please use server configs instead."
         )
+    if debug_perf:
+        disaggregated_server_config[
+            "perf_metrics_max_requests"] = MAX_PERF_METRICS_REQUESTS
+        ctx_server_config["return_perf_metrics"] = True
+        ctx_server_config[
+            "perf_metrics_max_requests"] = MAX_PERF_METRICS_REQUESTS
+        gen_server_config["return_perf_metrics"] = True
+        gen_server_config[
+            "perf_metrics_max_requests"] = MAX_PERF_METRICS_REQUESTS
 
     with open(disaggregated_serving_config_path, "w") as f:
         yaml.dump(disaggregated_server_config, f)
@@ -144,7 +221,7 @@ def launch_disaggregated_llm(
     current_gpu_offset = 0
 
     for i, port in enumerate(ctx_ports):
-        env_ctx = os.environ.copy()
+        env_ctx = get_worker_env_vars(kv_cache_perf_dir=kv_cache_perf_dir)
         env_ctx["TRTLLM_USE_UCX_KVCACHE"] = "1"
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + ctx_total_gpus)
@@ -165,7 +242,7 @@ def launch_disaggregated_llm(
 
     gen_servers = []
     for i, port in enumerate(gen_ports):
-        env_gen = os.environ.copy()
+        env_gen = get_worker_env_vars(kv_cache_perf_dir=kv_cache_perf_dir)
        env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1"
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + gen_total_gpus)
@@ -289,6 +366,14 @@ def generate_async(prompt: str,
     tokenizer = load_hf_tokenizer(model_name)
 
     yield DuckLLM(args, tokenizer, generate_async)
+
+    if debug_perf:
+        show_debug_perf(
+            thread_pool,
+            kv_cache_perf_dir=kv_cache_perf_dir,
+            perf_metrics_url=f"http://localhost:8000"
+            if debug_perf else None,
+        )
 
 
 def run_parallel_test(model_name: str,
@@ -357,7 +442,7 @@ def run_parallel_test(model_name: str,
         task.evaluate(llm)
 
 
-@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT * 5)
 class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
@@ -510,9 +595,13 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
                 "urls": ["localhost:8002"]
             }
         }
-        with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
+        with launch_disaggregated_llm(
+                disaggregated_server_config,
+                ctx_server_config,
+                gen_server_config,
+                self.MODEL_PATH,
+                debug_perf=True,
+        ) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 

From 334569c3bd05aa50b844f1262c9dbe428e1421d6 Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Tue, 18 Nov 2025 00:06:34 -0800
Subject: [PATCH 3/3] unwaive test

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../defs/accuracy/test_disaggregated_serving.py | 8 ++++----
 tests/integration/test_lists/waives.txt         | 2 --
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index d78f5dd324f..d07090dd51a 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -1162,12 +1162,12 @@ def test_auto_dtype(self, overlap_scheduler):
             }
         }
         with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
+                                      ctx_server_config,
+                                      gen_server_config,
+                                      self.MODEL_PATH,
+                                      debug_perf=True) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
 
     def test_chunked_prefill(self):
         ctx_server_config = {
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 3fd599358f3..98b4abc59ca 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -393,9 +393,7 @@ unittest/_torch/modules SKIP (https://nvbugs/5637037)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill SKIP (https://nvbugs/5608930)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugspro.nvidia.com/bug/5651854)
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] SKIP (https://nvbugs/5568836)
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] SKIP (https://nvbugs/5568836)
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] SKIP (https://nvbugs/5568836)
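
A note for readers of the series: patches 1/3 and 2/3 move the same NaN check in opposite directions, which is easy to misread in diff form. Patch 1/3 moves the check into get_scores, the scope where scores_with_bias is actually defined; in the pre-patch layout the check sat at the top of noaux_tc and referenced scores_with_bias before any assignment, so enabling debug mode would fail. Patch 2/3 then temporarily restores the old layout while the H100 disagg perf test is being debugged. The sketch below is a minimal, self-contained illustration of the fixed placement; the enable_llm_debug stub and its TLLM_LLM_ENABLE_DEBUG environment variable are hypothetical stand-ins for the real tensorrt_llm helper, not its actual API.

import os
import warnings

import torch
import torch.nn.functional as F


def enable_llm_debug() -> bool:
    # Hypothetical stand-in for tensorrt_llm's debug switch; assumed to be
    # env-var driven here purely for the sake of a runnable sketch.
    return os.environ.get("TLLM_LLM_ENABLE_DEBUG", "0") == "1"


def get_scores(logits, e_score_correction_bias):
    # Post-patch-1/3 layout: the NaN check runs in the same scope that
    # defines scores_with_bias, so debug mode no longer breaks routing.
    scores = F.sigmoid(logits)
    scores_with_bias = scores + e_score_correction_bias
    if enable_llm_debug():
        if torch.isnan(scores_with_bias).any():
            warnings.warn(
                "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation."
            )
    return scores, scores_with_bias


if __name__ == "__main__":
    os.environ["TLLM_LLM_ENABLE_DEBUG"] = "1"
    logits = torch.randn(2, 8)
    # A NaN bias makes the debug warning fire without a real model.
    bias = torch.full((8,), float("nan"))
    get_scores(logits, bias)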