Commit d60dad6
dongfengy and byshiue authored

[None][fix] Update deployment guide and cherry-pick CI test fix from main (#7623)

Signed-off-by: Dongfeng Yu <[email protected]>
Signed-off-by: bhsueh <[email protected]>
Co-authored-by: bhsueh_NV <[email protected]>

1 parent 75745c7 · commit d60dad6

4 files changed: +17 −13 lines changed

docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
Lines changed: 6 additions & 2 deletions

@@ -72,7 +72,7 @@ cuda_graph_config:
   max_batch_size: 720
 moe_config:
   backend: TRTLLM
-stream_interval: 10
+stream_interval: 20
 num_postprocess_workers: 4
 EOF
 ```
@@ -89,8 +89,12 @@ cuda_graph_config:
   max_batch_size: 720
 moe_config:
   backend: CUTLASS
-stream_interval: 10
+stream_interval: 20
 num_postprocess_workers: 4
+attention_dp_config:
+  enable_balance: true
+  batching_wait_iters: 50
+  timeout_iters: 1
 EOF
 ```
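
Both recipes write an extra LLM API options YAML via a heredoc; the commit raises stream_interval from 10 to 20 and, for the CUTLASS recipe only, turns on attention-DP balancing. As a quick sanity check, here is a minimal sketch (assuming PyYAML and a hypothetical file name extra_llm_config.yml; neither is specified by the guide excerpt) that loads the file and asserts the updated values:

```python
# sanity_check_config.py -- illustrative only: the file name and the expected
# values come from the diff above, not from the deployment guide itself.
import yaml  # requires PyYAML

with open("extra_llm_config.yml") as f:  # hypothetical path
    cfg = yaml.safe_load(f)

# Both recipes now set stream_interval to 20 (was 10).
assert cfg["stream_interval"] == 20

# Only the CUTLASS recipe adds attention-DP load balancing.
adp = cfg.get("attention_dp_config")
if adp is not None:
    assert adp["enable_balance"] is True
    assert adp["batching_wait_iters"] == 50
    assert adp["timeout_iters"] == 1
```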

tests/integration/defs/accuracy/references/gsm8k.yaml
Lines changed: 2 additions & 0 deletions

@@ -189,5 +189,7 @@ GPT-OSS/MXFP4:
     accuracy: 90.3
   - quant_algo: W4A8_MXFP4_FP8
     accuracy: 90.3
+  - quant_algo: W4A16_MXFP4
+    accuracy: 90.3
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36
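
The references file maps each model (and quantization algorithm) to its expected GSM8K score, so the new W4A16_MXFP4 entry gives the un-skipped test_w4a16 run a baseline to check against. The real lookup lives in the accuracy harness; a minimal illustrative version, assuming only the list-of-entries structure visible above:

```python
# Illustrative lookup only; the accuracy harness's real implementation differs.
import yaml

def reference_accuracy(references, model, quant_algo=None):
    """Return the reference accuracy for a (model, quant_algo) pair."""
    for entry in references[model]:
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} / {quant_algo}")

with open("gsm8k.yaml") as f:  # path relative to the references directory
    refs = yaml.safe_load(f)

print(reference_accuracy(refs, "GPT-OSS/MXFP4", "W4A16_MXFP4"))  # -> 90.3
```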

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 9 additions & 7 deletions

@@ -2866,8 +2866,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,
         "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
     }

     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
@@ -2881,7 +2879,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        pytest.skip("https://nvbugs/5481087")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")

@@ -2899,7 +2899,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):

         with llm:
             model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -2919,7 +2918,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
         ids=["tp4", "ep4", "dp4"])
     def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
-        pytest.skip("https://nvbugs/5481087")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -2940,7 +2941,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
         with llm:
             model_name = "GPT-OSS/MXFP4"
             task = GSM8K(model_name)
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)

@@ -2952,6 +2952,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
         ids=["dp4"])
     def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, monkeypatch, mocker):
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
@@ -2971,7 +2974,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         with llm:
             model_name = "GPT-OSS/BF16"
             task = GSM8K(model_name)
-            mocker.patch.object(GSM8K, {"MAX_OUTPUT_LEN": 8192})
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
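
This is the substance of the CI fix: the blanket pytest.skip("https://nvbugs/5481087") lines are dropped, and the GSM8K output-length and score-filter settings move out of extra_evaluator_kwargs (and out of the with llm: blocks) into pytest-mock patches applied on the first lines of each test, before any GSM8K task is built. The removed test_w4a16 line was also malformed, passing a dict where patch.object expects an attribute name and a value. A self-contained sketch of the same pattern (assumes pytest and pytest-mock; DummyEval is a stand-in for the real GSM8K class):

```python
# test_patch_pattern.py -- illustrative pytest-mock usage mirroring the diff;
# DummyEval is a stand-in, not the real GSM8K evaluator class.
class DummyEval:
    MAX_OUTPUT_LEN = 2048
    EVALUATE_KWARGS = {"fewshot_as_multiturn": True}

def test_patches_apply_before_use(mocker):
    # Patch a class attribute and merge a key into a class-level dict;
    # pytest-mock reverts both when the test finishes.
    mocker.patch.object(DummyEval, "MAX_OUTPUT_LEN", 8192)
    mocker.patch.dict(DummyEval.EVALUATE_KWARGS,
                      {"scores_filter": "exact_match,flexible-extract"})
    assert DummyEval.MAX_OUTPUT_LEN == 8192
    assert DummyEval.EVALUATE_KWARGS["scores_filter"] == (
        "exact_match,flexible-extract")
```

Because mocker undoes both patches at teardown, the widened MAX_OUTPUT_LEN and the extra scores_filter key cannot leak into other tests that share the GSM8K class.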

tests/integration/test_lists/waives.txt
Lines changed: 0 additions & 4 deletions

@@ -331,10 +331,6 @@ accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5
 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075)
 accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/5471106)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass] SKIP (https://nvbugs/5481080)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass] SKIP (https://nvbugs/5481080)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass] SKIP (https://nvbugs/5481080)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass] SKIP (https://nvbugs/5481080)
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/5481090)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-False] SKIP (https://nvbugs/5481094)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-True] SKIP (https://nvbugs/5481094)
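
Each waive entry is a pytest node id followed by SKIP and a tracking bug URL; the four removed lines un-waive the GPT-OSS CUTLASS variants now that the tests above are fixed. The real waiver tooling lives in the test defs; a minimal illustrative parser for the format shown:

```python
# Illustrative parser for the waive-list format shown above; the actual CI
# tooling in tests/integration may differ.
import re

WAIVE_RE = re.compile(r"^(?P<test>\S+)\s+SKIP\s+\((?P<reason>[^)]+)\)$")

def parse_waives(text):
    """Map each waived test node id to its skip reason (a bug URL here)."""
    waives = {}
    for line in text.splitlines():
        match = WAIVE_RE.match(line.strip())
        if match:
            waives[match.group("test")] = match.group("reason")
    return waives

sample = ("accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 "
          "SKIP (https://nvbugs/5465143)")
assert parse_waives(sample) == {
    "accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8":
        "https://nvbugs/5465143"
}
```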
