@@ -2866,8 +2866,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
28662866 extra_evaluator_kwargs = {
28672867 "fewshot_as_multiturn" : True ,
28682868 "apply_chat_template" : True ,
2869- "scores_filter" : "exact_match,flexible-extract" ,
2870- "MAX_OUTPUT_LEN" : 8192
28712869 }
28722870
28732871 MODEL_PATH = f"{ llm_models_root ()} /gpt_oss/gpt-oss-120b"
@@ -2881,7 +2879,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
28812879 (True , True ),
28822880 ])
28832881 def test_w4_1gpu (self , moe_backend , cuda_graph , overlap_scheduler , mocker ):
2884- pytest .skip ("https://nvbugs/5481087" )
2882+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2883+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2884+ {"scores_filter" : "exact_match,flexible-extract" })
28852885 if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE :
28862886 pytest .skip ("Triton kernels are not available" )
28872887
@@ -2899,7 +2899,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
28992899
29002900 with llm :
29012901 model_name = "GPT-OSS/MXFP4"
2902- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
29032902 task = GSM8K (model_name )
29042903 task .evaluate (llm ,
29052904 extra_evaluator_kwargs = self .extra_evaluator_kwargs )
@@ -2919,7 +2918,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
29192918 ids = ["tp4" , "ep4" , "dp4" ])
29202919 def test_w4_4gpus (self , moe_backend , tp_size , pp_size , ep_size ,
29212920 attention_dp , cuda_graph , overlap_scheduler , mocker ):
2922- pytest .skip ("https://nvbugs/5481087" )
2921+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2922+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2923+ {"scores_filter" : "exact_match,flexible-extract" })
29232924 if moe_backend == "TRITON" :
29242925 if not IS_TRITON_KERNELS_AVAILABLE :
29252926 pytest .skip ("Triton kernels are not available" )
@@ -2940,7 +2941,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
29402941 with llm :
29412942 model_name = "GPT-OSS/MXFP4"
29422943 task = GSM8K (model_name )
2943- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
29442944 task .evaluate (llm ,
29452945 extra_evaluator_kwargs = self .extra_evaluator_kwargs )
29462946
@@ -2952,6 +2952,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
29522952 ids = ["dp4" ])
29532953 def test_w4a16 (self , tp_size , pp_size , ep_size , attention_dp , cuda_graph ,
29542954 overlap_scheduler , monkeypatch , mocker ):
2955+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2956+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2957+ {"scores_filter" : "exact_match,flexible-extract" })
29552958 if not IS_TRITON_KERNELS_AVAILABLE :
29562959 pytest .skip ("Triton kernels are not available" )
29572960 monkeypatch .setenv ("OVERRIDE_QUANT_ALGO" , "W4A16_MXFP4" )
@@ -2971,7 +2974,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
29712974 with llm :
29722975 model_name = "GPT-OSS/BF16"
29732976 task = GSM8K (model_name )
2974- mocker .patch .object (GSM8K , {"MAX_OUTPUT_LEN" : 8192 })
29752977 task .evaluate (llm ,
29762978 extra_evaluator_kwargs = self .extra_evaluator_kwargs )
29772979
0 commit comments