Skip to content

Commit aa8ea5d

Browse files
committed
test
1 parent 2f4afae commit aa8ea5d

File tree

3 files changed

+28
-6
lines changed

3 files changed

+28
-6
lines changed

examples/apps/flux_demo.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,14 +135,16 @@ def forward_loop(mod):
135135
pipe.transformer = trt_gm
136136
seed = 42
137137
image = pipe(
138-
["Beach and Kids"],
138+
[
139+
"enchanted winter forest, soft diffuse light on a snow-filled day, serene nature scene, the forest is illuminated by the snow"
140+
],
139141
output_type="pil",
140-
num_inference_steps=20,
142+
num_inference_steps=30,
141143
num_images_per_prompt=batch_size,
142144
generator=torch.Generator("cuda").manual_seed(seed),
143145
).images
144146
print(f"generated {len(image)} images")
145-
image[0].save("beach_kids.png")
147+
image[0].save("forest.png")
146148

147149
torch.cuda.empty_cache()
148150

py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
106106
import modelopt.torch.quantization as mtq
107107

108108
assert torch.ops.tensorrt.quantize_op.default
109+
assert torch.ops.tensorrt.dynamic_block_quantize_op.default
109110
self.quantization_ops.add(torch.ops.tensorrt.quantize_op.default)
111+
self.quantization_ops.add(
112+
torch.ops.tensorrt.dynamic_block_quantize_op.default
113+
)
110114
except Exception as e:
111115
pass
112116

tools/perf/Flux/flux_perf.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,24 @@
99
from flux_demo import compile_model
1010

1111

12+
def profile(pipe, prompt, inference_step, batch_size=1):
13+
print(f"Running torch profiler with {inference_step=} {batch_size=}")
14+
with torch.profiler.profile(
15+
activities=[torch.profiler.ProfilerActivity.CUDA],
16+
record_shapes=True,
17+
profile_memory=True,
18+
with_stack=True,
19+
) as prof:
20+
with torch.profiler.record_function("model_inference"):
21+
pipe(
22+
prompt,
23+
output_type="pil",
24+
num_inference_steps=inference_step,
25+
num_images_per_prompt=batch_size,
26+
).images
27+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=100))
28+
29+
1230
def benchmark(pipe, prompt, inference_step, batch_size=1, iterations=1):
1331
print(f"Running warmup with {batch_size=} {inference_step=} iterations=10")
1432
# warmup
@@ -41,9 +59,6 @@ def benchmark(pipe, prompt, inference_step, batch_size=1, iterations=1):
4159
"Average Latency Per Step:",
4260
(end - start) / inference_step / iterations / batch_size,
4361
)
44-
45-
# run the perf tool
46-
print(f"Running cudart perf tool with {inference_step=} {batch_size=}")
4762
return
4863

4964

@@ -52,6 +67,7 @@ def main(args):
5267
pipe, backbone, trt_gm = compile_model(args)
5368

5469
benchmark(pipe, ["Test"], 20, batch_size=args.max_batch_size, iterations=3)
70+
# profile(pipe, ["enchanted winter forest, soft diffuse light on a snow-filled day, serene nature scene, the forest is illuminated by the snow"], 20, batch_size=args.max_batch_size)
5571

5672

5773
if __name__ == "__main__":

0 commit comments

Comments (0)