Skip to content

Commit 47a119a — "[SD] Add CUDA A100 tuned model (huggingface#773)"

Browse files · authored · 1 parent: ee56559

File tree

5 files changed

+60
-17
lines changed

5 files changed

+60
-17
lines changed

shark/examples/shark_inference/stable_diffusion/opt_params.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,13 @@ def get_params(bucket_key, model_key, model, is_tuned, precision):
6262
def get_unet():
6363
# Tuned model is present only for `fp16` precision.
6464
is_tuned = "tuned" if args.use_tuned else "untuned"
65-
bucket_key = f"{args.variant}/{is_tuned}"
66-
model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
65+
if "vulkan" not in args.device and is_tuned:
66+
bucket_key = f"{args.variant}/{is_tuned}/{args.device}"
67+
model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
68+
else:
69+
bucket_key = f"{args.variant}/{is_tuned}"
70+
model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}"
71+
6772
bucket, model_name, iree_flags = get_params(
6873
bucket_key, model_key, "unet", is_tuned, args.precision
6974
)
@@ -74,7 +79,9 @@ def get_unet():
7479

7580
def get_vae():
7681
# Tuned model is present only for `fp16` precision.
77-
is_tuned = "tuned" if args.use_tuned else "untuned"
82+
is_tuned = (
83+
"tuned" if (args.use_tuned and "vulkan" in args.device) else "untuned"
84+
)
7885
is_base = "/base" if args.use_base_vae else ""
7986
bucket_key = f"{args.variant}/{is_tuned}"
8087
model_key = f"{args.variant}/{args.version}/vae/{args.precision}/length_77/{is_tuned}{is_base}"

shark/examples/shark_inference/stable_diffusion/resources/model_db.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
{
33
"stablediffusion/untuned":"gs://shark_tank/stable_diffusion",
44
"stablediffusion/tuned":"gs://shark_tank/sd_tuned",
5+
"stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
56
"anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
67
"anythingv3/tuned":"gs://shark_tank/sd_tuned",
78
"analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
@@ -23,6 +24,7 @@
2324
"stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
2425
"stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_19dec_v2p1base_fp16_64",
2526
"stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
27+
"stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
2628
"stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae2base_19dec_fp16",
2729
"stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
2830
"stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",

shark/examples/shark_inference/stable_diffusion/sd_annotation.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,14 @@
1212
from utils import set_init_device_flags
1313

1414

15-
# Downloads the model (Unet or VAE fp16) from shark_tank
1615
set_init_device_flags()
16+
device = (
17+
args.device if "://" not in args.device else args.device.split("://")[0]
18+
)
19+
20+
# Downloads the model (Unet or VAE fp16) from shark_tank
1721
shark_args.local_tank_cache = args.local_tank_cache
1822
bucket_key = f"{args.variant}/untuned"
19-
use_winograd = True
2023
if args.annotation_model == "unet":
2124
model_key = f"{args.variant}/{args.version}/unet/{args.precision}/length_{args.max_length}/untuned"
2225
elif args.annotation_model == "vae":
@@ -34,29 +37,29 @@
3437

3538
# Downloads the tuned config files from shark_tank
3639
config_bucket = "gs://shark_tank/sd_tuned/configs/"
37-
if use_winograd:
38-
config_name = f"{args.annotation_model}_winograd.json"
40+
if args.use_winograd:
41+
config_name = f"{args.annotation_model}_winograd_{device}.json"
3942
full_gs_url = config_bucket + config_name
4043
winograd_config_dir = f"{WORKDIR}configs/" + config_name
4144
download_public_file(full_gs_url, winograd_config_dir, True)
4245

4346
if args.annotation_model == "unet":
4447
if args.variant in ["anythingv3", "analogdiffusion"]:
4548
args.max_length = 77
46-
config_name = f"{args.annotation_model}_{args.version}_{args.precision}_len{args.max_length}.json"
49+
config_name = f"{args.annotation_model}_{args.version}_{args.precision}_len{args.max_length}_{device}.json"
4750
full_gs_url = config_bucket + config_name
4851
lowering_config_dir = f"{WORKDIR}configs/" + config_name
4952
download_public_file(full_gs_url, lowering_config_dir, True)
5053

5154
# Annotate the model with Winograd attribute on selected conv ops
52-
if use_winograd:
55+
if args.use_winograd:
5356
with create_context() as ctx:
5457
winograd_model = model_annotation(
5558
ctx,
5659
input_contents=mlir_model,
5760
config_path=winograd_config_dir,
5861
search_op="conv",
59-
winograd=use_winograd,
62+
winograd=args.use_winograd,
6063
)
6164
with open(
6265
f"{args.annotation_output}/{model_name}_tuned_torch.mlir", "w"
@@ -65,19 +68,30 @@
6568

6669
# For Unet annotate the model with tuned lowering configs
6770
if args.annotation_model == "unet":
68-
if use_winograd:
71+
if args.use_winograd:
6972
input_mlir = f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
7073
dump_after = "iree-linalg-ext-convert-conv2d-to-winograd"
7174
else:
7275
input_mlir = f"{WORKDIR}{model_name}_torch/{model_name}_torch.mlir"
7376
dump_after = "iree-flow-pad-linalg-ops"
7477

7578
# Dump IR after padding/img2col/winograd passes
79+
device_spec_args = ""
80+
if device == "cuda":
81+
from shark.iree_utils.gpu_utils import get_iree_gpu_args
82+
83+
gpu_flags = get_iree_gpu_args()
84+
for flag in gpu_flags:
85+
device_spec_args += flag + " "
86+
elif device == "vulkan":
87+
device_spec_args = (
88+
f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
89+
)
7690
run_cmd(
7791
f"iree-compile {input_mlir} "
7892
"--iree-input-type=tm_tensor "
79-
f"--iree-hal-target-backends={iree_target_map(args.device)} "
80-
f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
93+
f"--iree-hal-target-backends={iree_target_map(device)} "
94+
f"{device_spec_args}"
8195
"--iree-stream-resource-index-bits=64 "
8296
"--iree-vm-target-index-bits=64 "
8397
"--iree-flow-enable-padding-linalg-ops "

shark/examples/shark_inference/stable_diffusion/stable_args.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,4 +247,11 @@ def path_expand(s):
247247
help="Options are unet and vae.",
248248
)
249249

250+
p.add_argument(
251+
"--use_winograd",
252+
default=False,
253+
action=argparse.BooleanOptionalAction,
254+
help="Apply Winograd on selected conv ops.",
255+
)
256+
250257
args = p.parse_args()

shark/examples/shark_inference/stable_diffusion/utils.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
set_iree_vulkan_runtime_flags,
88
get_vulkan_target_triple,
99
)
10+
from shark.iree_utils.gpu_utils import get_cuda_sm_cc
1011

1112

1213
def _compile_module(shark_module, model_name, extra_args=[]):
@@ -46,6 +47,8 @@ def get_shark_model(tank_url, model_name, extra_args=[]):
4647

4748
# Set local shark_tank cache directory.
4849
shark_args.local_tank_cache = args.local_tank_cache
50+
if "cuda" in args.device:
51+
shark_args.enable_tf32 = True
4952

5053
mlir_model, func_name, inputs, golden_out = download_model(
5154
model_name,
@@ -185,22 +188,32 @@ def set_init_device_flags():
185188
elif args.variant == "openjourney":
186189
args.max_length = 64
187190

188-
# use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
191+
# Use tuned models in the case of stablediffusion/fp16 and rdna3 cards.
189192
if (
190193
args.variant in ["openjourney", "dreamlike"]
191194
or args.precision != "fp16"
192195
or "vulkan" not in args.device
193196
or "rdna3" not in args.iree_vulkan_target_triple
194197
):
195198
args.use_tuned = False
196-
print("Tuned models are currently not supported for this setting.")
197199

198200
elif args.use_base_vae and args.variant != "stablediffusion":
199201
args.use_tuned = False
200-
print("Tuned models are currently not supported for this setting.")
202+
203+
# Use tuned model in the case of stablediffusion/fp16 and cuda device sm_80
204+
if (
205+
args.variant == "stablediffusion"
206+
and args.precision == "fp16"
207+
and "cuda" in args.device
208+
and get_cuda_sm_cc() == "sm_80"
209+
and args.version == "v2_1base"
210+
):
211+
args.use_tuned = True
201212

202213
if args.use_tuned:
203-
print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
214+
print(f"Using {args.device} tuned models for stablediffusion/fp16.")
215+
else:
216+
print("Tuned models are currently not supported for this setting.")
204217

205218

206219
# Utility to get list of devices available.

Comments (0) — this commit has no comments.