@@ -332,14 +332,175 @@ def model_type_to_paths(checkpoint_path, model_type):
     model_cfg = f"configs/sam2.1/{MODEL_TYPES_TO_CONFIG[model_type]}"
     return sam2_checkpoint, model_cfg
 
-def set_fast(mask_generator):
-    # TODO: Using CUDA graphs can cause numerical differences?
-    mask_generator.predictor.model.image_encoder = torch.compile(
-        mask_generator.predictor.model.image_encoder,
-        mode="max-autotune",
-        fullgraph=True,
-        dynamic=False,
+
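+# Export `fn` with torch.export and package it via AOTInductor into a .pt2
+# archive under `model_directory`, so later runs can load it without recompiling.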
+def aot_compile(model_directory, name, fn, sample_args):
+    path = Path(model_directory) / Path(f"{name}.pt2")
+    print(f"Saving at {path=}")
+    options = {
+        "max_autotune": True,
+        "triton.cudagraphs": True,
+    }
+
+    exported = torch.export.export_for_inference(fn, sample_args)
+    output_path = torch._inductor.aoti_compile_and_package(
+        exported,
+        package_path=str(path),
+        inductor_configs=options,
     )
+    return output_path
+
+
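+# Load a .pt2 archive produced by aot_compile back onto the GPU. Note this goes
+# through the private torch._export.aot_load API, which may change between releases.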
+def aot_load(path):
+    return torch._export.aot_load(path, "cuda")
+
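+# Wrap an arbitrary method of `module` as an nn.Module so it can be handed to
+# torch.export, which wants a module rather than a bound method.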
+class FunctionModel(torch.nn.Module):
+
+    def __init__(self, module, fn_name):
+        super().__init__()
+        self.module = module
+        self.fn_name = fn_name
+
+    def forward(self, *args):
+        return getattr(self.module, self.fn_name)(*args)
+
+
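+# AOT-compile the parts of the pipeline that export cleanly. Only the image
+# encoder works today; the commented-out blocks below record attempts at the
+# prompt encoder, mask decoder and its transformer. The (1, 3, 1024, 1024)
+# example input matches SAM2's fixed 1024x1024 input resolution.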
+def set_aot_fast(mask_generator, model_directory):
+    example_input = torch.empty(1, 3, 1024, 1024)
+    example_input = example_input.to(mask_generator.predictor._image_dtype)
+    example_input = (example_input.to(mask_generator.predictor.device),)
+    aot_compile(model_directory,
+                "sam2_image_encoder",
+                mask_generator.predictor.model.image_encoder,
+                example_input)
+
+    # NOTE: THIS DOESN'T WORK YET!
+    # example_input_0_0 = torch.empty(1, 32, 256, 256, dtype=torch.float16, device=mask_generator.predictor.device)
+    # example_input_0_1 = torch.empty(1, 64, 128, 128, dtype=torch.float16, device=mask_generator.predictor.device)
+    # example_input_1 = torch.empty(1, 256, 64, 64, dtype=torch.float32, device=mask_generator.predictor.device)
+    # example_input_2 = torch.empty(1024, 1, 2, dtype=torch.float32, device=mask_generator.predictor.device)
+    # example_input_3 = torch.empty(1024, 1, dtype=torch.int32, device=mask_generator.predictor.device)
+    # example_input = ([example_input_0_0, example_input_0_1],
+    #                  example_input_1,
+    #                  example_input_2,
+    #                  example_input_3,
+    #                  None,
+    #                  None,
+    #                  True,
+    #                  True,
+    #                  -1)
+    # mask_generator.forward = mask_generator.predictor._predict_masks_with_features
+    # mask_generator(*example_input)
+    # aot_compile("sam2__predict_masks_with_features",
+    #             mask_generator,
+    #             example_input)
+
+    # example_input_2 = torch.empty(1024, 1, 2, dtype=torch.float32, device=mask_generator.predictor.device)
+    # example_input_3 = torch.empty(1024, 1, dtype=torch.int32, device=mask_generator.predictor.device)
+    # aot_compile("sam2_sam_prompt_encoder",
+    #             mask_generator.predictor.model.sam_prompt_encoder,
+    #             ((example_input_2, example_input_3),
+    #              None,
+    #              None))
+
+    # NOTE: THIS DOESN'T WORK YET!
+    # example_input_0 = torch.empty(1, 256, 64, 64, dtype=torch.float32, device=mask_generator.predictor.device)
+    # example_input_1 = torch.empty(1, 256, 64, 64, dtype=torch.float32, device=mask_generator.predictor.device)
+    # example_input_2 = torch.empty(1024, 2, 256, dtype=torch.float32, device=mask_generator.predictor.device)
+    # example_input_3 = torch.empty(1024, 256, 64, 64, dtype=torch.float32, device=mask_generator.predictor.device)
+
+    # example_input_4_0 = torch.empty(1, 32, 256, 256, dtype=torch.float16, device=mask_generator.predictor.device)
+    # example_input_4_1 = torch.empty(1, 64, 128, 128, dtype=torch.float16, device=mask_generator.predictor.device)
+
+    # example_input = (example_input_0,
+    #                  example_input_1,
+    #                  example_input_2,
+    #                  example_input_3,
+    #                  True,
+    #                  True,
+    #                  [example_input_4_0, example_input_4_1])
+    # print("Example")
+    # mask_generator.predictor.model.sam_mask_decoder(*example_input)
+    # print("Example done")
+    # aot_compile("sam2_sam_mask_decoder",
+    #             mask_generator.predictor.model.sam_mask_decoder,
+    #             example_input)
+
+    # example_input_0 = torch.empty(1024, 256, 64, 64, dtype=torch.float16, device=mask_generator.predictor.device)
+    # example_input_1 = torch.empty(1024, 256, 64, 64, dtype=torch.float16, device=mask_generator.predictor.device)
+    # example_input_2 = torch.empty(1024, 8, 256, dtype=torch.float16, device=mask_generator.predictor.device)
+    # example_input = (example_input_0, example_input_1, example_input_2)
+
+    # mask_generator.predictor.model.sam_mask_decoder.transformer(*example_input)
+    # aot_compile("sam2_sam_mask_decoder_transformer",
+    #             mask_generator.predictor.model.sam_mask_decoder.transformer,
+    #             example_input)
+
+
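+# Thin nn.Module wrappers around an AOTI-compiled callable so it can be swapped
+# in for the original submodule. LoadedDecoder additionally forwards
+# get_dense_pe to the original (uncompiled) prompt encoder, which the predictor
+# still calls directly.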
+class LoadedModel(torch.nn.Module):
+
+    def __init__(self, aoti_compiled_model):
+        super().__init__()
+        self.aoti_compiled_model = aoti_compiled_model
+
+    def forward(self, *args):
+        return self.aoti_compiled_model(*args)
+
+class LoadedDecoder(torch.nn.Module):
+
+    def __init__(self, aoti_compiled_model, other):
+        super().__init__()
+        self.aoti_compiled_model = aoti_compiled_model
+        self.other = other
+
+    def forward(self, *args):
+        return self.aoti_compiled_model(*args)
+
+    def get_dense_pe(self, *args, **kwargs) -> torch.Tensor:
+        return self.other.get_dense_pe(*args, **kwargs)
+
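+# Load the precompiled image encoder from `model_directory` and install it on
+# the mask generator. The remaining components stay as eager modules until
+# their AOT export paths work (see the commented-out attempts below).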
+def load_aot_fast(mask_generator, model_directory):
+    t0 = time.time()
+    path = Path(model_directory) / Path("sam2_image_encoder.pt2")
+    assert path.exists(), f"Expected {path} to exist."
+    print(f"Start load from {path}")
+    pkg = torch._inductor.aoti_load_package(str(path))
+    pkg_m = LoadedModel(pkg)
+    mask_generator.predictor.model.image_encoder = pkg_m
+
+    # NOTE: This doesn't work yet!
+    # pkg = torch._inductor.aoti_load_package(os.path.join(os.getcwd(), "sam2__predict_masks_with_features.pt2"))
+    # pkg_m = LoadedModel(pkg)
+    # mask_generator.predictor._predict_masks_with_features = pkg_m.forward
+
+    # pkg = torch._inductor.aoti_load_package(os.path.join(os.getcwd(), "sam2_sam_prompt_encoder.pt2"))
+    # pkg_m = LoadedDecoder(pkg, mask_generator.predictor.model.sam_prompt_encoder)
+    # mask_generator.predictor.model.sam_prompt_encoder = pkg_m
+
+    # NOTE: This doesn't work yet!
+    # pkg = torch._inductor.aoti_load_package(os.path.join(os.getcwd(), "sam2_sam_mask_decoder.pt2"))
+    # pkg_m = LoadedModel(pkg)
+    # pkg_m.conv_s0 = mask_generator.predictor.model.sam_mask_decoder.conv_s0
+    # pkg_m.conv_s1 = mask_generator.predictor.model.sam_mask_decoder.conv_s1
+    # mask_generator.predictor.model.sam_mask_decoder = pkg_m
+
+    # pkg = torch._inductor.aoti_load_package(os.path.join(os.getcwd(), "sam2_sam_mask_decoder_transformer.pt2"))
+    # pkg_m = LoadedModel(pkg)
+    # mask_generator.predictor.model.sam_mask_decoder.transformer = pkg_m
+
+    print(f"End load. Took {time.time() - t0}s")
+
+
+def set_fast(mask_generator, load_fast=""):
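+    # If precompiled artifacts were loaded via load_fast, the image encoder has
+    # already been replaced, so only torch.compile it when load_fast is unset.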
+    if load_fast == "":
+        # TODO: Using CUDA graphs can cause numerical differences?
+        mask_generator.predictor.model.image_encoder = torch.compile(
+            mask_generator.predictor.model.image_encoder,
+            mode="max-autotune",
+            fullgraph=True,
+            dynamic=False,
+        )
 
     mask_generator.predictor._predict_masks = torch.compile(
         mask_generator.predictor._predict_masks,
@@ -371,6 +532,7 @@ def main(checkpoint_path,
          baseline=False,
          fast=False,
          furious=False,
+         use_autoquant=False,
          unittest=False,
          benchmark=False,
          profile=None,
@@ -380,7 +542,9 @@ def main(checkpoint_path,
          port=5000,
          host="127.0.0.1",
          dry=False,
-         batch_size=1):
+         batch_size=1,
+         load_fast="",
+         save_fast=""):
     if verbose:
         logging.basicConfig(level=logging.INFO,
                             format='%(asctime)s - %(levelname)s - %(message)s',
@@ -399,22 +563,41 @@ def main(checkpoint_path,
     from torchao._models.sam2.build_sam import build_sam2
     from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
     from torchao._models.sam2.utils.amg import rle_to_mask
-
+
     device = "cuda"
     sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type)
-
+
     logging.info(f"Loading model {sam2_checkpoint} with config {model_cfg}")
     sam2 = build_sam2(model_cfg, sam2_checkpoint, device=device, apply_postprocessing=False)
-
+
     logging.info(f"Using {points_per_batch} points_per_batch")
     mask_generator = SAM2AutomaticMaskGenerator(sam2, points_per_batch=points_per_batch, output_mode="uncompressed_rle")
 
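+    # Optionally swap in precompiled (AOTInductor) modules instead of compiling
+    # at startup, or export and save them for later runs.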
+    if load_fast != "":
+        load_aot_fast(mask_generator, load_fast)
+
+    if save_fast != "":
+        assert load_fast == "", "Can't save compiled models while loading them with --load-fast."
+        assert not baseline, "--save-fast cannot be combined with --baseline; the code must be torch.compile(fullgraph=True) compatible."
+        print(f"Saving compiled models under directory {save_fast}")
+        set_aot_fast(mask_generator, save_fast)
+
     if fast:
         assert not baseline, "--fast cannot be combined with --baseline; the code must be torch.compile(fullgraph=True) compatible."
-        set_fast(mask_generator)
+        set_fast(mask_generator, load_fast)
 
     if furious:
         set_furious(mask_generator)
+    # Since autoquant replicates what furious mode does, don't use the two together.
+    elif use_autoquant:
+        from torchao import autoquant
+        from torchao.quantization import DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST
+        mask_generator.predictor.model.image_encoder = autoquant(mask_generator.predictor.model.image_encoder, qtensor_class_list=DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, min_sqnr=40)
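+        # min_sqnr=40 should restrict autoquant to quantization choices whose
+        # signal-to-quantized-noise ratio vs. the float baseline stays above 40.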
+
+        # mask_generator.predictor.model.image_encoder = mask_generator.predictor.model.image_encoder.to(torch.float16, min_sqnr=40)
+        # NOTE: Not baseline feature
+        mask_generator.predictor._transforms_device = mask_generator.predictor.device
+        torch.set_float32_matmul_precision('high')
 
     with open('dog.jpg', 'rb') as f:
         image_tensor = file_bytes_to_image_tensor(bytearray(f.read()))
@@ -494,7 +677,7 @@ async def upload_rle(image: UploadFile = File(...)):
         await request_queue.put((image_tensor, response_future))
         masks = await response_future
         return masks_to_rle_dict(masks)
-
+
     @app.post("/upload")
     async def upload_image(image: UploadFile = File(...)):
         image_tensor = file_bytes_to_image_tensor(bytearray(await image.read()))
@@ -512,7 +695,7 @@ async def upload_image(image: UploadFile = File(...)):
         plt.savefig(buf, format='png')
         buf.seek(0)
         return StreamingResponse(buf, media_type="image/png")
-
+
 
     # uvicorn.run(app, host=host, port=port, log_level="info")
     uvicorn.run(app, host=host, port=port)