import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch

from torch._inductor import config as inductorconfig

# Inductor/Triton knobs for the torch.compile call later in this script:
# unique kernel names make profiler traces readable; coordinate-descent
# tuning (in all directions) trades longer compile time for faster kernels.
inductorconfig.triton.unique_kernel_names = True
inductorconfig.coordinate_descent_tuning = True
inductorconfig.coordinate_descent_check_all_directions = True
11+
def profiler_runner(path, fn, *args, **kwargs):
    """Execute ``fn(*args, **kwargs)`` under the PyTorch profiler.

    Profiles both CPU and CUDA activity with tensor shapes recorded,
    writes a Chrome-format trace to ``path``, and passes through the
    wrapped function's return value.
    """
    activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    with torch.profiler.profile(activities=activities, record_shapes=True) as profiler:
        result = fn(*args, **kwargs)
    print(f"Saving trace under {path}")
    profiler.export_chrome_trace(path)
    return result
2124
25+
def show_anns(anns):
    """Overlay each annotation's mask, in a random color, on the current axes.

    Annotations are drawn largest-area first so smaller masks stay visible.
    Returns the boolean masks stacked into one tensor, or None for empty input.
    """
    if len(anns) == 0:
        return
    ordered = sorted(anns, key=lambda a: a["area"], reverse=True)
    axes = plt.gca()
    axes.set_autoscale_on(False)

    height, width = ordered[0]["segmentation"].shape[0], ordered[0]["segmentation"].shape[1]
    # RGBA canvas: fully transparent until a mask paints it.
    overlay = np.ones((height, width, 4))
    overlay[:, :, 3] = 0
    mask_tensors = []
    for ann in ordered:
        seg = ann["segmentation"]
        mask_tensors.append(torch.as_tensor(seg))
        rgba = np.concatenate([np.random.random(3), [0.35]])
        overlay[seg] = rgba
    axes.imshow(overlay)
    return torch.stack(mask_tensors)
3949
# Load the example image; cv2 reads BGR, so convert to RGB for plotting.
image = cv2.imread("dog.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


# Legacy segment-anything-fast setup kept for reference:
# from segment_anything_fast import sam_model_registry, sam_model_fast_registry, SamAutomaticMaskGenerator
#
# sam_checkpoint = "checkpoints/sam_vit_h_4b8939.pth"
# model_type = "vit_h"
device = "cuda"
#
# sam = sam_model_fast_registry[model_type](checkpoint=sam_checkpoint)
# sam.to(device=device)

from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from sam2.build_sam import build_sam2

# SAM2 (hiera-large) checkpoint and matching config name.
sam2_checkpoint = "checkpoints/sam2_hiera_large.pt"
model_cfg = "sam2_hiera_l.yaml"
@@ -66,7 +77,7 @@ def show_anns(anns):
6677## TODO: Implement mIoU to allow approximations.
6778# torch.set_float32_matmul_precision('high')
6879# torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
69- ##
80+ ##
7081
7182## TODO: Using CUDA graphs can cause numerical differences?
7283mask_generator .predictor .model .image_encoder = torch .compile (
@@ -93,24 +104,26 @@ def show_anns(anns):
93104)
94105
# with torch.backends.cuda.sdp_kernel(enable_cudnn=False): #, enable_math=False, enable_mem_efficient=False):
with torch.backends.cuda.sdp_kernel(
    enable_cudnn=True
):  # , enable_math=False, enable_mem_efficient=False):
    # Run thrice for warmup
    # NOTE(review): mask_generator is constructed in lines not shown here.
    masks = mask_generator.generate(image)
    masks = mask_generator.generate(image)
    masks = mask_generator.generate(image)

    # Save an example; figsize divided by 100 with dpi=100 presumably yields a
    # pixel-exact figure — confirm intent.
    plt.figure(figsize=(image.shape[1] / 100.0, image.shape[0] / 100.0), dpi=100)
    plt.imshow(image)
    ms = show_anns(masks)
    # Regression check against a previously saved reference mask stack.
    # NOTE(review): torch.testing.assert_allclose is deprecated in favor of
    # torch.testing.assert_close — consider updating.
    ms_ref = torch.load("dog_mask_fast.pt")
    torch.testing.assert_allclose(ms, ms_ref)
    print("Masks match reference")
    # # torch.save(ms, "dog_mask_fast.pt")
    plt.axis("off")
    plt.tight_layout()
    plt.savefig("dog_mask_fast.png", format="png")

    # Benchmark with CUDA events; synchronize first so prior GPU work
    # does not leak into the measurement.
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
@@ -120,14 +133,18 @@ def show_anns(anns):
120133 masks = mask_generator .generate (image )
    end_event.record()
    torch.cuda.synchronize()
    # Average latency in ms; divisor 10.0 presumably matches the number of
    # timed iterations in the loop above (not shown) — confirm.
    print(start_event.elapsed_time(end_event) / 10.0)

    # Save a GPU trace
    profiler_runner("amg_example_trace.json.gz", mask_generator.generate, image)

    # Write out memory usage: peak allocation as an absolute number and as a
    # percentage of total device memory.
    max_memory_allocated_bytes = torch.cuda.max_memory_allocated()
    _, total_memory = torch.cuda.mem_get_info()
    max_memory_allocated_percentage = int(
        100 * (max_memory_allocated_bytes / total_memory)
    )
    # Convert bytes -> MiB.
    max_memory_allocated_bytes = max_memory_allocated_bytes >> 20
    print(
        f"memory(MiB): {max_memory_allocated_bytes} memory(%): {max_memory_allocated_percentage}"
    )
0 commit comments