 #
 # **Required pip Dependencies**
 #
-# - ``torch >= 1.14``
+# - ``torch >= 2.0``
 # - ``torchvision``
 # - ``numpy``
 # - ``scipy``
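
(Annotation, not part of the diff: a quick sanity check, using only the public torch API, that an environment matches the bumped requirement above. ``torch.compile`` only ships with PyTorch 2.0 and later, and the GPU speedup sections of the tutorial assume CUDA is available.)

    import torch

    print(torch.__version__)              # should report 2.0 or newer
    print(hasattr(torch, "compile"))      # torch.compile exists only in 2.x
    print(torch.cuda.is_available())      # the GPU timing sections assume this is True
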
 
 import torch
 
-import torch._inductor.config
-torch._inductor.config.cpp.cxx = ("g++",)
-
 def foo(x, y):
     a = torch.sin(x)
     b = torch.cos(x)
@@ -133,6 +130,11 @@ def evaluate(mod, inp):
     return mod(inp)
 
 model = init_model()
+
+# Reset since we are using a different mode.
+import torch._dynamo
+torch._dynamo.reset()
+
 evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")
 
 inp = generate_data(16)[0]
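
(Annotation, not part of the diff: a minimal sketch of the reset-then-recompile pattern the hunk above introduces. The function ``g`` and the tensors are hypothetical; ``torch._dynamo.reset()`` clears Dynamo's cached compilation state so that compiling the same code again under a different mode starts from scratch.)

    import torch
    import torch._dynamo

    def g(x):
        return torch.sin(x) + torch.cos(x)

    g_default = torch.compile(g)                            # default mode
    g_default(torch.randn(8))

    torch._dynamo.reset()                                   # drop previously compiled artifacts
    g_overhead = torch.compile(g, mode="reduce-overhead")   # recompile under the new mode
    g_overhead(torch.randn(8))
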
@@ -174,8 +176,7 @@ def evaluate(mod, inp):
 
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
-# results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
+# results in a significant speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
@@ -231,9 +232,8 @@ def train(mod, data):
 
 ######################################################################
 # Again, we can see that ``torch.compile`` takes longer in the first
-# iteration, as it must compile the model, but afterward, we see
-# significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 2.2x speedup.
+# iteration, as it must compile the model, but in subsequent iterations, we see
+# significant speedups compared to eager.
 
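
(Annotation, not part of the diff: a hedged sketch of how the "slow first iteration, fast steady state" behaviour described above can be observed. The toy function and sizes are hypothetical, and a fair GPU measurement would also call ``torch.cuda.synchronize()`` around each step.)

    import time
    import torch

    def toy_step(x):
        return torch.nn.functional.relu(x @ x)

    compiled_step = torch.compile(toy_step)
    x = torch.randn(256, 256)
    for i in range(3):
        start = time.perf_counter()
        compiled_step(x)
        print(f"iteration {i}: {time.perf_counter() - start:.4f}s")  # iteration 0 includes compile time
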
 ######################################################################
 # Comparison to TorchScript and FX Tracing
@@ -297,6 +297,9 @@ def test_fns(fn1, fn2, args):
 # Now we can see that ``torch.compile`` correctly handles
 # data-dependent control flow.
 
+# Reset since we are using a different mode.
+torch._dynamo.reset()
+
 compile_f1 = torch.compile(f1)
 print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2)))
 print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2)))
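
(Annotation, not part of the diff: a hypothetical illustration, not the tutorial's ``f1``/``f2``, of the kind of value-dependent branch meant here. Tracing-based approaches record only the branch taken at trace time, while ``torch.compile`` guards on the condition and stays correct for both paths.)

    import torch

    def branchy(x, y):
        if x.sum() < 0:           # branch depends on runtime tensor values
            return -y
        return y

    compiled_branchy = torch.compile(branchy)
    a, b = torch.randn(8), torch.randn(8)
    assert torch.equal(compiled_branchy(a, b), branchy(a, b))
    assert torch.equal(compiled_branchy(-a.abs(), b), branchy(-a.abs(), b))
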
@@ -394,7 +397,6 @@ def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor])
     gm.graph.print_tabular()
     return gm.forward
 
-import torch._dynamo
 # Reset since we are using a different backend.
 torch._dynamo.reset()
 
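
(Annotation, not part of the diff: a self-contained sketch of how a debugging backend like ``custom_backend`` above gets hooked up. The lambda and input are illustrative; ``torch.compile`` accepts any callable with this ``(gm, example_inputs)`` signature via ``backend=``.)

    from typing import List
    import torch
    import torch._dynamo

    def inspect_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
        gm.graph.print_tabular()   # show the FX graph Dynamo captured
        return gm.forward          # run it unmodified

    torch._dynamo.reset()
    opt_fn = torch.compile(lambda x: torch.cos(x) + 1, backend=inspect_backend)
    opt_fn(torch.randn(4))
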
@@ -489,4 +491,4 @@ def bar(a, b):
 # In this tutorial, we introduced ``torch.compile`` by covering
 # basic usage, demonstrating speedups over eager mode, comparing to previous
 # PyTorch compiler solutions, and briefly investigating TorchDynamo and its interactions
-# with FX graphs. We hope that you will give ``torch.compile`` a try!
+# with FX graphs. We hope that you will give ``torch.compile`` a try!