
Commit f2fbe6b

XiaobingSuper and Svetlana Karslioglu authored
update quantization tutorial by introducing x86 backend (#2081)
Co-authored-by: Svetlana Karslioglu <[email protected]>
1 parent edf145d commit f2fbe6b

File tree

6 files changed, +17 -12 lines changed


advanced_source/static_quantization_tutorial.rst

Lines changed: 5 additions & 3 deletions
@@ -458,7 +458,8 @@ quantizing for x86 architectures. This configuration does the following:
 per_channel_quantized_model = load_model(saved_model_dir + float_model_file)
 per_channel_quantized_model.eval()
 per_channel_quantized_model.fuse_model()
-per_channel_quantized_model.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
+# The old 'fbgemm' is still available but 'x86' is the recommended default.
+per_channel_quantized_model.qconfig = torch.ao.quantization.get_default_qconfig('x86')
 print(per_channel_quantized_model.qconfig)

 torch.ao.quantization.prepare(per_channel_quantized_model, inplace=True)
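For reference, a minimal self-contained sketch of the eager-mode post-training flow this hunk configures, using the 'x86' qconfig; the toy model and random calibration inputs are illustrative, not part of the commit or the tutorial:

    import torch
    import torch.nn as nn

    class TinyNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.quant = torch.ao.quantization.QuantStub()
            self.fc = nn.Linear(16, 4)
            self.dequant = torch.ao.quantization.DeQuantStub()

        def forward(self, x):
            return self.dequant(self.fc(self.quant(x)))

    model = TinyNet().eval()
    model.qconfig = torch.ao.quantization.get_default_qconfig('x86')  # requires a recent PyTorch release
    torch.ao.quantization.prepare(model, inplace=True)
    for _ in range(8):                     # calibrate observers on representative inputs
        model(torch.randn(2, 16))
    torch.ao.quantization.convert(model, inplace=True)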
@@ -534,8 +535,9 @@ We fuse modules as before
 qat_model = load_model(saved_model_dir + float_model_file)
 qat_model.fuse_model()

-optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001)
-qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('fbgemm')
+optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001)
+# The old 'fbgemm' is still available but 'x86' is the recommended default.
+qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

 Finally, ``prepare_qat`` performs the "fake quantization", preparing the model for quantization-aware training
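Similarly, a self-contained sketch of the quantization-aware-training setup touched by this hunk, with the 'x86' QAT qconfig; the toy model and training loop are illustrative, not from the tutorial:

    import torch
    import torch.nn as nn

    qat_model = nn.Sequential(torch.ao.quantization.QuantStub(),
                              nn.Linear(16, 4),
                              torch.ao.quantization.DeQuantStub()).train()
    qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')
    torch.ao.quantization.prepare_qat(qat_model, inplace=True)

    optimizer = torch.optim.SGD(qat_model.parameters(), lr=0.0001)
    for _ in range(10):                    # a few fake-quantized training steps
        optimizer.zero_grad()
        loss = qat_model(torch.randn(8, 16)).pow(2).mean()
        loss.backward()
        optimizer.step()

    quantized_model = torch.ao.quantization.convert(qat_model.eval())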

beginner_source/vt_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -138,8 +138,8 @@
 # Now run the code below:
 #

-# Use 'fbgemm' for server inference and 'qnnpack' for mobile inference
-backend = "fbgemm" # replaced with qnnpack causing much worse inference speed for quantized model on this notebook
+# Use 'x86' for server inference (the old 'fbgemm' is still available but 'x86' is the recommended default) and 'qnnpack' for mobile inference.
+backend = "x86" # replaced with qnnpack causing much worse inference speed for quantized model on this notebook
 model.qconfig = torch.quantization.get_default_qconfig(backend)
 torch.backends.quantized.engine = backend
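One way to guard this choice at runtime is to check which quantized engines the installed PyTorch build supports; the fallback logic and stand-in model below are illustrative, not part of the tutorial:

    import torch
    import torch.nn as nn

    # Prefer 'x86' when the build offers it; otherwise fall back to the old 'fbgemm'.
    backend = "x86" if "x86" in torch.backends.quantized.supported_engines else "fbgemm"

    model = nn.Sequential(nn.Linear(8, 8)).eval()   # stand-in for the tutorial's model
    model.qconfig = torch.quantization.get_default_qconfig(backend)
    torch.backends.quantized.engine = backend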

prototype_source/fx_graph_mode_ptq_dynamic.py

Lines changed: 3 additions & 2 deletions
@@ -18,7 +18,8 @@
 from torch.quantization.quantize_fx import prepare_fx, convert_fx

 float_model.eval()
-qconfig = get_default_qconfig("fbgemm")
+# The old 'fbgemm' is still available but 'x86' is the recommended default.
+qconfig = get_default_qconfig("x86")
 qconfig_mapping = QConfigMapping().set_global(qconfig)
 prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs)  # fuse modules and insert observers
 # no calibration is required for dynamic quantization
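As a companion to this hunk, a self-contained sketch of the prepare_fx/convert_fx skeleton; the toy model is illustrative, and default_dynamic_qconfig is used here so that, as the comment above notes, no calibration pass is needed:

    import torch
    import torch.nn as nn
    from torch.ao.quantization import default_dynamic_qconfig, QConfigMapping
    from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx

    float_model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4)).eval()
    example_inputs = (torch.randn(1, 16),)

    qconfig_mapping = QConfigMapping().set_object_type(nn.Linear, default_dynamic_qconfig)
    prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs)
    quantized_model = convert_fx(prepared_model)   # no calibration needed for dynamic quantization
    print(quantized_model(torch.randn(1, 16)))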
@@ -288,4 +289,4 @@ def time_model_evaluation(model, test_data):
 # 3. Conclusion
 # -------------
 # This tutorial introduces the api for post training dynamic quantization in FX Graph Mode,
-# which dynamically quantizes the same modules as Eager Mode Quantization.
+# which dynamically quantizes the same modules as Eager Mode Quantization.

prototype_source/fx_graph_mode_ptq_static.rst

Lines changed: 5 additions & 3 deletions
@@ -17,7 +17,8 @@ tldr; The FX Graph Mode API looks like the following:
 from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
 from torch.ao.quantization import QConfigMapping
 float_model.eval()
-qconfig = get_default_qconfig("fbgemm")
+# The old 'fbgemm' is still available but 'x86' is the recommended default.
+qconfig = get_default_qconfig("x86")
 qconfig_mapping = QConfigMapping().set_global(qconfig)
 def calibrate(model, data_loader):
     model.eval()
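For reference, a self-contained sketch of the full tldr flow with the 'x86' qconfig, including a calibration pass; the toy model and calibration data are illustrative, not from the tutorial:

    import torch
    import torch.nn as nn
    from torch.ao.quantization import get_default_qconfig, QConfigMapping
    from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx

    float_model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 8, 3)).eval()
    example_inputs = (torch.randn(1, 3, 32, 32),)
    calibration_data = [torch.randn(1, 3, 32, 32) for _ in range(8)]

    qconfig_mapping = QConfigMapping().set_global(get_default_qconfig("x86"))
    prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs)

    def calibrate(model, data_loader):
        model.eval()
        with torch.no_grad():
            for image in data_loader:
                model(image)

    calibrate(prepared_model, calibration_data)    # populate observers with representative data
    quantized_model = convert_fx(prepared_model)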
@@ -256,7 +257,8 @@ while those for ``QConfigMapping`` can be found in the `qconfig_mapping <https:/

 .. code:: python

-    qconfig = get_default_qconfig("fbgemm")
+    # The old 'fbgemm' is still available but 'x86' is the recommended default.
+    qconfig = get_default_qconfig("x86")
     qconfig_mapping = QConfigMapping().set_global(qconfig)

 5. Prepare the Model for Post Training Static Quantization
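Beyond set_global, QConfigMapping also supports finer-grained overrides; the module type and submodule name below are illustrative placeholders, not lines from the tutorial:

    import torch.nn as nn
    from torch.ao.quantization import get_default_qconfig, QConfigMapping

    qconfig_mapping = (
        QConfigMapping()
        .set_global(get_default_qconfig("x86"))   # default for the whole model
        .set_object_type(nn.LSTM, None)           # skip quantization for a module type
        .set_module_name("fc", None)              # skip quantization for a named submodule
    )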
@@ -406,4 +408,4 @@ Running the model in AIBench (with single threading) gives the following result:

 As we can see for resnet18 both FX graph mode and eager mode quantized model get similar speedup over the floating point model,
 which is around 2-4x faster than the floating point model. But the actual speedup over floating point model may vary
-depending on model, device, build, input batch sizes, threading etc.
+depending on model, device, build, input batch sizes, threading etc.

prototype_source/fx_numeric_suite_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ def plot(xdata, ydata, xlabel, ylabel, title):

 # create quantized model
 qconfig_dict = {
-    '': torch.quantization.get_default_qconfig('fbgemm'),
+    '': torch.quantization.get_default_qconfig('x86'),  # The old 'fbgemm' is still available but 'x86' is the recommended default.
     # adjust the qconfig to make the results more interesting to explore
     'module_name': [
         # turn off quantization for the first couple of layers
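A sketch of how such a qconfig_dict could be completed; the 'module_name' entries and layer names below are assumptions for illustration, not lines from the tutorial:

    import torch

    qconfig_dict = {
        '': torch.quantization.get_default_qconfig('x86'),   # global default
        'module_name': [
            # turn off quantization for the first couple of layers (hypothetical names)
            ('conv1', None),
            ('layer1.0.conv1', None),
        ],
    }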

recipes_source/quantization.rst

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ After this, running `print_model_size(model_static_quantized)` shows the static
 A complete model definition and static quantization example is `here <https://pytorch.org/docs/stable/quantization.html#quantization-api-summary>`_. A dedicated static quantization tutorial is `here <https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html>`_.

 .. note::
-    To make the model run on mobile devices which normally have arm architecture, you need to use `qnnpack` for `backend`; to run the model on computer with x86 architecture, use `fbgemm`.
+    To make the model run on mobile devices which normally have arm architecture, you need to use `qnnpack` for `backend`; to run the model on a computer with x86 architecture, use `x86` (the old `fbgemm` is still available but `x86` is the recommended default).

 4. Quantization Aware Training
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
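A small sketch of acting on the note above by picking the backend from the target platform; the platform check and toy model are illustrative, not part of the recipe:

    import platform
    import torch
    import torch.nn as nn

    # ARM (typical for mobile) -> 'qnnpack'; x86 desktops and servers -> 'x86'.
    backend = "qnnpack" if platform.machine().lower() in ("arm64", "aarch64") else "x86"

    model = nn.Sequential(nn.Linear(8, 8)).eval()
    model.qconfig = torch.ao.quantization.get_default_qconfig(backend)
    torch.backends.quantized.engine = backend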
