1- """Benchmarks for affine quantized tensor, this includes int8 dynamic quant, int8 weight only quant and int4 weight only quant APIs
2- """
1+ """Benchmarks for affine quantized tensor, this includes int8 dynamic quant, int8 weight only quant and int4 weight only quant APIs"""
2+
3+ import copy
4+
35import torch
6+
7+ from torchao .quantization .quant_api import (
8+ _replace_with_custom_fn_if_matches_filter ,
9+ int4_weight_only ,
10+ int8_dynamic_activation_int8_weight ,
11+ int8_weight_only ,
12+ quantize_ ,
13+ )
414from torchao .quantization .subclass import (
5- Int8WeightOnlyQuantizedLinearWeight ,
615 Int4WeightOnlyQuantizedLinearWeight ,
16+ Int8WeightOnlyQuantizedLinearWeight ,
717)
818from torchao .utils import (
919 TORCH_VERSION_AT_LEAST_2_4 ,
1020 TORCH_VERSION_AT_LEAST_2_5 ,
21+ unwrap_tensor_subclass ,
1122)
12- from torchao .quantization .quant_api import (
13- int4_weight_only ,
14- int8_weight_only ,
15- int8_dynamic_activation_int8_weight ,
16- quantize_ ,
17- _replace_with_custom_fn_if_matches_filter ,
18- )
19- import copy
20- from torchao .utils import unwrap_tensor_subclass
23+
2124
2225def _int8wo_api (mod , ** kwargs ):
2326 if TORCH_VERSION_AT_LEAST_2_4 :
@@ -27,14 +30,20 @@ def _int8wo_api(mod, **kwargs):
     else:
         change_linear_weights_to_int8_woqtensors(mod, **kwargs)
 
+
 def _int8da_int8w_api(mod, **kwargs):
     if TORCH_VERSION_AT_LEAST_2_4:
-        quantize_(mod, int8_dynamic_activation_int8_weight(**kwargs), set_inductor_config=False)
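+        # quantize_ rewrites the Linear weights in place with the
+        # int8 dynamic-activation / int8-weight subclass configuration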
+        quantize_(
+            mod,
+            int8_dynamic_activation_int8_weight(**kwargs),
+            set_inductor_config=False,
+        )
         if not TORCH_VERSION_AT_LEAST_2_5:
             unwrap_tensor_subclass(mod)
     else:
         change_linear_weights_to_int8_dqtensors(mod, **kwargs)
 
+
 def _int4wo_api(mod, **kwargs):
     if TORCH_VERSION_AT_LEAST_2_4:
         kwargs_copy = kwargs.copy()
@@ -47,31 +56,43 @@ def _int4wo_api(mod, **kwargs):
     else:
         change_linear_weights_to_int4_woqtensors(mod, **kwargs)
 
+
 class ToyLinearModel(torch.nn.Module):
-    """Single linear for m * k * n problem size
-    """
-    def __init__(self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"):
+    """Single linear for m * k * n problem size"""
+
+    def __init__(
+        self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"
+    ):
         super().__init__()
         self.m = m
         self.dtype = dtype
         self.device = device
-        self.linear = torch.nn.Linear(k, n, bias=has_bias).to(dtype=self.dtype, device=self.device)
+        self.linear = torch.nn.Linear(k, n, bias=has_bias).to(
+            dtype=self.dtype, device=self.device
+        )
 
     def example_inputs(self):
-        return (torch.randn(self.m, self.linear.in_features, dtype=self.dtype, device=self.device),)
+        return (
+            torch.randn(
+                self.m, self.linear.in_features, dtype=self.dtype, device=self.device
+            ),
+        )
 
     def forward(self, x):
         x = self.linear(x)
         return x
 
+
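+# ToyLinearModel example (hypothetical sizes): ToyLinearModel(m=20, n=2048, k=2048)
+# holds a Linear(2048, 2048) layer, and example_inputs() returns one (20, 2048)
+# tensor matching the linear's in_features=k.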
 def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
     """
     The deprecated implementation for int8 dynamic quant API, used as a reference for
     numerics and performance
     """
-    from torchao.quantization.quant_api import _in_features_greater_than_16
-    from torchao.quantization.quant_api import _is_linear
-    from torchao.quantization.quant_api import _get_subclass_inserter
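+    # underscore-prefixed helpers are private to quant_api; they are imported
+    # here only to drive this deprecated reference path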
+    from torchao.quantization.quant_api import (
+        _get_subclass_inserter,
+        _in_features_greater_than_16,
+        _is_linear,
+    )
     from torchao.quantization.subclass import Int8DynamicallyQuantizedLinearWeight
 
     if filter_fn is None:
@@ -80,40 +101,54 @@ def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs
         )
 
     _replace_with_custom_fn_if_matches_filter(
-        model, _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight, enable_parametrization=False, **kwargs), filter_fn
+        model,
+        _get_subclass_inserter(
+            Int8DynamicallyQuantizedLinearWeight, enable_parametrization=False, **kwargs
+        ),
+        filter_fn,
     )
 
+
 def _get_ref_change_linear_weights_to_woqtensors(deprecated_tensor_subclass):
     def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):
         """
         The deprecated implementation for weight only quant API, used as a reference for
         numerics and performance
         """
-        from torchao.quantization.quant_api import _is_linear
-        from torchao.quantization.quant_api import _get_subclass_inserter
+        from torchao.quantization.quant_api import _get_subclass_inserter, _is_linear
 
         filter_fn = kwargs.pop("filter_fn", _is_linear)
 
         _replace_with_custom_fn_if_matches_filter(
             model,
-            _get_subclass_inserter(deprecated_tensor_subclass, enable_parametrization=True, **kwargs),
+            _get_subclass_inserter(
+                deprecated_tensor_subclass, enable_parametrization=True, **kwargs
+            ),
             filter_fn,
         )
 
     return _ref_change_linear_weights_to_woqtensors
 
-_ref_change_linear_weights_to_int8_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int8WeightOnlyQuantizedLinearWeight)
-_ref_change_linear_weights_to_int4_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int4WeightOnlyQuantizedLinearWeight)
+
+_ref_change_linear_weights_to_int8_woqtensors = (
+    _get_ref_change_linear_weights_to_woqtensors(Int8WeightOnlyQuantizedLinearWeight)
+)
+_ref_change_linear_weights_to_int4_woqtensors = (
+    _get_ref_change_linear_weights_to_woqtensors(Int4WeightOnlyQuantizedLinearWeight)
+)
 
 
 torch._dynamo.config.cache_size_limit = 50000
 
+
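+# Times three compiled variants per (M, N, K) shape: the current quantize_ API
+# (api), the deprecated tensor-subclass implementation (ref_api), and the plain
+# bf16 model, all under torch.compile with max-autotune.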
 @torch.no_grad
 def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
     if kwargs is None:
         kwargs = {}
 
-    m = ToyLinearModel(M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda").eval()
+    m = ToyLinearModel(
+        M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda"
+    ).eval()
     m_bf16 = copy.deepcopy(m)
     m_ref = copy.deepcopy(m)
     example_inputs = m.example_inputs()
@@ -130,26 +165,30 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
 
     # perf comparison
     from torchao.utils import benchmark_model
+
     # warmup
     WARMUP = 20
     RUNS = 100
 
     torch._dynamo.reset()
-    m_ref = torch.compile(m_ref, mode='max-autotune', fullgraph=True)
+    m_ref = torch.compile(m_ref, mode="max-autotune", fullgraph=True)
     benchmark_model(m_ref, WARMUP, example_inputs)
     ref_elapsed_time = benchmark_model(m_ref, RUNS, example_inputs)
 
     torch._dynamo.reset()
-    m = torch.compile(m, mode='max-autotune', fullgraph=True)
+    m = torch.compile(m, mode="max-autotune", fullgraph=True)
     benchmark_model(m, WARMUP, example_inputs)
     elapsed_time = benchmark_model(m, RUNS, example_inputs)
 
     torch._dynamo.reset()
-    m_bf16 = torch.compile(m_bf16, mode='max-autotune', fullgraph=True)
+    m_bf16 = torch.compile(m_bf16, mode="max-autotune", fullgraph=True)
     benchmark_model(m_bf16, WARMUP, example_inputs)
     bf16_elapsed_time = benchmark_model(m_bf16, RUNS, example_inputs)
 
-    print(f"{(M, N, K)}: elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}, bf16 elapsed time: {bf16_elapsed_time}")
+    print(
+        f"{(M, N, K)}: elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}, bf16 elapsed time: {bf16_elapsed_time}"
+    )
+
 
 if __name__ == "__main__" and TORCH_VERSION_AT_LEAST_2_4 and torch.cuda.is_available():
     all_shapes = [
@@ -158,16 +197,25 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
     ]
     print("_int8da_int8w_api")
     from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
+
     for M, N, K in all_shapes:
-        _bench_quantized_tensor_subclass_perf(_int8da_int8w_api, _ref_change_linear_weights_to_int8_dqtensors, M, N, K)
+        _bench_quantized_tensor_subclass_perf(
+            _int8da_int8w_api, _ref_change_linear_weights_to_int8_dqtensors, M, N, K
+        )
 
     print("_int8wo_api")
     from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
+
     for M, N, K in all_shapes:
-        _bench_quantized_tensor_subclass_perf(_int8wo_api, _ref_change_linear_weights_to_int8_woqtensors, M, N, K)
+        _bench_quantized_tensor_subclass_perf(
+            _int8wo_api, _ref_change_linear_weights_to_int8_woqtensors, M, N, K
+        )
 
     print("_int4wo_api")
     kwargs = {"groupsize": 32}
     from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
+
     for M, N, K in all_shapes:
-        _bench_quantized_tensor_subclass_perf(_int4wo_api, _ref_change_linear_weights_to_int4_woqtensors, M, N, K, kwargs)
+        _bench_quantized_tensor_subclass_perf(
+            _int4wo_api, _ref_change_linear_weights_to_int4_woqtensors, M, N, K, kwargs
+        )
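+# Minimal sketch (hypothetical shape; requires CUDA and torch >= 2.4) for calling
+# the benchmark helper directly instead of running the __main__ sweep:
+#
+#   _bench_quantized_tensor_subclass_perf(
+#       _int8wo_api, _ref_change_linear_weights_to_int8_woqtensors, 20, 2048, 2048
+#   )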