@@ -22,7 +22,7 @@


 @dataclass
-class ModelArgs:
+class TransformerArgs:
     block_size: int = 2048
     vocab_size: int = 32000
     n_layers: int = 32
@@ -45,7 +45,7 @@ def __post_init__(self):
         if self.n_local_heads == -1:
             self.n_local_heads = self.n_heads
         if self.hidden_dim is None:
-            # If hidden_dim is not explicitly set in the ModelArgs,
+            # If hidden_dim is not explicitly set in the TransformerArgs,
             # then calculate implicitly based on dim and
             # also multiple of `args.multiple_of`
             multiple_of = self.multiple_of
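
A note on the fallback the comment above describes: Llama-style configs typically derive the FFN width from `dim` with a 4x expansion, scaled by roughly 2/3 for SwiGLU, then rounded up to `multiple_of`. A minimal sketch of that calculation, assuming those constants; `default_hidden_dim` is a hypothetical helper, not part of this diff:

    def default_hidden_dim(dim: int, multiple_of: int = 256) -> int:
        # Assumed heuristic: 4 * dim, shrunk to ~2/3 (SwiGLU uses three
        # projections instead of two), then rounded up to `multiple_of`.
        hidden_dim = int(2 * (4 * dim) / 3)
        return ((hidden_dim + multiple_of - 1) // multiple_of) * multiple_of

    assert default_hidden_dim(4096) == 11008  # the familiar Llama-2 7B FFN width
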
@@ -73,7 +73,7 @@ def from_params(cls, params_path):
     def from_table(cls, name: str):
         json_path = config_path / f"{name}.json"
         if json_path.is_file():
-            return ModelArgs.from_params(json_path)
+            return TransformerArgs.from_params(json_path)
         else:
             known_model_params = [
                 config.replace(".json", "") for config in os.listdir(config_path)
@@ -86,7 +86,7 @@ def from_table(cls, name: str):
     def from_name(cls, name: str):
         json_path = config_path / f"{name}.json"
         if Path(json_path).is_file():
-            return ModelArgs.from_params(json_path)
+            return TransformerArgs.from_params(json_path)

         known_model_params = [
             config.replace(".json", "") for config in os.listdir(config_path)
@@ -113,7 +113,7 @@ def from_name(cls, name: str):
             f"Unknown model directory name {name}. Must be one of {known_model_params}."
         )

-        return ModelArgs.from_params(config_path / f"{config[0]}.json")
+        return TransformerArgs.from_params(config_path / f"{config[0]}.json")


 class KVCache(nn.Module):
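
For readers skimming past the rename: the `update(self, input_pos, k_val, v_val)` signature in the next hunk header points at the usual static-cache pattern, where preallocated K/V buffers are written in place at the positions being decoded. A minimal sketch under that assumption (buffer shapes and dtype are guesses, not taken from this diff):

    import torch
    import torch.nn as nn

    class KVCacheSketch(nn.Module):
        def __init__(self, max_batch, max_seq, n_heads, head_dim, dtype=torch.bfloat16):
            super().__init__()
            shape = (max_batch, n_heads, max_seq, head_dim)
            self.register_buffer("k_cache", torch.zeros(shape, dtype=dtype))
            self.register_buffer("v_cache", torch.zeros(shape, dtype=dtype))

        def update(self, input_pos, k_val, v_val):
            # input_pos: [S]; k_val / v_val: [B, n_heads, S, head_dim]
            self.k_cache[:, :, input_pos] = k_val
            self.v_cache[:, :, input_pos] = v_val
            return self.k_cache, self.v_cache
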
@@ -145,7 +145,7 @@ def update(self, input_pos, k_val, v_val):


 class Transformer(nn.Module):
-    def __init__(self, config: ModelArgs) -> None:
+    def __init__(self, config: TransformerArgs) -> None:
         super().__init__()
         self.config = config

@@ -203,15 +203,15 @@ def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:

     @classmethod
     def from_name(cls, name: str):
-        return cls(ModelArgs.from_name(name))
+        return cls(TransformerArgs.from_name(name))

     @classmethod
     def from_table(cls, name: str):
-        return cls(ModelArgs.from_table(name))
+        return cls(TransformerArgs.from_table(name))

     @classmethod
     def from_params(cls, params_path: str):
-        return cls(ModelArgs.from_params(params_path))
+        return cls(TransformerArgs.from_params(params_path))

     @classmethod
     def from_gguf(cls, gguf_path: str, **kwargs):
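
All of these constructors now delegate to the renamed class, so call sites stay equivalent. Hypothetical usage ("Meta-Llama-3-8B" is a placeholder config name, not taken from this diff):

    # These two forms construct the same model after the rename.
    model = Transformer.from_name("Meta-Llama-3-8B")

    args = TransformerArgs.from_name("Meta-Llama-3-8B")
    model = Transformer(args)
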
@@ -224,7 +224,7 @@ def from_gguf(cls, gguf_path: str, **kwargs):


 class TransformerBlock(nn.Module):
-    def __init__(self, config: ModelArgs) -> None:
+    def __init__(self, config: TransformerArgs) -> None:
         super().__init__()
         self.attention = Attention(config)
         self.feed_forward = FeedForward(config)
@@ -240,7 +240,7 @@ def forward(


 class Attention(nn.Module):
-    def __init__(self, config: ModelArgs):
+    def __init__(self, config: TransformerArgs):
         super().__init__()
         assert config.dim % config.n_heads == 0

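
The assert in this hunk is what makes the per-head split well defined. Reading `n_local_heads` as the grouped-query K/V head count (an inference from the `__post_init__` hunk above, where -1 defaults it to `n_heads`), the projection widths work out as in this sketch; `qkv_widths` is a hypothetical helper:

    def qkv_widths(dim: int, n_heads: int, n_local_heads: int) -> tuple[int, int]:
        # The same guard as in Attention.__init__: each head needs an
        # integer width, head_dim = dim // n_heads.
        assert dim % n_heads == 0
        head_dim = dim // n_heads
        # Q keeps n_heads heads; K/V may use fewer (grouped-query attention).
        return n_heads * head_dim, n_local_heads * head_dim

    print(qkv_widths(4096, 32, 8))  # (4096, 1024): K/V projections are narrower
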
@@ -340,7 +340,7 @@ def forward(


 class FeedForward(nn.Module):
-    def __init__(self, config: ModelArgs) -> None:
+    def __init__(self, config: TransformerArgs) -> None:
         super().__init__()
         self.w1 = nn.Linear(config.dim, config.hidden_dim, bias=False)
         self.w2 = nn.Linear(config.hidden_dim, config.dim, bias=False)
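
The hunk cuts off after `w2`, but this layout usually comes with a third projection and a SiLU gate (SwiGLU). A sketch of the likely full block, with `w3` and the `forward` body assumed rather than shown in the diff:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class FeedForwardSketch(nn.Module):
        # Assumed completion: w1/w3 as gate/up projections, w2 as the
        # down projection, combined with a SiLU gate.
        def __init__(self, dim: int, hidden_dim: int) -> None:
            super().__init__()
            self.w1 = nn.Linear(dim, hidden_dim, bias=False)
            self.w2 = nn.Linear(hidden_dim, dim, bias=False)
            self.w3 = nn.Linear(dim, hidden_dim, bias=False)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.w2(F.silu(self.w1(x)) * self.w3(x))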