Fix CI breakage

H-Huang · H-Huang · commit 64d17a6c8936 · 2025-07-03T10:45:46.000-07:00
diff --git a/torchtitan/components/tokenizer.py b/torchtitan/components/tokenizer.py
@@ -8,10 +8,10 @@
 import json
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 from tokenizers import AddedToken, Tokenizer as HfTokenizer
-
+from torchtitan.config_manager import JobConfig
 from typing_extensions import override
 
 
@@ -407,20 +407,18 @@ def id_to_token(self, token_id: int) -> Optional[str]:
         return self.tokenizer.id_to_token(token_id)
 
 
-def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer:
+def build_hf_tokenizer(
+    job_config: JobConfig,
+) -> Union[HuggingFaceTokenizer, Tokenizer]:
     """
     Builds a HuggingFaceTokenizer from the specified path.
-
     This function creates a HuggingFaceTokenizer instance that handles BOS/EOS token
     inference and intelligent encoding. The tokenizer automatically detects and loads
     from various file formats and infers special token behavior.
-
     Args:
-        tokenizer_path (str): Path to the directory containing tokenizer files.
-                             Should contain one or more of the supported file types.
-
+        job_config (JobConfig): Job configuration whose ``model.tokenizer_path`` is the path to the tokenizer directory.
     Returns:
         tokenizer (HuggingFaceTokenizer): Loaded tokenizer instance with intelligent BOS/EOS handling
     """
-    tokenizer = HuggingFaceTokenizer(tokenizer_path)
+    tokenizer = HuggingFaceTokenizer(job_config.model.tokenizer_path)
     return tokenizer
diff --git a/torchtitan/train.py b/torchtitan/train.py
@@ -126,7 +126,7 @@ def __init__(self, job_config: JobConfig):
 
         # build dataloader
         tokenizer = (
-            self.train_spec.build_tokenizer_fn(job_config.model.tokenizer_path)
+            self.train_spec.build_tokenizer_fn(job_config)
             if self.train_spec.build_tokenizer_fn is not None
             else None
         )

Original file line number	Diff line number	Diff line change
`@@ -126,7 +126,7 @@ def __init__(self, job_config: JobConfig):`
`126`	`126`
`127`	`127`	`# build dataloader`
`128`	`128`	`tokenizer = (`
`129`		`- self.train_spec.build_tokenizer_fn(job_config.model.tokenizer_path)`
	`129`	`+ self.train_spec.build_tokenizer_fn(job_config)`
`130`	`130`	`if self.train_spec.build_tokenizer_fn is not None`
`131`	`131`	`else None`
`132`	`132`	`)`