diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py
index 8efd481678..9abbefdade 100644
--- a/tests/unit_tests/test_tokenizer.py
+++ b/tests/unit_tests/test_tokenizer.py
@@ -19,7 +19,7 @@
     parametrize,
 )
 
-from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.components.tokenizer import HuggingFaceTokenizer
 
 
 class TestTokenizerIntegration(unittest.TestCase):
@@ -278,7 +278,7 @@ def test_download_and_build_tokenizer(self, test_repo_id):
         model_name = test_repo_id.split("/")[-1]
         tokenizer_dir = "tokenizer" if model_name == "FLUX.1-dev" else "."
         tokenizer_path = os.path.join(self.temp_dir, model_name, tokenizer_dir)
-        our_tokenizer = build_hf_tokenizer(tokenizer_path)
+        our_tokenizer = HuggingFaceTokenizer(tokenizer_path)
 
         # Step 3: Load tokenizer using official Tokenizer library (if available)
         official_tokenizer = None
diff --git a/torchtitan/components/tokenizer.py b/torchtitan/components/tokenizer.py
index abeb292194..6ff7eaaf7c 100644
--- a/torchtitan/components/tokenizer.py
+++ b/torchtitan/components/tokenizer.py
@@ -8,10 +8,10 @@
 import json
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 from tokenizers import AddedToken, Tokenizer as HfTokenizer
-
+from torchtitan.config_manager import JobConfig
 from typing_extensions import override
 
 
@@ -407,20 +407,22 @@ def id_to_token(self, token_id: int) -> Optional[str]:
         return self.tokenizer.id_to_token(token_id)
 
 
-def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer:
+def build_hf_tokenizer(
+    job_config: JobConfig,
+) -> Union[HuggingFaceTokenizer, Tokenizer]:
     """
     Builds a HuggingFaceTokenizer from the specified path.
 
     This function creates a HuggingFaceTokenizer instance that handles BOS/EOS token
     inference and intelligent encoding. The tokenizer automatically detects and loads
     from various file formats and infers special token behavior.
 
     Args:
-        tokenizer_path (str): Path to the directory containing tokenizer files.
-                             Should contain one or more of the supported file types.
+        job_config (JobConfig): Job configuration; the tokenizer files are loaded
+            from the directory given by ``job_config.model.tokenizer_path``.
 
     Returns:
         tokenizer (HuggingFaceTokenizer): Loaded tokenizer instance with intelligent BOS/EOS handling
     """
-    tokenizer = HuggingFaceTokenizer(tokenizer_path)
+    tokenizer = HuggingFaceTokenizer(job_config.model.tokenizer_path)
     return tokenizer
diff --git a/torchtitan/train.py b/torchtitan/train.py
index f4b4062d8d..e6a1ffa7d1 100644
--- a/torchtitan/train.py
+++ b/torchtitan/train.py
@@ -126,7 +126,7 @@ def __init__(self, job_config: JobConfig):
 
         # build dataloader
         tokenizer = (
-            self.train_spec.build_tokenizer_fn(job_config.model.tokenizer_path)
+            self.train_spec.build_tokenizer_fn(job_config)
             if self.train_spec.build_tokenizer_fn is not None
             else None
         )