Fix CI #1366 (Merged)

4 changes: 2 additions & 2 deletions tests/unit_tests/test_tokenizer.py
@@ -19,7 +19,7 @@
parametrize,
)

from torchtitan.components.tokenizer import build_hf_tokenizer
from torchtitan.components.tokenizer import HuggingFaceTokenizer


class TestTokenizerIntegration(unittest.TestCase):
@@ -278,7 +278,7 @@ def test_download_and_build_tokenizer(self, test_repo_id):
model_name = test_repo_id.split("/")[-1]
tokenizer_dir = "tokenizer" if model_name == "FLUX.1-dev" else "."
tokenizer_path = os.path.join(self.temp_dir, model_name, tokenizer_dir)
our_tokenizer = build_hf_tokenizer(tokenizer_path)
our_tokenizer = HuggingFaceTokenizer(tokenizer_path)

# Step 3: Load tokenizer using official Tokenizer library (if available)
official_tokenizer = None
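For orientation, here is a minimal sketch of the pattern the test now uses: constructing `HuggingFaceTokenizer` directly from a path instead of going through `build_hf_tokenizer` (which now expects a `JobConfig`). The fixture path below is a hypothetical stand-in, not a path from the repository.

```python
# Sketch only: direct construction from a tokenizer directory, as the unit test now does.
from torchtitan.components.tokenizer import HuggingFaceTokenizer

tokenizer_path = "./tests/assets/tokenizer"  # hypothetical fixture directory
tokenizer = HuggingFaceTokenizer(tokenizer_path)

# id_to_token appears in the tokenizer.py diff below; other methods follow the same API.
print(tokenizer.id_to_token(0))
```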
16 changes: 7 additions & 9 deletions torchtitan/components/tokenizer.py
@@ -8,10 +8,10 @@
import json
import os
from abc import ABC, abstractmethod
from typing import Any, Optional
from typing import Any, Optional, Union

from tokenizers import AddedToken, Tokenizer as HfTokenizer

from torchtitan.config_manager import JobConfig
from typing_extensions import override


@@ -407,20 +407,18 @@ def id_to_token(self, token_id: int) -> Optional[str]:
return self.tokenizer.id_to_token(token_id)


def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer:
def build_hf_tokenizer(
job_config: JobConfig,
) -> Union[HuggingFaceTokenizer, Tokenizer]:
"""
Builds a HuggingFaceTokenizer from the tokenizer path specified in the JobConfig.

This function creates a HuggingFaceTokenizer instance that handles BOS/EOS token
inference and intelligent encoding. The tokenizer automatically detects and loads
from various file formats and infers special token behavior.

Args:
tokenizer_path (str): Path to the directory containing tokenizer files.
Should contain one or more of the supported file types.

job_config (JobConfig): A JobConfig object containing the path to the tokenizer directory.
Returns:
tokenizer (HuggingFaceTokenizer): Loaded tokenizer instance with intelligent BOS/EOS handling
"""
tokenizer = HuggingFaceTokenizer(tokenizer_path)
tokenizer = HuggingFaceTokenizer(job_config.model.tokenizer_path)
return tokenizer
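To illustrate the new calling convention, a hedged usage sketch: `build_hf_tokenizer` now reads the path from `job_config.model.tokenizer_path`. The `SimpleNamespace` stand-in and the path value are assumptions for illustration only; in the trainer a real `JobConfig` is passed.

```python
# Minimal sketch of the new builder signature. Assumption: any object exposing
# `model.tokenizer_path` duck-types as JobConfig for this call, since the builder
# only reads that attribute.
from types import SimpleNamespace

from torchtitan.components.tokenizer import build_hf_tokenizer

job_config = SimpleNamespace(
    model=SimpleNamespace(tokenizer_path="./assets/tokenizer")  # hypothetical path
)
tokenizer = build_hf_tokenizer(job_config)
```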
2 changes: 1 addition & 1 deletion torchtitan/train.py
@@ -126,7 +126,7 @@ def __init__(self, job_config: JobConfig):

# build dataloader
tokenizer = (
self.train_spec.build_tokenizer_fn(job_config.model.tokenizer_path)
self.train_spec.build_tokenizer_fn(job_config)
if self.train_spec.build_tokenizer_fn is not None
else None
)
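Because the trainer now passes the whole `job_config` to `build_tokenizer_fn`, any custom builder registered on a train spec needs the matching signature. A hedged sketch (the function name is hypothetical):

```python
from torchtitan.components.tokenizer import HuggingFaceTokenizer
from torchtitan.config_manager import JobConfig


def build_my_tokenizer(job_config: JobConfig) -> HuggingFaceTokenizer:
    # The builder pulls the tokenizer path from the config rather than
    # receiving it as a bare string argument.
    return HuggingFaceTokenizer(job_config.model.tokenizer_path)
```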