diff --git a/setup.py b/setup.py index ebe9503..3fb9d12 100644 --- a/setup.py +++ b/setup.py @@ -15,17 +15,18 @@ # limitations under the License. from pathlib import Path - from setuptools import find_packages, setup -__version__ = "0.0.4" +version_vars = {} +exec(open('tokenicer/version.py').read(), version_vars) +tokenicer_version = version_vars['__version__'] with open("requirements.txt") as f: requirements = f.read().splitlines() setup( name="tokenicer", - version=__version__, + version=tokenicer_version, author="ModelCloud", author_email="qubitium@modelcloud.ai", description="A (nicer) tokenizer you want to use for model `inference` and `training`: with all known peventable `gotchas` normalized or auto-fixed.", diff --git a/tests/test_encode_decode.py b/tests/test_encode_decode.py index 72b81f8..768ec22 100644 --- a/tests/test_encode_decode.py +++ b/tests/test_encode_decode.py @@ -67,4 +67,4 @@ def test_decode(self): self.example, example, msg=f"Expected example: `{self.example}`, actual=`{example}`." - ) + ) \ No newline at end of file diff --git a/tests/test_model_config.py b/tests/test_model_config.py index adceb71..440842c 100644 --- a/tests/test_model_config.py +++ b/tests/test_model_config.py @@ -38,4 +38,4 @@ def test_model_config(self): tokenicer.model_config.eos_token_id, expect_eos_token_id, msg=f"Expected eos_token_id: `{expect_eos_token_id}`, actual=`{tokenicer.model_config.eos_token_id}`." - ) + ) \ No newline at end of file diff --git a/tests/test_pad_token.py b/tests/test_pad_token.py index 0b84b9b..27a1523 100644 --- a/tests/test_pad_token.py +++ b/tests/test_pad_token.py @@ -52,4 +52,4 @@ def test_pad_token(self, tokenicer.tokenizer.pad_token, expect_pad_token, msg=f"Expected pad_token: `{expect_pad_token}`, actual=`{tokenicer.tokenizer.pad_token}`." 
- ) + ) \ No newline at end of file diff --git a/tests/test_validate.py b/tests/test_validate.py new file mode 100644 index 0000000..29336ee --- /dev/null +++ b/tests/test_validate.py @@ -0,0 +1,39 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +from tokenicer import Tokenicer +from tokenicer.const import VALIDATE_JSON_FILE_NAME +import tempfile + + +class TestValidate(unittest.TestCase): + def test_validate(self): + model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct" + tokenicer = Tokenicer.load(model_path) + + with tempfile.TemporaryDirectory() as tmpdir: + tokenicer.save(tmpdir) + validate_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME) + result = os.path.isfile(validate_json_path) + self.assertTrue( + result, + f"Save validate file failed: {validate_json_path} does not exist.", + ) + + validate = tokenicer.validate(tmpdir) + self.assertTrue(validate, f"Expected validate='True' but got '{validate}'.") diff --git a/tokenicer/config.py b/tokenicer/config.py new file mode 100644 index 0000000..c0336d1 --- /dev/null +++ b/tokenicer/config.py @@ -0,0 +1,87 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union, Any, Dict +from dataclasses import dataclass +from enum import Enum + + +class ValidateDataFormat(Enum): + SIMPLE = "simple" + + +@dataclass +class ValidateData: + format: ValidateDataFormat = ValidateDataFormat.SIMPLE + input: Union[str, Any] = None + output: List[int] = None + + def __post_init__(self): + if self.input is None: + self.input = [] + + if self.output is None: + self.output = [] + + +@dataclass +class ValidateMeta: + validator: str = None + uri: str = None + + def __post_init__(self): + if self.validator is None: + from .version import __version__ + + self.validator = f"tokenicer:{__version__}" + + if self.uri is None: + self.uri = "https://github.com/ModelCloud/Tokenicer" + + +@dataclass +class ValidateConfig: + meta: Optional[ValidateMeta] = None + data: List[ValidateData] = None + + def __post_init__(self): + if self.meta is None: + self.meta = ValidateMeta() + + if self.data is None: + self.data = [] + + def to_dict(self): + dataset_dict = [ + { + "format": data.format.value, + "input": data.input, + "output": data.output, + } + for data in self.data + ] + + meta_dict = {"validator": self.meta.validator, "uri": self.meta.uri} + + return {"meta": meta_dict, "data": dataset_dict} + + @classmethod + def from_dict(cls, data: Dict): + meta_data = data.get("meta", {}) + data_list = data.get("data", []) + meta = ValidateMeta(**meta_data) if meta_data else None + validate_data = [ValidateData(**item) for item in data_list] + return cls(meta=meta, data=validate_data) diff --git 
a/tokenicer/const.py b/tokenicer/const.py index f32bb85..51403eb 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -17,20 +17,62 @@ from collections import namedtuple DEFAULT_PAD_TOKENS = [ - "<|finetune_right_pad_id|>", - "<|pad|>", - "", - "<|unk|>", - "" + "<|finetune_right_pad_id|>", + "<|pad|>", + "", + "<|unk|>", + "", ] TOKEN_TUPLE = namedtuple("TokenTuple", ["token", "token_id"]) MODEL_PAD_TOKEN_MAP = { - "llama": TOKEN_TUPLE(token='<|finetune_right_pad_id|>', token_id=128004), - "qwen2_5_vl": TOKEN_TUPLE(token='<|vision_pad|>', token_id=151654), - "qwen2_vl": TOKEN_TUPLE(token='<|vision_pad|>', token_id=151654), - "qwen2": TOKEN_TUPLE(token='<|fim_pad|>', token_id=151662), - "deepseek_v3": TOKEN_TUPLE(token='<|▁pad▁|>', token_id=2), - "mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1) + "llama": TOKEN_TUPLE(token="<|finetune_right_pad_id|>", token_id=128004), + "qwen2_5_vl": TOKEN_TUPLE(token="<|vision_pad|>", token_id=151654), + "qwen2_vl": TOKEN_TUPLE(token="<|vision_pad|>", token_id=151654), + "qwen2": TOKEN_TUPLE(token="<|fim_pad|>", token_id=151662), + "deepseek_v3": TOKEN_TUPLE(token="<|▁pad▁|>", token_id=2), + "mpt": TOKEN_TUPLE(token="<|padding|>", token_id=1), } + +VALIDATE_JSON_FILE_NAME = "tokenizer_validate.json" +VALIDATE_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} + +VALIDATE_DATASETS = [ + # English + "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", + "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", + "Let's do it:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? 
I can also work with exponents (e.g., 2^10 or 5.1^3.2).", + "Let's break it down:\n\n1. **Balancing the chemical equation:** The unbalanced equation is: \n H₂ + O₂ → H₂O. To balance it, we need 2 molecules of H₂ and 1 molecule of O₂ to form 2 molecules of H₂O: \n **2H₂ + O₂ → 2H₂O.**\n\n2. **Area of a circle:** The formula for the area of a circle is \( A = \pi r^2 \). With a radius of 5.7 cm, the area is approximately: \n \( A = 3.14159 \times (5.7)^2 = 102.041 \, \text{cm}^2.\)\n\n3. **Molar mass of NaCl:** Sodium chloride (NaCl) consists of one sodium (Na) atom and one chlorine (Cl) atom. The atomic masses are approximately: \n Na = 22.99 g/mol, Cl = 35.45 g/mol. So, the molar mass of NaCl is: \n **22.99 g/mol + 35.45 g/mol = 58.44 g/mol.**", + # Simplified Chinese + "在一个清晨,阳光透过窗帘缝隙洒在床单上,空气里弥漫着刚煮好的咖啡香。街道还很安静,偶尔有几只鸟儿在枝头跳跃。", + "2025年,科技的发展速度令人惊叹!\n量子计算机的计算能力已达到10¹⁰次操作每秒,\n而ChatGPT模型的推理速度是传统计算机的100倍以上。\n公式E=mc²揭示了质量和能量的关系。\n今天的任务包括:\n1. 完成项目报告\n2. 参加9:00的会议\n3. 下午2:00开始的代码审查\n别忘了,创新与效率是成功的关键!", + # Traditional Chinese + "2025年,科技的發展速度讓人驚訝!\n量子電腦的計算能力已達到 10¹⁰ 次操作每秒,\n而ChatGPT模型的推理速度是傳統電腦的100倍以上。\n例如,愛因斯坦的著名公式 E = mc²,\n揭示了質量和能量之間的關係。\n化學中,水的化學式 H₂O 代表著每個分子包含兩個氫原子和一個氧原子。\n今日的工作清單如下:\n1. 完成數學模型的推導:x² + 3x - 4 = 0\n2. 實驗室研究化學反應:2H₂ + O₂ → 2H₂O\n3. 進行下午3:00的會議\n每一步,都是知識積累的過程。", + # French + "Le matin, lorsque le soleil se lève lentement à l'horizon, la ville semble encore endormie. Les rues sont calmes, seules quelques personnes marchent rapidement pour commencer leur journée. L'air est frais, et les arbres, bien que dépouillés de leurs feuilles en hiver, semblent toujours veiller sur la ville. J'aime prendre un moment pour observer ce silence paisible avant que le bruit de la journée ne commence à envahir l'espace. Parfois, il suffit de quelques instants pour se reconnecter à soi-même et à l'instant présent.", + # German + "In der modernen Softwareentwicklung ist es wichtig, effizienten Code zu schreiben. 
Zum Beispiel kann ein einfacher `for`-Loop in Python wie folgt aussehen: ```python\nfor i in range(10):\n print(i)\n``` Dieser Code gibt die Zahlen von 0 bis 9 aus. Es ist auch entscheidend, den Code so zu optimieren, dass er sowohl lesbar als auch schnell ist. Ein gut strukturierter Code trägt zu einer besseren Wartbarkeit bei und reduziert die Wahrscheinlichkeit von Fehlern.", + # Spanish + '# Este es un ejemplo de código en Python\ndef saludar(nombre):\n print(f"¡Hola, {nombre}!")\n\n# Llamada a la función\nsaludar("Juan")', + # Arabic + "الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة.", + # Russian + "Привет! Как дела? Я рад познакомиться с тобой. Надеюсь, у тебя хороший день!", + # Danish + "Danmark er et smukt land med en rig kultur og historie. Det er kendt for sine maleriske landskaber, hyggelige byer og venlige mennesker. København, hovedstaden, er en moderne metropol, der samtidig bevarer sin historiske charme. Danmark har også en stærk tradition for bæredygtighed og innovation.", + # Portuguese + "Hoje está um dia lindo, perfeito para um passeio no parque.", + # Indonesian + "Selamat pagi! Apa kabar? Saya harap hari Anda menyenankan. Jika ada sesuatu yang bisa saya bantu, silakan beri tahu saya.", + # Italian + "La cucina italiana è famosa in tutto il mondo per la sua varietà e i suoi sapori deliziosi. Ogni regione ha le sue specialità uniche, ma piatti come la pasta, la pizza e il gelato sono amati da tutti. Mangiare in Italia non è solo un pasto, ma un'esperienza sociale che coinvolge amici e familiari.", + # Vietnamese + "Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?", + # Polish + "Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. 
Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!", + # Japanese + "今日はとても良い天気ですね。朝から青空が広がっていて、散歩に出かけるのにぴったりな日です。最近、忙しくてなかなか外に出る時間がなかったので、今日はゆっくりと自然の中でリラックスしたいと思っています。", + # Korean + "오늘은 정말 좋은 날씨네요. 아침부터 맑은 하늘이 펼쳐져 있고, 산책을 하기에 딱 좋은 날이에요. 요즘 바빠서 밖에 나갈 시간이 없었는데, 오늘은 자연 속에서 여유롭게 시간을 보내고 싶어요.", +] diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 38bd7c7..6b68ce2 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -14,13 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import logging -from typing import List, Optional, Union - +from typing import List, Optional, Union, Tuple from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase - from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP from .util import auto_config, candidate_id, config_path +from .validate import _save, _validate, _validate_file_exist logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -32,17 +32,26 @@ def __init__(self): pass @classmethod - def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], strict: bool = False, pad_tokens: Optional[List[Union[str, int]]] = None, **kwargs): + def load( + cls, + pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], + strict: bool = False, + pad_tokens: Optional[List[Union[str, int]]] = None, + **kwargs, + ): if pretrained_model_name_or_path is None: raise ValueError("Tokenicer: `pretrained_model_name_or_path` cannot be `None`.") - trust_remote_code = kwargs.get('trust_remote_code', False) + trust_remote_code = kwargs.get("trust_remote_code", False) if isinstance(pretrained_model_name_or_path, PreTrainedTokenizerBase): tokenizer = pretrained_model_name_or_path path = config_path(tokenizer) elif isinstance(pretrained_model_name_or_path, str): - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + tokenizer = 
AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, **kwargs + ) + logger.debug("Tokenicer: loaded tokenizer: %s", tokenizer) if isinstance(tokenizer, PreTrainedTokenizerBase): path = pretrained_model_name_or_path else: @@ -62,10 +71,14 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] tokenizer_cls = type(tokenizer) tokenicer_cls_wrapper = type(f"{tokenizer_cls.__name__}", (cls, tokenizer_cls), {}) + t = tokenicer_cls_wrapper() t.tokenizer = tokenizer t.model_config = model_config t.auto_fix_pad_token(strict=strict, pad_tokens=pad_tokens) + exist, _ = _validate_file_exist(tokenizer) + if exist and t.validate(): + logger.info("Tokenicer validate successful!") return t def auto_fix_pad_token( @@ -76,12 +89,15 @@ ): if model_or_path is not None: if isinstance(model_or_path, str): - model_config = auto_config(model_or_path, self.tokenizer.trust_remote_code) + model_config = auto_config( + model_or_path, self.tokenizer.trust_remote_code + ) elif isinstance(model_or_path, PreTrainedModel): model_config = getattr(model_or_path, "config", None) else: raise ValueError( - f"Tokenicer: Unsupported `model_or_path` type: Expected `str` or `PreTrainedModel`, actual = `{type(model_or_path)}`.") + f"Tokenicer: Unsupported `model_or_path` type: Expected `str` or `PreTrainedModel`, actual = `{type(model_or_path)}`." 
+ ) if model_config is None: raise ValueError("Tokenicer: Can not retrieve config from the provided `model_or_path`.") @@ -98,8 +114,13 @@ def auto_fix_pad_token( pad_token_id = model_config.pad_token_id - if pad_token_id is None or pad_token_id in [model_config.bos_token_id, model_config.eos_token_id]: - pad_token_id = self._auto_map_pad_token(model_config=model_config, pad_tokens=pad_tokens) + if pad_token_id is None or pad_token_id in [ + model_config.bos_token_id, + model_config.eos_token_id, + ]: + pad_token_id = self._auto_map_pad_token( + model_config=model_config, pad_tokens=pad_tokens + ) if not strict: if pad_token_id is None and self.tokenizer.eos_token_id is not None: @@ -130,7 +151,10 @@ def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]: pad_token_id = candidate_id(pad_tokens, vocab) # Match MODEL_PAD_TOKEN_MAP to get pad token - if pad_token_id is None and MODEL_PAD_TOKEN_MAP.get(model_config.model_type, None) is not None: + if ( + pad_token_id is None + and MODEL_PAD_TOKEN_MAP.get(model_config.model_type, None) is not None + ): token_tuple = MODEL_PAD_TOKEN_MAP.get(model_config.model_type) pad_token = token_tuple.token token_id = vocab.get(pad_token, None) @@ -143,7 +167,10 @@ def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]: # Use eos_token as pad token if pad_token_id is None: - if isinstance(model_config.eos_token_id, list) and model_config.eos_token_id: + if ( + isinstance(model_config.eos_token_id, list) + and model_config.eos_token_id + ): pad_token_id = model_config.eos_token_id[0] else: pad_token_id = model_config.eos_token_id @@ -159,6 +186,27 @@ def auto_fix_model_config(self, model_config): model_config.eos_token = self.tokenizer.eos_token model_config.eos_token_id = self.tokenizer.eos_token_id + def save( + self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True + ) -> str: + return _save( + save_dir=save_dir, + tokenizer=self.tokenizer, + use_chat_template=use_chat_template, 
+ ) + + def validate(self, save_dir: Union[str, os.PathLike] = None) -> bool: + return _validate(self.tokenizer, save_dir=save_dir) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + use_chat_template: bool = True, + **kwargs, + ) -> Tuple[str]: + self.save(save_dir=save_directory, use_chat_template=use_chat_template) + return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs) + def __getattribute__(self, name): try: return super().__getattribute__("tokenizer").__getattribute__(name) diff --git a/tokenicer/util.py b/tokenicer/util.py index 97fcc20..2e90261 100644 --- a/tokenicer/util.py +++ b/tokenicer/util.py @@ -14,12 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import string from typing import List, Optional, Union - from transformers import AutoConfig, PretrainedConfig -def candidate_ids(token_list: List[Union[str, int]], vocab: dict) -> List[Optional[int]]: +def candidate_ids( + token_list: List[Union[str, int]], vocab: dict +) -> List[Optional[int]]: token_ids = [] for item in token_list: if isinstance(item, str): @@ -48,3 +51,24 @@ def auto_config(path, trust_remote) -> Optional[PretrainedConfig]: if isinstance(config, PretrainedConfig): model_config = config return model_config + + +def all_special_characters(): + # Get punctuation characters + punctuation_chars = string.punctuation + + # Get whitespace characters (such as space, newline, tab, etc.) 
+ whitespace_chars = string.whitespace + + # Common mathematical symbols and operators (manually added) + math_and_operators = "+-*/=<>%&^|!~" + + # Combine all special characters into a single string + all_special_chars = punctuation_chars + whitespace_chars + math_and_operators + + # Return the combined string + return all_special_chars + + +def isfile(path): + return os.path.isfile(path) diff --git a/tokenicer/validate.py b/tokenicer/validate.py new file mode 100644 index 0000000..551058f --- /dev/null +++ b/tokenicer/validate.py @@ -0,0 +1,124 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +from transformers import PreTrainedTokenizerBase +from typing import Union, Optional + +from .util import config_path, all_special_characters, isfile +from .const import VALIDATE_JSON_FILE_NAME, VALIDATE_ENCODE_PARAMS, VALIDATE_DATASETS +from .config import ValidateConfig, ValidateData + + +def _validate_file_exist(tokenizer): + path = config_path(tokenizer) + if path is None: + raise ValueError( + "Can not retrieve config path from the provided `pretrained_model_name_or_path`." 
+ ) + + validate_json_path = os.path.join(path, VALIDATE_JSON_FILE_NAME) + return isfile(validate_json_path), validate_json_path + + +def _save( + save_dir: Union[str, os.PathLike], + tokenizer: PreTrainedTokenizerBase, + use_chat_template: bool = True, +) -> str: + os.makedirs(save_dir, exist_ok=True) + + validate_json_path = os.path.join(save_dir, VALIDATE_JSON_FILE_NAME) + exist = isfile(validate_json_path) + if exist: + import logging + + logger = logging.getLogger(__name__) + logger.warning(f"Validate file:{validate_json_path} already exists.") + return validate_json_path + + if use_chat_template and tokenizer.chat_template is None: + import logging + + logger = logging.getLogger(__name__) + logger.warning("Tokenizer does not support chat template.") + use_chat_template = False + + datasets = VALIDATE_DATASETS + [all_special_characters()] + + prompts = [] + if use_chat_template: + for data in datasets: + message = [{"role": "user", "content": data}] + prompt = tokenizer.apply_chat_template( + message, add_generation_prompt=False, tokenize=False + ).rstrip() + prompts.append(prompt) + else: + prompts = datasets + + results = [] + for prompt in prompts: + tokenized = tokenizer.encode_plus(prompt, **VALIDATE_ENCODE_PARAMS) + output = tokenized["input_ids"].tolist()[0] + data = ValidateData(input=prompt, output=output) + results.append(data) + + validate_dic = ValidateConfig(data=results).to_dict() + + with open(validate_json_path, "w", encoding="utf-8") as f: + json.dump(validate_dic, f, indent=4) + f.write("\n") + return validate_json_path + + +def _validate( + tokenizer: PreTrainedTokenizerBase, + save_dir: Optional[Union[str, os.PathLike]] = None, +) -> bool: + exist = False + + if save_dir is not None: + validate_json_path = os.path.join(save_dir, VALIDATE_JSON_FILE_NAME) + exist = isfile(validate_json_path) + + if not exist: + exist, validate_json_path = _validate_file_exist(tokenizer) + if not exist: + raise ValueError( + "Validate file does not
exist, please call the `save()` API first." + ) + + with open(validate_json_path, "r", encoding="utf-8") as f: + data = json.loads(f.read()) + + config = ValidateConfig.from_dict(data) + + if config is None or len(config.data) == 0: + raise ValueError( + f"Init validate data failed, please check {validate_json_path}." + ) + + for data in config.data: + input = data.input + tokenized = tokenizer.encode_plus(input, **VALIDATE_ENCODE_PARAMS)[ + "input_ids" + ].tolist()[0] + if data.output != tokenized: + return False + + return True diff --git a/tokenicer/version.py b/tokenicer/version.py new file mode 100644 index 0000000..62631ca --- /dev/null +++ b/tokenicer/version.py @@ -0,0 +1,17 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.0.4"