From 81ce78ca7d47fa39f43769ff4cd66a08bf10d1c2 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 06:19:04 +0000 Subject: [PATCH 01/32] Add verify API --- tokenicer/const.py | 5 +++- tokenicer/tokenicer.py | 65 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index 6c8ac90..6aa0cc3 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -33,4 +33,7 @@ "qwen2": TOKEN_TUPLE(token='<|fim_pad|>', token_id=151662), "deepseek_v3": TOKEN_TUPLE(token='<|▁pad▁|>', token_id=2), "mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1) -} \ No newline at end of file +} + +INPUT_KEY = "input" +TENSOR_KEY = "tensor" \ No newline at end of file diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 405fe18..9d95cd0 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -14,11 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import json import logging from typing import Union, List, Optional from transformers import PreTrainedTokenizerBase, PreTrainedModel, AutoTokenizer from .util import candidate_id, config_path, auto_config -from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP +from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP, INPUT_KEY, TENSOR_KEY logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -28,6 +30,9 @@ class Tokenicer: tokenizer: Union[str, PreTrainedTokenizerBase] = None model_config = None + encode_params = {"return_tensors": "pt", "add_special_tokens": False} + VERIFY_JSON_FILE_NAME = "tokenizer_verify.jsonl" + @classmethod def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], strict: bool = False, pad_tokens: Optional[List[Union[str, int]]] = None, **kwargs): if pretrained_model_name_or_path is None: @@ -158,6 +163,64 @@ def auto_fix_model_config(self, model_config): model_config.eos_token = self.tokenizer.eos_token model_config.eos_token_id = self.tokenizer.eos_token_id + def save_verify(self, prompts: Union[str, List[str]]): + exist, verify_json_path = self._verify_file_exist() + if exist: + logger.warning("The verification file already exists.") + return + + if prompts is None: + raise ValueError("`prompts` cannot be None") + + if not isinstance(prompts, str) and not isinstance(prompts, list): + raise ValueError( + f"Unsupported `prompts` type: Expected `str` or `List[str]`, actual = `{type(prompts)}`.") + + if isinstance(prompts, str): + prompts = [prompts] + + if len(prompts) == 0: + raise ValueError("len(prompts) == 0, `prompts` must be greater than 0") + + results = [] + for prompt in prompts: + tokenized = self.tokenizer.encode_plus(prompt, **self.encode_params) + jsonl = {INPUT_KEY: prompt, TENSOR_KEY: tokenized["input_ids"].tolist()} + results.append(jsonl) + + with open(verify_json_path, 'w') as f: + for item in results: + json.dump(item, f) + f.write('\n') + + def verify(self) -> bool: + exist, verify_json_path = self._verify_file_exist() + if not exist: + raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") + + with open(verify_json_path, 'r', encoding='utf-8') as file: + for line in file: + json_obj = json.loads(line) + + input_text = json_obj[INPUT_KEY] + tensor = json_obj[TENSOR_KEY] + + tokenized = self.tokenizer.encode_plus(input_text, **self.encode_params) + if tensor != tokenized["input_ids"].tolist(): + return False + return True + + def 
_verify_file_exist(self): + path = config_path(self.tokenizer) + if path is None: + raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") + + verify_json_path = os.path.join(path, self.VERIFY_JSON_FILE_NAME) + + if os.path.isfile(verify_json_path): + return True, verify_json_path + return False, None + def __getattr__(self, name): if hasattr(self.tokenizer, name): return getattr(self.tokenizer, name) From 3cd941fb223394beb08b1d924c94aa5d711b9c27 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 06:25:28 +0000 Subject: [PATCH 02/32] code clean up --- tokenicer/const.py | 5 ++++- tokenicer/tokenicer.py | 18 +++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index 6aa0cc3..8b2c2e4 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -35,5 +35,8 @@ "mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1) } +VERIFY_JSON_FILE_NAME = "tokenizer_verify.jsonl" +VERIFY_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} + INPUT_KEY = "input" -TENSOR_KEY = "tensor" \ No newline at end of file +TENSOR_KEY = "tensor" diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 9d95cd0..b432a79 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -20,7 +20,14 @@ from typing import Union, List, Optional from transformers import PreTrainedTokenizerBase, PreTrainedModel, AutoTokenizer from .util import candidate_id, config_path, auto_config -from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP, INPUT_KEY, TENSOR_KEY +from .const import ( + DEFAULT_PAD_TOKENS, + MODEL_PAD_TOKEN_MAP, + INPUT_KEY, + TENSOR_KEY, + VERIFY_JSON_FILE_NAME, + VERIFY_ENCODE_PARAMS +) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -30,9 +37,6 @@ class Tokenicer: tokenizer: Union[str, PreTrainedTokenizerBase] = None model_config = None - encode_params = {"return_tensors": "pt", "add_special_tokens": False} - VERIFY_JSON_FILE_NAME = "tokenizer_verify.jsonl" - @classmethod def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], strict: bool = False, pad_tokens: Optional[List[Union[str, int]]] = None, **kwargs): if pretrained_model_name_or_path is None: @@ -184,7 +188,7 @@ def save_verify(self, prompts: Union[str, List[str]]): results = [] for prompt in prompts: - tokenized = self.tokenizer.encode_plus(prompt, **self.encode_params) + tokenized = self.tokenizer.encode_plus(prompt, **VERIFY_ENCODE_PARAMS) jsonl = {INPUT_KEY: prompt, TENSOR_KEY: tokenized["input_ids"].tolist()} results.append(jsonl) @@ -205,7 +209,7 @@ def verify(self) -> bool: input_text = json_obj[INPUT_KEY] tensor = json_obj[TENSOR_KEY] - tokenized = self.tokenizer.encode_plus(input_text, **self.encode_params) + tokenized = self.tokenizer.encode_plus(input_text, **VERIFY_ENCODE_PARAMS) if tensor != tokenized["input_ids"].tolist(): return False return True @@ -215,7 +219,7 @@ def _verify_file_exist(self): if path is None: raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") - verify_json_path = os.path.join(path, self.VERIFY_JSON_FILE_NAME) + verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) if os.path.isfile(verify_json_path): return True, verify_json_path From ae82929dc24ee5e434dfd4b5d30b556668c779c7 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 06:57:57 +0000 Subject: [PATCH 03/32] add test code --- tests/test_verify.py | 46 
++++++++++++++++++++++++++++++++++++++++++
 tokenicer/tokenicer.py |  5 +++--
 2 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_verify.py

diff --git a/tests/test_verify.py b/tests/test_verify.py
new file mode 100644
index 0000000..cea9996
--- /dev/null
+++ b/tests/test_verify.py
@@ -0,0 +1,46 @@
+# Copyright 2025 ModelCloud.ai
+# Copyright 2025 qubitium@modelcloud.ai
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+from tokenicer import Tokenicer
+
+
+class TestVerify(unittest.TestCase):
+
+    def test_verify(self):
+        model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct"
+        tokenicer = Tokenicer.load(model_path)
+        messages = [{"role": "user", "content": "Test Case String"}, {"role": "assistant", "content": "Test"}]
+        prompts = tokenicer.apply_chat_template(
+            messages, add_generation_prompt=False, tokenize=False
+        ).rstrip()
+
+        verify_json_path = tokenicer.save_verify(prompts=prompts)
+        result = os.path.isfile(verify_json_path)
+        self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.")
+
+        result = tokenicer.verify()
+        self.assertTrue(result, f"Verify file failed")
+
+        if os.path.isfile(verify_json_path):
+            os.remove(verify_json_path)
+
+
+
+
+
+
diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py
index b432a79..88a22ad 100644
--- a/tokenicer/tokenicer.py
+++ b/tokenicer/tokenicer.py
@@ -171,7 +171,7 @@ def save_verify(self, prompts: Union[str, List[str]]):
         exist, verify_json_path = self._verify_file_exist()
         if exist:
             logger.warning("The verification file already exists.")
-            return
+            return verify_json_path
 
         if prompts is None:
             raise ValueError("`prompts` cannot be None")
@@ -196,6 +196,7 @@ def save_verify(self, prompts: Union[str, List[str]]):
             for item in results:
                 json.dump(item, f)
                 f.write('\n')
+        return verify_json_path
 
     def verify(self) -> bool:
         exist, verify_json_path = self._verify_file_exist()
@@ -223,7 +224,7 @@ def _verify_file_exist(self):
 
         if os.path.isfile(verify_json_path):
             return True, verify_json_path
-        return False, None
+        return False, verify_json_path
 
     def __getattr__(self, name):
         if hasattr(self.tokenizer, name):

From 224e1d7c8b7ce0754089c86f7712da7301507847 Mon Sep 17 00:00:00 2001
From: CL-ModelCloud
Date: Tue, 11 Feb 2025 07:37:46 +0000
Subject: [PATCH 04/32] move verify logic to validate

---
 tokenicer/tokenicer.py | 69 ++------------------------------
 tokenicer/validate.py  | 86 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 65 deletions(-)
 create mode 100644 tokenicer/validate.py

diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py
index 88a22ad..9184699 100644
--- a/tokenicer/tokenicer.py
+++ b/tokenicer/tokenicer.py
@@ -14,20 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os -import json import logging from typing import Union, List, Optional from transformers import PreTrainedTokenizerBase, PreTrainedModel, AutoTokenizer from .util import candidate_id, config_path, auto_config -from .const import ( - DEFAULT_PAD_TOKENS, - MODEL_PAD_TOKEN_MAP, - INPUT_KEY, - TENSOR_KEY, - VERIFY_JSON_FILE_NAME, - VERIFY_ENCODE_PARAMS -) +from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP +from .validate import _save_verify, _verify logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -168,63 +160,10 @@ def auto_fix_model_config(self, model_config): model_config.eos_token_id = self.tokenizer.eos_token_id def save_verify(self, prompts: Union[str, List[str]]): - exist, verify_json_path = self._verify_file_exist() - if exist: - logger.warning("The verification file already exists.") - return verify_json_path - - if prompts is None: - raise ValueError("`prompts` cannot be None") - - if not isinstance(prompts, str) and not isinstance(prompts, list): - raise ValueError( - f"Unsupported `prompts` type: Expected `str` or `List[str]`, actual = `{type(prompts)}`.") - - if isinstance(prompts, str): - prompts = [prompts] - - if len(prompts) == 0: - raise ValueError("len(prompts) == 0, `prompts` must be greater than 0") - - results = [] - for prompt in prompts: - tokenized = self.tokenizer.encode_plus(prompt, **VERIFY_ENCODE_PARAMS) - jsonl = {INPUT_KEY: prompt, TENSOR_KEY: tokenized["input_ids"].tolist()} - results.append(jsonl) - - with open(verify_json_path, 'w') as f: - for item in results: - json.dump(item, f) - f.write('\n') - return verify_json_path + return _save_verify(prompts, self.tokenizer) def verify(self) -> bool: - exist, verify_json_path = self._verify_file_exist() - if not exist: - raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") - - with open(verify_json_path, 'r', encoding='utf-8') as file: - for line in file: - json_obj = json.loads(line) - - input_text = json_obj[INPUT_KEY] - tensor = json_obj[TENSOR_KEY] - - tokenized = self.tokenizer.encode_plus(input_text, **VERIFY_ENCODE_PARAMS) - if tensor != tokenized["input_ids"].tolist(): - return False - return True - - def _verify_file_exist(self): - path = config_path(self.tokenizer) - if path is None: - raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") - - verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) - - if os.path.isfile(verify_json_path): - return True, verify_json_path - return False, verify_json_path + return _verify(self.tokenizer) def __getattr__(self, name): if hasattr(self.tokenizer, name): diff --git a/tokenicer/validate.py b/tokenicer/validate.py new file mode 100644 index 0000000..30fbb54 --- /dev/null +++ b/tokenicer/validate.py @@ -0,0 +1,86 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import json +from typing import Union, List +from .util import config_path +from .const import VERIFY_JSON_FILE_NAME, INPUT_KEY, TENSOR_KEY, VERIFY_ENCODE_PARAMS + + +def _verify_file_exist(tokenizer): + path = config_path(tokenizer) + if path is None: + raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") + + verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) + + if os.path.isfile(verify_json_path): + return True, verify_json_path + return False, verify_json_path + + +def _save_verify(prompts: Union[str, List[str]], tokenizer): + exist, verify_json_path = _verify_file_exist(tokenizer) + if exist: + logger.warning("The verification file already exists.") + return verify_json_path + + if prompts is None: + raise ValueError("`prompts` cannot be None") + + if not isinstance(prompts, str) and not isinstance(prompts, list): + raise ValueError( + f"Unsupported `prompts` type: Expected `str` or `List[str]`, actual = `{type(prompts)}`.") + + if isinstance(prompts, str): + prompts = [prompts] + + if len(prompts) == 0: + raise ValueError("len(prompts) == 0, `prompts` must be greater than 0") + + results = [] + for prompt in prompts: + tokenized = tokenizer.encode_plus(prompt, **VERIFY_ENCODE_PARAMS) + jsonl = {INPUT_KEY: prompt, TENSOR_KEY: tokenized["input_ids"].tolist()} + results.append(jsonl) + + with open(verify_json_path, 'w') as f: + for item in results: + json.dump(item, f) + f.write('\n') + return verify_json_path + + +def _verify(tokenizer) -> bool: + exist, verify_json_path = _verify_file_exist(tokenizer) + if not exist: + raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") + + with open(verify_json_path, 'r', encoding='utf-8') as file: + for line in file: + json_obj = json.loads(line) + + input_text = json_obj[INPUT_KEY] + tensor = json_obj[TENSOR_KEY] + + tokenized = tokenizer.encode_plus(input_text, **VERIFY_ENCODE_PARAMS) + if tensor != tokenized["input_ids"].tolist(): + return False + return True + + + From 9d56dd8387fcdf207ff3eded2a0bb0893b4990b2 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 08:53:58 +0000 Subject: [PATCH 05/32] Add config & verify datasets --- setup.py | 6 +++-- tokenicer/config.py | 52 +++++++++++++++++++++++++++++++++++++++++++ tokenicer/const.py | 34 ++++++++++++++++++++++++++++ tokenicer/validate.py | 7 +++--- tokenicer/version.py | 17 ++++++++++++++ 5 files changed, 110 insertions(+), 6 deletions(-) create mode 100644 tokenicer/config.py create mode 100644 tokenicer/version.py diff --git a/setup.py b/setup.py index 8571aa8..89aedb3 100644 --- a/setup.py +++ b/setup.py @@ -17,14 +17,16 @@ from setuptools import setup, find_packages from pathlib import Path -__version__ = "0.1.0-dev" +version_vars = {} +exec("exec(open('tokenicer/version.py').read()); version=__version__", {}, version_vars) +tokenicer_version = version_vars['version'] with open("requirements.txt") as f: requirements = f.read().splitlines() setup( name="tokenicer", - version=__version__, + version=tokenicer_version, author="ModelCloud", author_email="qubitium@modelcloud.ai", description="A (nicer) tokenizer you want to use for model `inference` and `training`: with all known peventable `gotchas` normalized or auto-fixed.", diff --git a/tokenicer/config.py b/tokenicer/config.py new file mode 100644 index 0000000..e05928c --- /dev/null +++ b/tokenicer/config.py @@ -0,0 +1,52 @@ +from typing import List + + +class VerifyData: + format: str + input: str + output: 
List[int] + + def __init__(self, input: str, output: List[int], format: str = 'simple'): + self.format = format + self.input = input + self.output = output + + +class VerifyMeta: + validator: str + url: str + + def __init__(self, version, url): + self.validator = version + self.url = url + + +class VerifyConfig: + meta: VerifyMeta + datasets: List[VerifyData] + + def __init__(self, datasets: List[VerifyData], meta: VerifyMeta = None): + if meta is None: + from .version import __version__ + meta = VerifyMeta(version=__version__, url='https://github.com/ModelCloud/Tokenicer') + self.meta = meta + self.datasets = datasets + + def to_dict(self): + dataset_dict = [ + { + 'format': data.format, + 'input': data.input, + 'output': data.output, + } for data in self.datasets + ] + + meta_dict = { + 'validator': self.meta.validator, + 'url': self.meta.url + } + + return { + 'meta': meta_dict, + 'dataset': dataset_dict + } diff --git a/tokenicer/const.py b/tokenicer/const.py index 8b2c2e4..4548410 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -40,3 +40,37 @@ INPUT_KEY = "input" TENSOR_KEY = "tensor" + +VERIFY_DATASETS = [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", +] + +VERIFY_CHAT_TEMPLAT_DATASETS = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Can you help me with a writing prompt?"}, + {"role": "assistant", "content": "Sure! I'd be happy to help. What kind of writing prompt are you looking for?"} + ], + [ + {"role": "system", "content": "Please provide a comprehensive response, covering various aspects—syntax, grammar, punctuation, etc. (We can discuss in detail.)"}, + {"role": "user", "content": "What is the difference between commas, semicolons, and colons? Can you give me some examples?"}, + {"role": "assistant", "content": "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' 
A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'"} + ], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], + [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], +] diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 30fbb54..663559b 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -18,7 +18,7 @@ import json from typing import Union, List from .util import config_path -from .const import VERIFY_JSON_FILE_NAME, INPUT_KEY, TENSOR_KEY, VERIFY_ENCODE_PARAMS +from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, INPUT_KEY, TENSOR_KEY def _verify_file_exist(tokenizer): @@ -36,6 +36,8 @@ def _verify_file_exist(tokenizer): def _save_verify(prompts: Union[str, List[str]], tokenizer): exist, verify_json_path = _verify_file_exist(tokenizer) if exist: + import logging + logger = logging.getLogger(__name__) logger.warning("The verification file already exists.") return verify_json_path @@ -81,6 +83,3 @@ def _verify(tokenizer) -> bool: if tensor != tokenized["input_ids"].tolist(): return False return True - - - diff --git a/tokenicer/version.py b/tokenicer/version.py new file mode 100644 index 0000000..75b8039 --- /dev/null +++ b/tokenicer/version.py @@ -0,0 +1,17 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__version__ = "0.1.0-dev" From 38e220a0bf45f856eb2d25923708639a64616484 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 08:58:25 +0000 Subject: [PATCH 06/32] code review --- tokenicer/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tokenicer/config.py b/tokenicer/config.py index e05928c..4bf7950 100644 --- a/tokenicer/config.py +++ b/tokenicer/config.py @@ -16,8 +16,8 @@ class VerifyMeta: validator: str url: str - def __init__(self, version, url): - self.validator = version + def __init__(self, validator, url): + self.validator = validator self.url = url @@ -28,7 +28,7 @@ class VerifyConfig: def __init__(self, datasets: List[VerifyData], meta: VerifyMeta = None): if meta is None: from .version import __version__ - meta = VerifyMeta(version=__version__, url='https://github.com/ModelCloud/Tokenicer') + meta = VerifyMeta(validator=f"tokenicer:{__version__}", url='https://github.com/ModelCloud/Tokenicer') self.meta = meta self.datasets = datasets From 52360297d372dd1428e838b11639852a4904b7c5 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 09:00:49 +0000 Subject: [PATCH 07/32] update verify datasets --- tokenicer/const.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index 4548410..440d756 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -42,6 +42,9 @@ TENSOR_KEY = "tensor" VERIFY_DATASETS = [ + "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", + "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", + "Let's break it down:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).", "", "", "", @@ -49,28 +52,4 @@ "", "", "", - "", - "", - "", -] - -VERIFY_CHAT_TEMPLAT_DATASETS = [ - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Can you help me with a writing prompt?"}, - {"role": "assistant", "content": "Sure! I'd be happy to help. What kind of writing prompt are you looking for?"} - ], - [ - {"role": "system", "content": "Please provide a comprehensive response, covering various aspects—syntax, grammar, punctuation, etc. (We can discuss in detail.)"}, - {"role": "user", "content": "What is the difference between commas, semicolons, and colons? Can you give me some examples?"}, - {"role": "assistant", "content": "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' 
A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'"} - ], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], - [{"role": "system", "content": ""}, {"role": "user", "content": ""}, {"role": "assistant", "content": ""}], -] +] \ No newline at end of file From 4207fde369147c43d91f655e86f14bcfb45da51a Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 09:23:32 +0000 Subject: [PATCH 08/32] update verify datasets --- tokenicer/const.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index 440d756..55423f5 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -42,14 +42,21 @@ TENSOR_KEY = "tensor" VERIFY_DATASETS = [ + # English "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", - "Let's break it down:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).", - "", - "", - "", - "", - "", - "", - "", + "Let's do it:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).", + "Let's break it down:\n\n1. **Balancing the chemical equation:** The unbalanced equation is: \n H₂ + O₂ → H₂O. To balance it, we need 2 molecules of H₂ and 1 molecule of O₂ to form 2 molecules of H₂O: \n **2H₂ + O₂ → 2H₂O.**\n\n2. **Area of a circle:** The formula for the area of a circle is \( A = \pi r^2 \). With a radius of 5.7 cm, the area is approximately: \n \( A = 3.14159 \times (5.7)^2 = 102.041 \, \text{cm}^2.\)\n\n3. **Molar mass of NaCl:** Sodium chloride (NaCl) consists of one sodium (Na) atom and one chlorine (Cl) atom. The atomic masses are approximately: \n Na = 22.99 g/mol, Cl = 35.45 g/mol. So, the molar mass of NaCl is: \n **22.99 g/mol + 35.45 g/mol = 58.44 g/mol.**", + # Chinese + "在一个清晨,阳光透过窗帘缝隙洒在床单上,空气里弥漫着刚煮好的咖啡香。街道还很安静,偶尔有几只鸟儿在枝头跳跃。", + "2025年,科技的发展速度令人惊叹!\n量子计算机的计算能力已达到10¹⁰次操作每秒,\n而ChatGPT模型的推理速度是传统计算机的100倍以上。\n公式E=mc²揭示了质量和能量的关系。\n今天的任务包括:\n1. 完成项目报告\n2. 参加9:00的会议\n3. 
下午2:00开始的代码审查\n别忘了,创新与效率是成功的关键!", + "2025年,科技的發展速度讓人驚訝!\n量子電腦的計算能力已達到 10¹⁰ 次操作每秒,\n而ChatGPT模型的推理速度是傳統電腦的100倍以上。\n例如,愛因斯坦的著名公式 E = mc²,\n揭示了質量和能量之間的關係。\n化學中,水的化學式 H₂O 代表著每個分子包含兩個氫原子和一個氧原子。\n今日的工作清單如下:\n1. 完成數學模型的推導:x² + 3x - 4 = 0\n2. 實驗室研究化學反應:2H₂ + O₂ → 2H₂O\n3. 進行下午3:00的會議\n每一步,都是知識積累的過程。", + # Franch + "Le matin, lorsque le soleil se lève lentement à l'horizon, la ville semble encore endormie. Les rues sont calmes, seules quelques personnes marchent rapidement pour commencer leur journée. L'air est frais, et les arbres, bien que dépouillés de leurs feuilles en hiver, semblent toujours veiller sur la ville. J'aime prendre un moment pour observer ce silence paisible avant que le bruit de la journée ne commence à envahir l'espace. Parfois, il suffit de quelques instants pour se reconnecter à soi-même et à l'instant présent.", + # German + "In der modernen Softwareentwicklung ist es wichtig, effizienten Code zu schreiben. Zum Beispiel kann ein einfacher `for`-Loop in Python wie folgt aussehen: ```python\nfor i in range(10):\n print(i)\n``` Dieser Code gibt die Zahlen von 0 bis 9 aus. Es ist auch entscheidend, den Code so zu optimieren, dass er sowohl lesbar als auch schnell ist. Ein gut strukturierter Code trägt zu einer besseren Wartbarkeit bei und reduziert die Wahrscheinlichkeit von Fehlern.", + # Spanish + "# Este es un ejemplo de código en Python\ndef saludar(nombre):\n print(f\"¡Hola, {nombre}!\")\n\n# Llamada a la función\nsaludar(\"Juan\")", + # Arabic + "الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة." ] \ No newline at end of file From 2ee9c7aac78ceb5d05bfc613734e70d4bd4b6ff9 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 09:59:03 +0000 Subject: [PATCH 09/32] verify datasets opt --- tokenicer/const.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index 55423f5..cfe7191 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -47,16 +47,31 @@ "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", "Let's do it:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).", "Let's break it down:\n\n1. **Balancing the chemical equation:** The unbalanced equation is: \n H₂ + O₂ → H₂O. To balance it, we need 2 molecules of H₂ and 1 molecule of O₂ to form 2 molecules of H₂O: \n **2H₂ + O₂ → 2H₂O.**\n\n2. **Area of a circle:** The formula for the area of a circle is \( A = \pi r^2 \). With a radius of 5.7 cm, the area is approximately: \n \( A = 3.14159 \times (5.7)^2 = 102.041 \, \text{cm}^2.\)\n\n3. **Molar mass of NaCl:** Sodium chloride (NaCl) consists of one sodium (Na) atom and one chlorine (Cl) atom. The atomic masses are approximately: \n Na = 22.99 g/mol, Cl = 35.45 g/mol. 
So, the molar mass of NaCl is: \n **22.99 g/mol + 35.45 g/mol = 58.44 g/mol.**", - # Chinese + # Simplified Chinese "在一个清晨,阳光透过窗帘缝隙洒在床单上,空气里弥漫着刚煮好的咖啡香。街道还很安静,偶尔有几只鸟儿在枝头跳跃。", "2025年,科技的发展速度令人惊叹!\n量子计算机的计算能力已达到10¹⁰次操作每秒,\n而ChatGPT模型的推理速度是传统计算机的100倍以上。\n公式E=mc²揭示了质量和能量的关系。\n今天的任务包括:\n1. 完成项目报告\n2. 参加9:00的会议\n3. 下午2:00开始的代码审查\n别忘了,创新与效率是成功的关键!", + # Traditional Chinese "2025年,科技的發展速度讓人驚訝!\n量子電腦的計算能力已達到 10¹⁰ 次操作每秒,\n而ChatGPT模型的推理速度是傳統電腦的100倍以上。\n例如,愛因斯坦的著名公式 E = mc²,\n揭示了質量和能量之間的關係。\n化學中,水的化學式 H₂O 代表著每個分子包含兩個氫原子和一個氧原子。\n今日的工作清單如下:\n1. 完成數學模型的推導:x² + 3x - 4 = 0\n2. 實驗室研究化學反應:2H₂ + O₂ → 2H₂O\n3. 進行下午3:00的會議\n每一步,都是知識積累的過程。", - # Franch + # French "Le matin, lorsque le soleil se lève lentement à l'horizon, la ville semble encore endormie. Les rues sont calmes, seules quelques personnes marchent rapidement pour commencer leur journée. L'air est frais, et les arbres, bien que dépouillés de leurs feuilles en hiver, semblent toujours veiller sur la ville. J'aime prendre un moment pour observer ce silence paisible avant que le bruit de la journée ne commence à envahir l'espace. Parfois, il suffit de quelques instants pour se reconnecter à soi-même et à l'instant présent.", # German "In der modernen Softwareentwicklung ist es wichtig, effizienten Code zu schreiben. Zum Beispiel kann ein einfacher `for`-Loop in Python wie folgt aussehen: ```python\nfor i in range(10):\n print(i)\n``` Dieser Code gibt die Zahlen von 0 bis 9 aus. Es ist auch entscheidend, den Code so zu optimieren, dass er sowohl lesbar als auch schnell ist. Ein gut strukturierter Code trägt zu einer besseren Wartbarkeit bei und reduziert die Wahrscheinlichkeit von Fehlern.", # Spanish "# Este es un ejemplo de código en Python\ndef saludar(nombre):\n print(f\"¡Hola, {nombre}!\")\n\n# Llamada a la función\nsaludar(\"Juan\")", # Arabic - "الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة." + "الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة.", + # Russian + "Привет! Как дела? Я рад познакомиться с тобой. Надеюсь, у тебя хороший день!", + # Danish + "Danmark er et smukt land med en rig kultur og historie. Det er kendt for sine maleriske landskaber, hyggelige byer og venlige mennesker. København, hovedstaden, er en moderne metropol, der samtidig bevarer sin historiske charme. Danmark har også en stærk tradition for bæredygtighed og innovation.", + # Portuguese + "Hoje está um dia lindo, perfeito para um passeio no parque.", + # Indonesian + "Selamat pagi! Apa kabar? Saya harap hari Anda menyenankan. Jika ada sesuatu yang bisa saya bantu, silakan beri tahu saya.", + # Italian + "La cucina italiana è famosa in tutto il mondo per la sua varietà e i suoi sapori deliziosi. Ogni regione ha le sue specialità uniche, ma piatti come la pasta, la pizza e il gelato sono amati da tutti. Mangiare in Italia non è solo un pasto, ma un'esperienza sociale che coinvolge amici e familiari.", + # Vietnamese + "Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?", + # Polish + "Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. 
Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!",
 ]
\ No newline at end of file

From e87c8b2cfe1f74078f9f38eb9192c507094ea0a7 Mon Sep 17 00:00:00 2001
From: CL-ModelCloud
Date: Tue, 11 Feb 2025 10:46:29 +0000
Subject: [PATCH 10/32] update save_verify() and verify() API

---
 tests/test_verify.py   |  7 +---
 tokenicer/tokenicer.py |  4 +--
 tokenicer/validate.py  | 73 +++++++++++++++++++++++++-----------------
 3 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/tests/test_verify.py b/tests/test_verify.py
index cea9996..bb2da4d 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -24,12 +24,7 @@ class TestVerify(unittest.TestCase):
     def test_verify(self):
         model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct"
         tokenicer = Tokenicer.load(model_path)
-        messages = [{"role": "user", "content": "Test Case String"}, {"role": "assistant", "content": "Test"}]
-        prompts = tokenicer.apply_chat_template(
-            messages, add_generation_prompt=False, tokenize=False
-        ).rstrip()
-
-        verify_json_path = tokenicer.save_verify(prompts=prompts)
+        verify_json_path = tokenicer.save_verify()
         result = os.path.isfile(verify_json_path)
         self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.")
 
diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py
index 9184699..92bb676 100644
--- a/tokenicer/tokenicer.py
+++ b/tokenicer/tokenicer.py
@@ -159,8 +159,8 @@ def auto_fix_model_config(self, model_config):
         model_config.eos_token = self.tokenizer.eos_token
         model_config.eos_token_id = self.tokenizer.eos_token_id
 
-    def save_verify(self, prompts: Union[str, List[str]]):
-        return _save_verify(prompts, self.tokenizer)
+    def save_verify(self, enable_chat_template: bool = True):
+        return _save_verify(tokenizer=self.tokenizer, enable_chat_template=enable_chat_template)
 
     def verify(self) -> bool:
         return _verify(self.tokenizer)
diff --git a/tokenicer/validate.py b/tokenicer/validate.py
index 663559b..f5fb6f6 100644
--- a/tokenicer/validate.py
+++ b/tokenicer/validate.py
@@ -16,10 +16,11 @@
 
 import os
 import json
-from typing import Union, List
+from transformers import PreTrainedTokenizerBase
 from .util import config_path
-from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, INPUT_KEY, TENSOR_KEY
+from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS
+from .config import VerifyData, VerifyConfig, VerifyMeta
 
 def _verify_file_exist(tokenizer):
@@ -33,7 +33,7 @@ def _verify_file_exist(tokenizer):
     return False, verify_json_path
 
 
-def _save_verify(prompts: Union[str, List[str]], tokenizer):
+def _save_verify(tokenizer: PreTrainedTokenizerBase, enable_chat_template: bool = True):
     exist, verify_json_path = _verify_file_exist(tokenizer)
     if exist:
         import logging
@@ -40,30 +40,43 @@ def _save_verify(prompts: Union[str, List[str]], tokenizer):
         logger = logging.getLogger(__name__)
         logger.warning("The verification file already exists.")
         return verify_json_path
 
-    if prompts is None:
-        raise ValueError("`prompts` cannot be None")
-
-    if not isinstance(prompts, str) and not isinstance(prompts, list):
-        raise ValueError(
-            f"Unsupported `prompts` type: Expected `str` or `List[str]`, actual = `{type(prompts)}`.")
-
-    if isinstance(prompts, str):
-        prompts = [prompts]
-
-    if len(prompts) == 0:
-        raise ValueError("len(prompts) == 0, `prompts` must be greater than 0")
+    if enable_chat_template and tokenizer.chat_template is None:
+        raise ValueError('Tokenizer does not support chat template')
+
+    prompts = []
+    if enable_chat_template:
+        for data 
in VERIFY_DATASETS: + message = [{"role": "user", "content": data}] + prompt = tokenizer.apply_chat_template( + message, add_generation_prompt=False, tokenize=False + ).rstrip() + prompts.append(prompt) + else: + prompts = VERIFY_DATASETS results = [] for prompt in prompts: tokenized = tokenizer.encode_plus(prompt, **VERIFY_ENCODE_PARAMS) - jsonl = {INPUT_KEY: prompt, TENSOR_KEY: tokenized["input_ids"].tolist()} - results.append(jsonl) + output = tokenized["input_ids"].tolist()[0] + data = VerifyData(input=prompt, output=output) + results.append(data) + + verify_dic = VerifyConfig(datasets=results).to_dict() - with open(verify_json_path, 'w') as f: - for item in results: - json.dump(item, f) - f.write('\n') + with open(verify_json_path, 'w', encoding='utf-8') as f: + json.dump(verify_dic, f, indent=4) + f.write('\n') return verify_json_path -def _verify(tokenizer) -> bool: +def _verify(tokenizer: PreTrainedTokenizerBase) -> bool: exist, verify_json_path = _verify_file_exist(tokenizer) if not exist: raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") - with open(verify_json_path, 'r', encoding='utf-8') as file: - for line in file: - json_obj = json.loads(line) + with open(verify_json_path, 'r', encoding='utf-8') as f: + data = json.loads(f.read()) + + meta_data = data['meta'] + dataset_data = data['dataset'] + + meta = VerifyMeta(validator=meta_data['validator'], url=meta_data['url']) + datasets = [ + VerifyData(input=d['input'], output=d['output'], format=d['format']) + for d in dataset_data + ] + + config = VerifyConfig(datasets=datasets, meta=meta) - input_text = json_obj[INPUT_KEY] - tensor = json_obj[TENSOR_KEY] + for verify_data in config.datasets: + input = verify_data.input + tokenized = tokenizer.encode_plus(input, **VERIFY_ENCODE_PARAMS)["input_ids"].tolist()[0] + if verify_data.output != tokenized: + return False - tokenized = tokenizer.encode_plus(input_text, **VERIFY_ENCODE_PARAMS) - if tensor != tokenized["input_ids"].tolist(): - return False return True From 082b3ebcaa1cc4afb784c0cc1fbece35b49be95c Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Tue, 11 Feb 2025 10:57:32 +0000 Subject: [PATCH 11/32] add special char to verify datasets --- tokenicer/const.py | 1 + tokenicer/util.py | 21 ++++++++++++++++++++- tokenicer/validate.py | 6 ++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index cfe7191..8d52128 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -74,4 +74,5 @@ "Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?", # Polish "Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. 
Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!", + # ] \ No newline at end of file diff --git a/tokenicer/util.py b/tokenicer/util.py index 4dfd0af..86f4c5c 100644 --- a/tokenicer/util.py +++ b/tokenicer/util.py @@ -16,6 +16,7 @@ from typing import Union, List, Optional from transformers import AutoConfig, PretrainedConfig +import string def candidate_ids(token_list: List[Union[str, int]], vocab: dict) -> List[Optional[int]]: @@ -46,4 +47,22 @@ def auto_config(path, trust_remote) -> Optional[PretrainedConfig]: model_config = None if isinstance(config, PretrainedConfig): model_config = config - return model_config \ No newline at end of file + return model_config + + +def all_special_characters(): + # Get punctuation characters + punctuation_chars = string.punctuation + + # Get whitespace characters (such as space, newline, tab, etc.) + whitespace_chars = string.whitespace + + # Common mathematical symbols and operators (manually added) + math_and_operators = "+-*/=<>%&^|!~" + + # Combine all special characters into a single string + all_special_chars = punctuation_chars + whitespace_chars + math_and_operators + + # Return the combined string + return all_special_chars + diff --git a/tokenicer/validate.py b/tokenicer/validate.py index f5fb6f6..817a745 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -17,8 +17,8 @@ import os import json from transformers import PreTrainedTokenizerBase -from .util import config_path -from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, INPUT_KEY, TENSOR_KEY, VERIFY_DATASETS +from .util import config_path, all_special_characters +from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS from .config import VerifyData, VerifyConfig, VerifyMeta def _verify_file_exist(tokenizer): @@ -44,6 +44,8 @@ def _save_verify(tokenizer: PreTrainedTokenizerBase, enable_chat_template: bool if enable_chat_template and tokenizer.chat_template is None: raise ValueError('Tokenizer does not support chat template') + VERIFY_DATASETS.append(all_special_characters()) + prompts = [] if enable_chat_template: for data in VERIFY_DATASETS: From b55ecf78462db2aee07a7a2cb1c5c4e1a82e3d6b Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 02:16:44 +0000 Subject: [PATCH 12/32] Updated VERIFY_DATASETS --- tokenicer/const.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index 8d52128..c902e12 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -74,5 +74,8 @@ "Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?", # Polish "Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!", - # + # Japanese + "今日はとても良い天気ですね。朝から青空が広がっていて、散歩に出かけるのにぴったりな日です。最近、忙しくてなかなか外に出る時間がなかったので、今日はゆっくりと自然の中でリラックスしたいと思っています。", + # Korean + "오늘은 정말 좋은 날씨네요. 아침부터 맑은 하늘이 펼쳐져 있고, 산책을 하기에 딱 좋은 날이에요. 요즘 바빠서 밖에 나갈 시간이 없었는데, 오늘은 자연 속에서 여유롭게 시간을 보내고 싶어요." 
] \ No newline at end of file From 4f90aa36b4f5ac5cb76c6f7552cf2ac734a734f0 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 02:42:50 +0000 Subject: [PATCH 13/32] code clean up --- tokenicer/const.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tokenicer/const.py b/tokenicer/const.py index c902e12..c54badb 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -38,9 +38,6 @@ VERIFY_JSON_FILE_NAME = "tokenizer_verify.jsonl" VERIFY_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} -INPUT_KEY = "input" -TENSOR_KEY = "tensor" - VERIFY_DATASETS = [ # English "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", From abcb65761cee9c17a561a6898948a60715c7d7db Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 05:40:48 +0000 Subject: [PATCH 14/32] Update Verify API --- tests/test_verify.py | 19 ++++++++-------- tokenicer/config.py | 47 +++++++++++++++++++++++++++++++++------ tokenicer/const.py | 2 +- tokenicer/tokenicer.py | 25 +++++++++++++++------ tokenicer/util.py | 6 ++++- tokenicer/validate.py | 50 +++++++++++++++++++++++------------------- 6 files changed, 102 insertions(+), 47 deletions(-) diff --git a/tests/test_verify.py b/tests/test_verify.py index bb2da4d..6d1d550 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -17,23 +17,24 @@ import os import unittest from tokenicer import Tokenicer +from tokenicer.const import VERIFY_JSON_FILE_NAME +import tempfile class TestVerify(unittest.TestCase): - def test_verify(self): + def test_save(self): model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct" tokenicer = Tokenicer.load(model_path) - verify_json_path = tokenicer.save_verify() - result = os.path.isfile(verify_json_path) - self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") - result = tokenicer.verify() - self.assertTrue(result, f"Verify file failed") - - if os.path.isfile(verify_json_path): - os.remove(verify_json_path) + with tempfile.TemporaryDirectory() as tmpdir: + tokenicer.save_pretrained(tmpdir) + verify_json_path = os.path.join(tmpdir, VERIFY_JSON_FILE_NAME) + result = os.path.isfile(verify_json_path) + self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") + result = tokenicer.verify(verify_json_path) + self.assertTrue(result, f"Verification failed") diff --git a/tokenicer/config.py b/tokenicer/config.py index 4bf7950..cad9852 100644 --- a/tokenicer/config.py +++ b/tokenicer/config.py @@ -1,10 +1,12 @@ -from typing import List +from typing import List, Optional +from dataclasses import dataclass, field +@dataclass class VerifyData: - format: str - input: str - output: List[int] + format: str = 'simple' + input: str = '' + output: List[int] = field(default_factory=list) def __init__(self, input: str, output: List[int], format: str = 'simple'): self.format = format @@ -12,6 +14,7 @@ def __init__(self, input: str, output: List[int], format: str = 'simple'): self.output = output +@dataclass class VerifyMeta: validator: str url: str @@ -21,9 +24,10 @@ def __init__(self, validator, url): self.url = url +@dataclass class VerifyConfig: - meta: VerifyMeta - datasets: List[VerifyData] + meta: Optional[VerifyMeta] = None + datasets: List[VerifyData] = field(default_factory=list) def __init__(self, datasets: List[VerifyData], meta: VerifyMeta = None): if meta is None: @@ -48,5 +52,34 @@ def to_dict(self): return { 'meta': meta_dict, - 'dataset': dataset_dict + 'datasets': dataset_dict } + + @classmethod + def 
from_dict(cls, data: dict): + try: + datasets_data = data.get('datasets') + datasets = [] + if datasets_data is not None and isinstance(datasets_data, list): + for data_item in datasets_data: + if isinstance(data_item, dict): + input = data_item.get('input') + output = data_item.get('output') + format = data_item.get('format') + if input is not None and output is not None and format is not None: + datasets.append(VerifyData(input=input, output=output, format=format)) + + meta_data = data.get('meta') + meta = None + + if meta_data is not None: + validator = meta_data.get('validator') + url = meta_data.get('url') + + if validator is not None and url is not None: + meta = VerifyMeta(validator=validator, url=url) + + return cls(datasets=datasets, meta=meta) + + except Exception as e: + return None diff --git a/tokenicer/const.py b/tokenicer/const.py index c54badb..c76fc45 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -35,7 +35,7 @@ "mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1) } -VERIFY_JSON_FILE_NAME = "tokenizer_verify.jsonl" +VERIFY_JSON_FILE_NAME = "tokenizer_verify.json" VERIFY_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} VERIFY_DATASETS = [ diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 92bb676..b5337f7 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -14,12 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import logging -from typing import Union, List, Optional +from typing import Union, List, Optional, Tuple from transformers import PreTrainedTokenizerBase, PreTrainedModel, AutoTokenizer from .util import candidate_id, config_path, auto_config from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP -from .validate import _save_verify, _verify +from .validate import _save, _verify logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -159,11 +160,21 @@ def auto_fix_model_config(self, model_config): model_config.eos_token = self.tokenizer.eos_token model_config.eos_token_id = self.tokenizer.eos_token_id - def save_verify(self, enable_chat_template: bool = True): - return _save_verify(tokenizer=self.tokenizer, enable_chat_template=enable_chat_template) - - def verify(self) -> bool: - return _verify(self.tokenizer) + def save(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True): + return _save(save_directory=save_directory, tokenizer=self.tokenizer, use_chat_template=use_chat_template) + + def verify(self, verify_file_path: Optional[Union[str, os.PathLike]] = None) -> bool: + return _verify(self.tokenizer, verify_file_path=verify_file_path) + + # Override tokenizer save_pretrained + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + use_chat_template: bool = True, + **kwargs, + ) -> Tuple[str]: + self.save(save_directory=save_directory, use_chat_template=use_chat_template) + return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs) def __getattr__(self, name): if hasattr(self.tokenizer, name): diff --git a/tokenicer/util.py b/tokenicer/util.py index 86f4c5c..596afd2 100644 --- a/tokenicer/util.py +++ b/tokenicer/util.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import string from typing import Union, List, Optional from transformers import AutoConfig, PretrainedConfig -import string def candidate_ids(token_list: List[Union[str, int]], vocab: dict) -> List[Optional[int]]: @@ -66,3 +67,6 @@ def all_special_characters(): # Return the combined string return all_special_chars + +def isfile(path): + return os.path.isfile(path) diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 817a745..ce9106c 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -17,9 +17,11 @@ import os import json from transformers import PreTrainedTokenizerBase -from .util import config_path, all_special_characters +from .util import config_path, all_special_characters, isfile from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS -from .config import VerifyData, VerifyConfig, VerifyMeta +from .config import VerifyData, VerifyConfig +from typing import Union, Optional + def _verify_file_exist(tokenizer): path = config_path(tokenizer) @@ -27,27 +29,29 @@ def _verify_file_exist(tokenizer): raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) - - if os.path.isfile(verify_json_path): - return True, verify_json_path - return False, verify_json_path + return isfile(verify_json_path), verify_json_path -def _save_verify(tokenizer: PreTrainedTokenizerBase, enable_chat_template: bool = True): - exist, verify_json_path = _verify_file_exist(tokenizer) +def _save( + save_directory: Union[str, os.PathLike], + tokenizer: PreTrainedTokenizerBase, + use_chat_template: bool = True + ): + verify_json_path = os.path.join(save_directory, VERIFY_JSON_FILE_NAME) + exist = isfile(verify_json_path) if exist: import logging logger = logging.getLogger(__name__) logger.warning("The verification file already exists.") return verify_json_path - if enable_chat_template and tokenizer.chat_template is None: + if use_chat_template and tokenizer.chat_template is None: raise ValueError('Tokenizer does not support chat template') VERIFY_DATASETS.append(all_special_characters()) prompts = [] - if enable_chat_template: + if use_chat_template: for data in VERIFY_DATASETS: message = [{"role": "user", "content": data}] prompt = tokenizer.apply_chat_template( @@ -72,24 +76,26 @@ def _save_verify(tokenizer: PreTrainedTokenizerBase, enable_chat_template: bool return verify_json_path -def _verify(tokenizer: PreTrainedTokenizerBase) -> bool: - exist, verify_json_path = _verify_file_exist(tokenizer) +def _verify(tokenizer: PreTrainedTokenizerBase, verify_file_path: Optional[Union[str, os.PathLike]] = None) -> bool: + exist = False + + if verify_file_path is not None: + exist = isfile(verify_file_path) + if not exist: - raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") + exist, verify_json_path = _verify_file_exist(tokenizer) + if not exist: + raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") + else: + verify_json_path = verify_file_path with open(verify_json_path, 'r', encoding='utf-8') as f: data = json.loads(f.read()) - meta_data = data['meta'] - dataset_data = data['dataset'] - - meta = VerifyMeta(validator=meta_data['validator'], url=meta_data['url']) - datasets = [ - VerifyData(input=d['input'], output=d['output'], format=d['format']) - for d in dataset_data - ] + config = VerifyConfig.from_dict(data) - config = VerifyConfig(datasets=datasets, meta=meta) + if 
config is None or len(config.datasets) == 0: + raise ValueError(f"Initialization verification data failed, please check {verify_json_path}") for verify_data in config.datasets: input = verify_data.input From 1ae1ffc55ffbbe01446bf3a15a43a1985abfe460 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 05:59:12 +0000 Subject: [PATCH 15/32] Add exception --- tokenicer/exception.py | 23 +++++++++++++++++++++++ tokenicer/validate.py | 17 ++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 tokenicer/exception.py diff --git a/tokenicer/exception.py b/tokenicer/exception.py new file mode 100644 index 0000000..9460d60 --- /dev/null +++ b/tokenicer/exception.py @@ -0,0 +1,23 @@ +class VerificationError(Exception): + """Base class for all exceptions related to verification""" + pass + + +class VerificationFileNotFoundError(VerificationError): + def __init__(self): + super().__init__("The verification file does not exist, please call the `save` API first.") + + +class VerificationInitializationError(VerificationError): + def __init__(self, verify_json_path: str): + super().__init__(f"Initialization verification data failed, please check {verify_json_path}.") + + +class ChatTemplateError(VerificationError): + def __init__(self): + super().__init__("Tokenizer does not support chat template.") + + +class ModelCofnfigNotFoundError(VerificationError): + def __init__(self): + super().__init__("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") \ No newline at end of file diff --git a/tokenicer/validate.py b/tokenicer/validate.py index ce9106c..85b42b3 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -17,16 +17,23 @@ import os import json from transformers import PreTrainedTokenizerBase +from typing import Union, Optional + from .util import config_path, all_special_characters, isfile from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS from .config import VerifyData, VerifyConfig -from typing import Union, Optional +from .exception import ( + VerificationFileNotFoundError, + VerificationInitializationError, + ChatTemplateError, + ModelCofnfigNotFoundError +) def _verify_file_exist(tokenizer): path = config_path(tokenizer) if path is None: - raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") + raise ModelCofnfigNotFoundError() verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) return isfile(verify_json_path), verify_json_path @@ -46,7 +53,7 @@ def _save( return verify_json_path if use_chat_template and tokenizer.chat_template is None: - raise ValueError('Tokenizer does not support chat template') + raise ChatTemplateError() VERIFY_DATASETS.append(all_special_characters()) @@ -85,7 +92,7 @@ def _verify(tokenizer: PreTrainedTokenizerBase, verify_file_path: Optional[Union if not exist: exist, verify_json_path = _verify_file_exist(tokenizer) if not exist: - raise ValueError(f"The verification file does not exist, please call the `save_verify` API first") + raise VerificationFileNotFoundError() else: verify_json_path = verify_file_path @@ -95,7 +102,7 @@ def _verify(tokenizer: PreTrainedTokenizerBase, verify_file_path: Optional[Union config = VerifyConfig.from_dict(data) if config is None or len(config.datasets) == 0: - raise ValueError(f"Initialization verification data failed, please check {verify_json_path}") + raise VerificationInitializationError(verify_json_path=verify_json_path) for verify_data in config.datasets: input = 
verify_data.input From c461346e9dbc4be7f6ef627e574bcd28f29c64b5 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 06:10:47 +0000 Subject: [PATCH 16/32] code review --- tokenicer/exception.py | 23 ----------------------- tokenicer/tokenicer.py | 6 +++++- tokenicer/validate.py | 14 ++++---------- 3 files changed, 9 insertions(+), 34 deletions(-) delete mode 100644 tokenicer/exception.py diff --git a/tokenicer/exception.py b/tokenicer/exception.py deleted file mode 100644 index 9460d60..0000000 --- a/tokenicer/exception.py +++ /dev/null @@ -1,23 +0,0 @@ -class VerificationError(Exception): - """Base class for all exceptions related to verification""" - pass - - -class VerificationFileNotFoundError(VerificationError): - def __init__(self): - super().__init__("The verification file does not exist, please call the `save` API first.") - - -class VerificationInitializationError(VerificationError): - def __init__(self, verify_json_path: str): - super().__init__(f"Initialization verification data failed, please check {verify_json_path}.") - - -class ChatTemplateError(VerificationError): - def __init__(self): - super().__init__("Tokenizer does not support chat template.") - - -class ModelCofnfigNotFoundError(VerificationError): - def __init__(self): - super().__init__("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") \ No newline at end of file diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index b5337f7..3ffd113 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -20,7 +20,7 @@ from transformers import PreTrainedTokenizerBase, PreTrainedModel, AutoTokenizer from .util import candidate_id, config_path, auto_config from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP -from .validate import _save, _verify +from .validate import _save, _verify, _verify_file_exist logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -66,6 +66,10 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] tokenicer.auto_fix_pad_token(strict=strict, pad_tokens=pad_tokens) + exist, _ = _verify_file_exist(tokenizer) + if exist: + tokenicer.verify() + return tokenicer def auto_fix_pad_token( diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 85b42b3..8c7e5ab 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -22,18 +22,12 @@ from .util import config_path, all_special_characters, isfile from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS from .config import VerifyData, VerifyConfig -from .exception import ( - VerificationFileNotFoundError, - VerificationInitializationError, - ChatTemplateError, - ModelCofnfigNotFoundError -) def _verify_file_exist(tokenizer): path = config_path(tokenizer) if path is None: - raise ModelCofnfigNotFoundError() + raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) return isfile(verify_json_path), verify_json_path @@ -53,7 +47,7 @@ def _save( return verify_json_path if use_chat_template and tokenizer.chat_template is None: - raise ChatTemplateError() + raise ValueError("Tokenizer does not support chat template.") VERIFY_DATASETS.append(all_special_characters()) @@ -92,7 +86,7 @@ def _verify(tokenizer: PreTrainedTokenizerBase, verify_file_path: Optional[Union if not exist: exist, verify_json_path = _verify_file_exist(tokenizer) if not exist: - raise VerificationFileNotFoundError() + raise 
ValueError("The verification file does not exist, please call the `save` API first.") else: verify_json_path = verify_file_path @@ -102,7 +96,7 @@ def _verify(tokenizer: PreTrainedTokenizerBase, verify_file_path: Optional[Union config = VerifyConfig.from_dict(data) if config is None or len(config.datasets) == 0: - raise VerificationInitializationError(verify_json_path=verify_json_path) + raise ValueError(f"Initialization verification data failed, please check {verify_json_path}.") for verify_data in config.datasets: input = verify_data.input From 5fed595c5679ea262701d22bb5167fe12387b5ec Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 06:27:26 +0000 Subject: [PATCH 17/32] update api --- tokenicer/tokenicer.py | 4 ++-- tokenicer/validate.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 3ffd113..5fb8d6a 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -167,8 +167,8 @@ def auto_fix_model_config(self, model_config): def save(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True): return _save(save_directory=save_directory, tokenizer=self.tokenizer, use_chat_template=use_chat_template) - def verify(self, verify_file_path: Optional[Union[str, os.PathLike]] = None) -> bool: - return _verify(self.tokenizer, verify_file_path=verify_file_path) + def verify(self, save_directory: Union[str, os.PathLike] = None) -> bool: + return _verify(self.tokenizer, save_directory=save_directory) # Override tokenizer save_pretrained def save_pretrained( diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 8c7e5ab..e124099 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -77,18 +77,17 @@ def _save( return verify_json_path -def _verify(tokenizer: PreTrainedTokenizerBase, verify_file_path: Optional[Union[str, os.PathLike]] = None) -> bool: +def _verify(tokenizer: PreTrainedTokenizerBase, save_directory: Optional[Union[str, os.PathLike]] = None) -> bool: exist = False - if verify_file_path is not None: + if save_directory is not None: + verify_file_path = os.path.join(save_directory, VERIFY_JSON_FILE_NAME) exist = isfile(verify_file_path) if not exist: exist, verify_json_path = _verify_file_exist(tokenizer) if not exist: raise ValueError("The verification file does not exist, please call the `save` API first.") - else: - verify_json_path = verify_file_path with open(verify_json_path, 'r', encoding='utf-8') as f: data = json.loads(f.read()) From 2805e4627756e934b4fe8cd52ebd10cfc0c3a964 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 06:29:01 +0000 Subject: [PATCH 18/32] update code --- tokenicer/validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tokenicer/validate.py b/tokenicer/validate.py index e124099..e1c01a8 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -81,8 +81,8 @@ def _verify(tokenizer: PreTrainedTokenizerBase, save_directory: Optional[Union[s exist = False if save_directory is not None: - verify_file_path = os.path.join(save_directory, VERIFY_JSON_FILE_NAME) - exist = isfile(verify_file_path) + verify_json_path = os.path.join(save_directory, VERIFY_JSON_FILE_NAME) + exist = isfile(verify_json_path) if not exist: exist, verify_json_path = _verify_file_exist(tokenizer) From 7dca9beec250aa922eb393cbf598d2b2d484648d Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 06:37:00 +0000 Subject: [PATCH 19/32] code clean up --- tests/test_verify.py | 2 +- 
tokenicer/tokenicer.py | 12 ++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/test_verify.py b/tests/test_verify.py index 6d1d550..b18530c 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -28,7 +28,7 @@ def test_save(self): tokenicer = Tokenicer.load(model_path) with tempfile.TemporaryDirectory() as tmpdir: - tokenicer.save_pretrained(tmpdir) + tokenicer.save(tmpdir) verify_json_path = os.path.join(tmpdir, VERIFY_JSON_FILE_NAME) result = os.path.isfile(verify_json_path) self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 5fb8d6a..f48797d 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -56,6 +56,8 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] raise ValueError( f"Unsupported `pretrained_model_name_or_path` type: Expected `str` or `PreTrainedTokenizerBase`, actual = `{type(pretrained_model_name_or_path)}`.") + + tokenicer.model_config = auto_config(path, trust_remote_code) if tokenicer.model_config is None: @@ -170,16 +172,6 @@ def save(self, save_directory: Union[str, os.PathLike], use_chat_template: bool def verify(self, save_directory: Union[str, os.PathLike] = None) -> bool: return _verify(self.tokenizer, save_directory=save_directory) - # Override tokenizer save_pretrained - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - use_chat_template: bool = True, - **kwargs, - ) -> Tuple[str]: - self.save(save_directory=save_directory, use_chat_template=use_chat_template) - return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs) - def __getattr__(self, name): if hasattr(self.tokenizer, name): return getattr(self.tokenizer, name) From 215c1598657b4f6473f1c358d7f8fcd6cee5d87b Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 06:58:02 +0000 Subject: [PATCH 20/32] code review --- tokenicer/tokenicer.py | 4 ++-- tokenicer/validate.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index f48797d..340750e 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -166,8 +166,8 @@ def auto_fix_model_config(self, model_config): model_config.eos_token = self.tokenizer.eos_token model_config.eos_token_id = self.tokenizer.eos_token_id - def save(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True): - return _save(save_directory=save_directory, tokenizer=self.tokenizer, use_chat_template=use_chat_template) + def save(self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True): + return _save(save_dir=save_dir, tokenizer=self.tokenizer, use_chat_template=use_chat_template) def verify(self, save_directory: Union[str, os.PathLike] = None) -> bool: return _verify(self.tokenizer, save_directory=save_directory) diff --git a/tokenicer/validate.py b/tokenicer/validate.py index e1c01a8..d1e8d53 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -34,11 +34,11 @@ def _verify_file_exist(tokenizer): def _save( - save_directory: Union[str, os.PathLike], + save_dir: Union[str, os.PathLike], tokenizer: PreTrainedTokenizerBase, use_chat_template: bool = True ): - verify_json_path = os.path.join(save_directory, VERIFY_JSON_FILE_NAME) + verify_json_path = os.path.join(save_dir, VERIFY_JSON_FILE_NAME) exist = isfile(verify_json_path) if exist: import logging From db56d572fbf46e5854c8e1359fb69d89f3d620e7 Mon Sep 17 00:00:00 2001 
From: CL-ModelCloud Date: Wed, 12 Feb 2025 07:06:55 +0000 Subject: [PATCH 21/32] code opt --- tests/test_verify.py | 2 +- tokenicer/tokenicer.py | 6 +++--- tokenicer/validate.py | 11 +++++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/test_verify.py b/tests/test_verify.py index b18530c..9925488 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -33,7 +33,7 @@ def test_save(self): result = os.path.isfile(verify_json_path) self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") - result = tokenicer.verify(verify_json_path) + result = tokenicer.verify(tmpdir) self.assertTrue(result, f"Verification failed") diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 340750e..41f1cb4 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -166,11 +166,11 @@ def auto_fix_model_config(self, model_config): model_config.eos_token = self.tokenizer.eos_token model_config.eos_token_id = self.tokenizer.eos_token_id - def save(self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True): + def save(self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True) -> str: return _save(save_dir=save_dir, tokenizer=self.tokenizer, use_chat_template=use_chat_template) - def verify(self, save_directory: Union[str, os.PathLike] = None) -> bool: - return _verify(self.tokenizer, save_directory=save_directory) + def verify(self, save_dir: Union[str, os.PathLike] = None) -> bool: + return _verify(self.tokenizer, save_dir=save_dir) def __getattr__(self, name): if hasattr(self.tokenizer, name): diff --git a/tokenicer/validate.py b/tokenicer/validate.py index d1e8d53..785f569 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -47,7 +47,10 @@ def _save( return verify_json_path if use_chat_template and tokenizer.chat_template is None: - raise ValueError("Tokenizer does not support chat template.") + import logging + logger = logging.getLogger(__name__) + logger.warning("Tokenizer does not support chat template.") + use_chat_template = False VERIFY_DATASETS.append(all_special_characters()) @@ -77,11 +80,11 @@ def _save( return verify_json_path -def _verify(tokenizer: PreTrainedTokenizerBase, save_directory: Optional[Union[str, os.PathLike]] = None) -> bool: +def _verify(tokenizer: PreTrainedTokenizerBase, save_dir: Optional[Union[str, os.PathLike]] = None) -> bool: exist = False - if save_directory is not None: - verify_json_path = os.path.join(save_directory, VERIFY_JSON_FILE_NAME) + if save_dir is not None: + verify_json_path = os.path.join(save_dir, VERIFY_JSON_FILE_NAME) exist = isfile(verify_json_path) if not exist: From 09db128806bc34cf74981d2347d6bf81a00685d3 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 07:30:45 +0000 Subject: [PATCH 22/32] code review --- tokenicer/tokenicer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 41f1cb4..f7f3bc4 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -56,8 +56,6 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] raise ValueError( f"Unsupported `pretrained_model_name_or_path` type: Expected `str` or `PreTrainedTokenizerBase`, actual = `{type(pretrained_model_name_or_path)}`.") - - tokenicer.model_config = auto_config(path, trust_remote_code) if tokenicer.model_config is None: @@ -69,8 +67,8 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] 
tokenicer.auto_fix_pad_token(strict=strict, pad_tokens=pad_tokens) exist, _ = _verify_file_exist(tokenizer) - if exist: - tokenicer.verify() + if exist and tokenicer.verify(): + logger.info("Tokenicer verification successful!") return tokenicer From 24c4cd5b6062ca01b39479555997ffc50f694fa7 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 07:35:35 +0000 Subject: [PATCH 23/32] fix bug --- tokenicer/validate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 785f569..8ae8e9d 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -38,6 +38,8 @@ def _save( tokenizer: PreTrainedTokenizerBase, use_chat_template: bool = True ): + os.makedirs(save_dir, exist_ok=True) + verify_json_path = os.path.join(save_dir, VERIFY_JSON_FILE_NAME) exist = isfile(verify_json_path) if exist: From 4a6379f50afb2913afb622e8e7ef4d439f8c4f03 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Wed, 12 Feb 2025 16:07:42 +0800 Subject: [PATCH 24/32] Update config.py --- tokenicer/config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tokenicer/config.py b/tokenicer/config.py index cad9852..1fcd41c 100644 --- a/tokenicer/config.py +++ b/tokenicer/config.py @@ -1,14 +1,16 @@ from typing import List, Optional from dataclasses import dataclass, field +class ValidateDataFormat(str, Enum): + "simple": SIMPLE @dataclass class VerifyData: - format: str = 'simple' - input: str = '' - output: List[int] = field(default_factory=list) + format: ValidateDataFormat = ValidateDataFormat.SIMPLE, + input: Untion[str, Any] = None + output: List[int] = field(default_factory=list) # what is default_factory? - def __init__(self, input: str, output: List[int], format: str = 'simple'): + def __init__(self, input: Union[str, Any], output: List[int], format: ValidateDataFormat = ValidateDataFormat.SIMPLE): self.format = format self.input = input self.output = output From d994a2d23c9afd342d8754585b9f09ec873d764a Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 08:54:06 +0000 Subject: [PATCH 25/32] code clean up --- tokenicer/config.py | 104 +++++++++++++++++++----------------------- tokenicer/validate.py | 14 +++--- 2 files changed, 53 insertions(+), 65 deletions(-) diff --git a/tokenicer/config.py b/tokenicer/config.py index 1fcd41c..7f4e6cf 100644 --- a/tokenicer/config.py +++ b/tokenicer/config.py @@ -1,87 +1,75 @@ -from typing import List, Optional -from dataclasses import dataclass, field +from typing import List, Optional, Union, Any, Dict +from dataclasses import dataclass +from enum import Enum + + +class ValidateDataFormat(Enum): + SIMPLE = "simple" -class ValidateDataFormat(str, Enum): - "simple": SIMPLE @dataclass -class VerifyData: - format: ValidateDataFormat = ValidateDataFormat.SIMPLE, - input: Untion[str, Any] = None - output: List[int] = field(default_factory=list) # what is default_factory? 
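Aside (illustrative, not part of this patch): `field(default_factory=list)` answers the inline question above — it gives every dataclass instance its own fresh list, since a plain mutable default would be shared across instances. PATCH 25 below drops it in favor of `None` defaults filled in `__post_init__`. A minimal standalone sketch of both patterns, with made-up names:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class WithFactory:
        output: List[int] = field(default_factory=list)  # fresh list per instance

    @dataclass
    class WithPostInit:
        output: List[int] = None  # placeholder, replaced below

        def __post_init__(self):
            if self.output is None:
                self.output = []  # equivalent effect; the style PATCH 25 settles on

    assert WithFactory().output == [] and WithPostInit().output == []
    assert WithFactory().output is not WithFactory().output  # no shared state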
+class ValidateData: + format: ValidateDataFormat = ValidateDataFormat.SIMPLE + input: Union[str, Any] = None + output: List[int] = None - def __init__(self, input: Union[str, Any], output: List[int], format: ValidateDataFormat = ValidateDataFormat.SIMPLE): - self.format = format - self.input = input - self.output = output + def __post_init__(self): + if self.input is None: + self.input = [] + + if self.output is None: + self.output = [] @dataclass -class VerifyMeta: - validator: str - url: str +class ValidateMeta: + validator: str = None + uri: str = None - def __init__(self, validator, url): - self.validator = validator - self.url = url + def __post_init__(self): + if self.validator is None: + from .version import __version__ + self.validator = f"tokenicer:{__version__}" + + if self.uri is None: + self.uri = "https://github.com/ModelCloud/Tokenicer" @dataclass -class VerifyConfig: - meta: Optional[VerifyMeta] = None - datasets: List[VerifyData] = field(default_factory=list) +class ValidateConfig: + meta: Optional[ValidateMeta] = None + data: List[ValidateData] = None - def __init__(self, datasets: List[VerifyData], meta: VerifyMeta = None): - if meta is None: - from .version import __version__ - meta = VerifyMeta(validator=f"tokenicer:{__version__}", url='https://github.com/ModelCloud/Tokenicer') - self.meta = meta - self.datasets = datasets + def __post_init__(self): + if self.meta is None: + self.meta = ValidateMeta() + + if self.data is None: + self.data = [] def to_dict(self): dataset_dict = [ { - 'format': data.format, + 'format': data.format.value, 'input': data.input, 'output': data.output, - } for data in self.datasets + } for data in self.data ] meta_dict = { 'validator': self.meta.validator, - 'url': self.meta.url + 'uri': self.meta.uri } return { 'meta': meta_dict, - 'datasets': dataset_dict + 'data': dataset_dict } @classmethod - def from_dict(cls, data: dict): - try: - datasets_data = data.get('datasets') - datasets = [] - if datasets_data is not None and isinstance(datasets_data, list): - for data_item in datasets_data: - if isinstance(data_item, dict): - input = data_item.get('input') - output = data_item.get('output') - format = data_item.get('format') - if input is not None and output is not None and format is not None: - datasets.append(VerifyData(input=input, output=output, format=format)) - - meta_data = data.get('meta') - meta = None - - if meta_data is not None: - validator = meta_data.get('validator') - url = meta_data.get('url') - - if validator is not None and url is not None: - meta = VerifyMeta(validator=validator, url=url) - - return cls(datasets=datasets, meta=meta) - - except Exception as e: - return None + def from_dict(cls, data: Dict): + meta_data = data.get("meta", {}) + data_list = data.get("data", []) + meta = ValidateMeta(**meta_data) if meta_data else None + validate_data = [ValidateData(**item) for item in data_list] + return cls(meta=meta, data=validate_data) diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 8ae8e9d..8039f2b 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -21,7 +21,7 @@ from .util import config_path, all_special_characters, isfile from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS -from .config import VerifyData, VerifyConfig +from .config import ValidateConfig, ValidateData def _verify_file_exist(tokenizer): @@ -71,13 +71,13 @@ def _save( for prompt in prompts: tokenized = tokenizer.encode_plus(prompt, **VERIFY_ENCODE_PARAMS) output = tokenized["input_ids"].tolist()[0] 
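Aside (illustrative, not part of this patch): with `return_tensors="pt"`, `encode_plus` returns a `BatchEncoding` whose `input_ids` is a 2-D tensor of shape `(1, seq_len)`, so `.tolist()[0]` strips the batch dimension and leaves a plain, JSON-serializable list of ints. A hedged sketch — the checkpoint name is only an example, not one used by the patch:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")  # any checkpoint works here
    enc = tok.encode_plus("hello world", return_tensors="pt", add_special_tokens=False)
    ids = enc["input_ids"].tolist()[0]  # flat [int, ...], ready for json.dump
    assert all(isinstance(i, int) for i in ids)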
- data = VerifyData(input=prompt, output=output) + data = ValidateData(input=prompt, output=output) results.append(data) - verify_dic = VerifyConfig(datasets=results).to_dict() + validate_dic = ValidateConfig(data=results).to_dict() with open(verify_json_path, 'w', encoding='utf-8') as f: - json.dump(verify_dic, f, indent=4) + json.dump(validate_dic, f, indent=4) f.write('\n') return verify_json_path @@ -97,12 +97,12 @@ def _verify(tokenizer: PreTrainedTokenizerBase, save_dir: Optional[Union[str, os with open(verify_json_path, 'r', encoding='utf-8') as f: data = json.loads(f.read()) - config = VerifyConfig.from_dict(data) + config = ValidateConfig.from_dict(data) - if config is None or len(config.datasets) == 0: + if config is None or len(config.data) == 0: raise ValueError(f"Initialization verification data failed, please check {verify_json_path}.") - for verify_data in config.datasets: + for verify_data in config.data: input = verify_data.input tokenized = tokenizer.encode_plus(input, **VERIFY_ENCODE_PARAMS)["input_ids"].tolist()[0] if verify_data.output != tokenized: From 3693c30900132983860fb9c1d13fc42bff7715f8 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 09:19:15 +0000 Subject: [PATCH 26/32] code opt & code clean up --- tests/{test_verify.py => test_validate.py} | 16 +++---- tokenicer/const.py | 6 +-- tokenicer/tokenicer.py | 12 +++--- tokenicer/validate.py | 50 +++++++++++----------- 4 files changed, 42 insertions(+), 42 deletions(-) rename tests/{test_verify.py => test_validate.py} (67%) diff --git a/tests/test_verify.py b/tests/test_validate.py similarity index 67% rename from tests/test_verify.py rename to tests/test_validate.py index 9925488..5dbbe09 100644 --- a/tests/test_verify.py +++ b/tests/test_validate.py @@ -17,24 +17,24 @@ import os import unittest from tokenicer import Tokenicer -from tokenicer.const import VERIFY_JSON_FILE_NAME +from tokenicer.const import VALIDATE_JSON_FILE_NAME import tempfile -class TestVerify(unittest.TestCase): +class TestValidate(unittest.TestCase): - def test_save(self): + def test_validate(self): model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct" tokenicer = Tokenicer.load(model_path) with tempfile.TemporaryDirectory() as tmpdir: tokenicer.save(tmpdir) - verify_json_path = os.path.join(tmpdir, VERIFY_JSON_FILE_NAME) - result = os.path.isfile(verify_json_path) - self.assertTrue(result, f"Save verify file failed: {verify_json_path} does not exist.") + validate_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME) + result = os.path.isfile(validate_json_path) + self.assertTrue(result, f"Save validate file failed: {validate_json_path} does not exist.") - result = tokenicer.verify(tmpdir) - self.assertTrue(result, f"Verification failed") + result = tokenicer.validate(tmpdir) + self.assertTrue(result, f"Validate failed") diff --git a/tokenicer/const.py b/tokenicer/const.py index c76fc45..2aef2f0 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -35,10 +35,10 @@ "mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1) } -VERIFY_JSON_FILE_NAME = "tokenizer_verify.json" -VERIFY_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} +VALIDATE_JSON_FILE_NAME = "tokenizer_validate.json" +VALIDATE_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} -VERIFY_DATASETS = [ +VALIDATE_DATASETS = [ # English "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' 
A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index f7f3bc4..4f7b2fe 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -20,7 +20,7 @@ from transformers import PreTrainedTokenizerBase, PreTrainedModel, AutoTokenizer from .util import candidate_id, config_path, auto_config from .const import DEFAULT_PAD_TOKENS, MODEL_PAD_TOKEN_MAP -from .validate import _save, _verify, _verify_file_exist +from .validate import _save, _validate, _validate_file_exist logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -66,9 +66,9 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] tokenicer.auto_fix_pad_token(strict=strict, pad_tokens=pad_tokens) - exist, _ = _verify_file_exist(tokenizer) - if exist and tokenicer.verify(): - logger.info("Tokenicer verification successful!") + exist, _ = _validate_file_exist(tokenizer) + if exist and tokenicer.validate(): + logger.info("Tokenicer validate successful!") return tokenicer @@ -167,8 +167,8 @@ def auto_fix_model_config(self, model_config): def save(self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True) -> str: return _save(save_dir=save_dir, tokenizer=self.tokenizer, use_chat_template=use_chat_template) - def verify(self, save_dir: Union[str, os.PathLike] = None) -> bool: - return _verify(self.tokenizer, save_dir=save_dir) + def validate(self, save_dir: Union[str, os.PathLike] = None) -> bool: + return _validate(self.tokenizer, save_dir=save_dir) def __getattr__(self, name): if hasattr(self.tokenizer, name): diff --git a/tokenicer/validate.py b/tokenicer/validate.py index 8039f2b..05e2ff6 100644 --- a/tokenicer/validate.py +++ b/tokenicer/validate.py @@ -20,17 +20,17 @@ from typing import Union, Optional from .util import config_path, all_special_characters, isfile -from .const import VERIFY_JSON_FILE_NAME, VERIFY_ENCODE_PARAMS, VERIFY_DATASETS +from .const import VALIDATE_JSON_FILE_NAME, VALIDATE_ENCODE_PARAMS, VALIDATE_DATASETS from .config import ValidateConfig, ValidateData -def _verify_file_exist(tokenizer): +def _validate_file_exist(tokenizer): path = config_path(tokenizer) if path is None: raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.") - verify_json_path = os.path.join(path, VERIFY_JSON_FILE_NAME) - return isfile(verify_json_path), verify_json_path + validate_json_path = os.path.join(path, VALIDATE_JSON_FILE_NAME) + return isfile(validate_json_path), validate_json_path def _save( @@ -40,13 +40,13 @@ def _save( ): os.makedirs(save_dir, exist_ok=True) - verify_json_path = os.path.join(save_dir, VERIFY_JSON_FILE_NAME) - exist = isfile(verify_json_path) + validate_json_path = os.path.join(save_dir, VALIDATE_JSON_FILE_NAME) + exist = isfile(validate_json_path) if exist: import logging logger = logging.getLogger(__name__) - logger.warning("The verification file already exists.") - return verify_json_path + logger.warning(f"Validate file:{validate_json_path} already exists.") + return validate_json_path if use_chat_template and tokenizer.chat_template is None: import logging @@ -54,58 +54,58 @@ def _save( logger.warning("Tokenizer does not support chat template.") use_chat_template = False - VERIFY_DATASETS.append(all_special_characters()) + VALIDATE_DATASETS.append(all_special_characters()) 
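Aside (illustrative, not part of this patch): because of the warning-and-fallback above, tokenizers without a chat template are validated on the raw dataset strings, while chat models get each string wrapped in their own turn markers first. A hedged sketch of what the loop below produces per dataset entry — the checkpoint name is only an example:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # example chat model
    message = [{"role": "user", "content": "ping"}]
    prompt = tok.apply_chat_template(
        message, add_generation_prompt=False, tokenize=False
    ).rstrip()  # .rstrip() drops the trailing newline some templates emit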
prompts = [] if use_chat_template: - for data in VERIFY_DATASETS: + for data in VALIDATE_DATASETS: message = [{"role": "user", "content": data}] prompt = tokenizer.apply_chat_template( message, add_generation_prompt=False, tokenize=False ).rstrip() prompts.append(prompt) else: - prompts = VERIFY_DATASETS + prompts = VALIDATE_DATASETS results = [] for prompt in prompts: - tokenized = tokenizer.encode_plus(prompt, **VERIFY_ENCODE_PARAMS) + tokenized = tokenizer.encode_plus(prompt, **VALIDATE_ENCODE_PARAMS) output = tokenized["input_ids"].tolist()[0] data = ValidateData(input=prompt, output=output) results.append(data) validate_dic = ValidateConfig(data=results).to_dict() - with open(verify_json_path, 'w', encoding='utf-8') as f: + with open(validate_json_path, 'w', encoding='utf-8') as f: json.dump(validate_dic, f, indent=4) f.write('\n') - return verify_json_path + return validate_json_path -def _verify(tokenizer: PreTrainedTokenizerBase, save_dir: Optional[Union[str, os.PathLike]] = None) -> bool: +def _validate(tokenizer: PreTrainedTokenizerBase, save_dir: Optional[Union[str, os.PathLike]] = None) -> bool: exist = False if save_dir is not None: - verify_json_path = os.path.join(save_dir, VERIFY_JSON_FILE_NAME) - exist = isfile(verify_json_path) + validate_json_path = os.path.join(save_dir, VALIDATE_JSON_FILE_NAME) + exist = isfile(validate_json_path) if not exist: - exist, verify_json_path = _verify_file_exist(tokenizer) + exist, validate_json_path = _validate_file_exist(tokenizer) if not exist: - raise ValueError("The verification file does not exist, please call the `save` API first.") + raise ValueError("Validate file does not exist, please call the `save()` API first.") - with open(verify_json_path, 'r', encoding='utf-8') as f: + with open(validate_json_path, 'r', encoding='utf-8') as f: data = json.loads(f.read()) config = ValidateConfig.from_dict(data) if config is None or len(config.data) == 0: - raise ValueError(f"Initialization verification data failed, please check {verify_json_path}.") + raise ValueError(f"Init validate data failed, please check {validate_json_path}.") - for verify_data in config.data: - input = verify_data.input - tokenized = tokenizer.encode_plus(input, **VERIFY_ENCODE_PARAMS)["input_ids"].tolist()[0] - if verify_data.output != tokenized: + for data in config.data: + input = data.input + tokenized = tokenizer.encode_plus(input, **VALIDATE_ENCODE_PARAMS)["input_ids"].tolist()[0] + if data.output != tokenized: return False return True From a0cea36b20ac13a6bb4bdb8a69ed92b4b0b6a6a3 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 09:33:23 +0000 Subject: [PATCH 27/32] add copyright --- tokenicer/config.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tokenicer/config.py b/tokenicer/config.py index 7f4e6cf..b147a1f 100644 --- a/tokenicer/config.py +++ b/tokenicer/config.py @@ -1,3 +1,19 @@ +# Copyright 2025 ModelCloud.ai +# Copyright 2025 qubitium@modelcloud.ai +# Contact: qubitium@modelcloud.ai, x.com/qubitium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List, Optional, Union, Any, Dict from dataclasses import dataclass from enum import Enum From 5cc320115a46db3355925273c0cbfabbc89d138b Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 09:34:37 +0000 Subject: [PATCH 28/32] code opt --- tests/test_validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 5dbbe09..a59c899 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -33,8 +33,8 @@ def test_validate(self): result = os.path.isfile(validate_json_path) self.assertTrue(result, f"Save validate file failed: {validate_json_path} does not exist.") - result = tokenicer.validate(tmpdir) - self.assertTrue(result, f"Validate failed") + validate = tokenicer.validate(tmpdir) + self.assertTrue(validate, f"Expected validate='True' but got '{validate}'.") From 20d18d78b690561f6f2ab382856eafbdb971d5eb Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 10:10:28 +0000 Subject: [PATCH 29/32] fix bug --- tokenicer/tokenicer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 4f7b2fe..f87159b 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -170,6 +170,10 @@ def save(self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True def validate(self, save_dir: Union[str, os.PathLike] = None) -> bool: return _validate(self.tokenizer, save_dir=save_dir) + def save_pretrained(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True, **kwargs,): + self.save(save_dir=save_directory, use_chat_template=use_chat_template) + return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs) + def __getattr__(self, name): if hasattr(self.tokenizer, name): return getattr(self.tokenizer, name) From 3e8f5ef0865af51fbe9806578edb9905c7c30189 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Wed, 12 Feb 2025 10:11:08 +0000 Subject: [PATCH 30/32] code review --- tokenicer/tokenicer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index f87159b..44a588a 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -170,7 +170,7 @@ def save(self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True def validate(self, save_dir: Union[str, os.PathLike] = None) -> bool: return _validate(self.tokenizer, save_dir=save_dir) - def save_pretrained(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True, **kwargs,): + def save_pretrained(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True, **kwargs,) -> Tuple[str]: self.save(save_dir=save_directory, use_chat_template=use_chat_template) return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs) From 8e86b182474c9e60338a5fce4caa86e44ec6fcf7 Mon Sep 17 00:00:00 2001 From: CL-ModelCloud Date: Thu, 13 Feb 2025 05:15:29 +0000 Subject: [PATCH 31/32] ruff check --- tests/test_model_config.py | 7 +-- tests/test_pad_token.py | 44 ++++++++------ tests/test_tokenicer_forward.py | 38 ++++++------ tests/test_validate.py | 11 ++-- tokenicer/config.py | 22 +++---- tokenicer/const.py | 96 +++++++++++++++--------------- tokenicer/tokenicer.py | 102 +++++++++++++++++++++++--------- tokenicer/util.py | 4 +- tokenicer/validate.py | 37 ++++++++---- 9 files changed, 211 insertions(+), 150 
deletions(-) diff --git a/tests/test_model_config.py b/tests/test_model_config.py index 1f0d788..a41e541 100644 --- a/tests/test_model_config.py +++ b/tests/test_model_config.py @@ -19,7 +19,6 @@ class TestModelConfig(unittest.TestCase): - def test_model_config(self): model_path = "/monster/data/model/mpt-7b-instruct" tokenicer = Tokenicer.load(model_path) @@ -30,11 +29,11 @@ def test_model_config(self): self.assertEqual( tokenicer.model_config.bos_token_id, expect_bos_token_id, - msg=f"Expected bos_token_id: `{expect_bos_token_id}`, actual=`{tokenicer.model_config.bos_token_id}`." + msg=f"Expected bos_token_id: `{expect_bos_token_id}`, actual=`{tokenicer.model_config.bos_token_id}`.", ) self.assertEqual( tokenicer.model_config.eos_token_id, expect_eos_token_id, - msg=f"Expected eos_token_id: `{expect_eos_token_id}`, actual=`{tokenicer.model_config.eos_token_id}`." - ) \ No newline at end of file + msg=f"Expected eos_token_id: `{expect_eos_token_id}`, actual=`{tokenicer.model_config.eos_token_id}`.", + ) diff --git a/tests/test_pad_token.py b/tests/test_pad_token.py index 1b4c6d1..ea007f5 100644 --- a/tests/test_pad_token.py +++ b/tests/test_pad_token.py @@ -25,24 +25,32 @@ class TestPadToken(unittest.TestCase): @parameterized.expand( [ - ('/monster/data/model/Llama-3.2-1B-Instruct', '<|reserved_special_token_0|>', ['<|reserved_special_token_0|>']), - ('/monster/data/model/Phi-3-mini-4k-instruct', ''), - ('/monster/data/model/Llama-3.2-1B-Instruct', '<|finetune_right_pad_id|>'), - ('/monster/data/model/Qwen2.5-0.5B-Instruct', '<|fim_pad|>'), - ('/monster/data/model/Qwen2-VL-2B-Instruct', '<|vision_pad|>'), - ('/monster/data/model/gemma-2-9b', ''), - ('/monster/data/model/Hymba-1.5B-Instruct', '', None, True), - ('/monster/data/model/Mistral-7B-Instruct-v0.2', ''), - ('/monster/data/model/Yi-Coder-1.5B-Chat', ''), - (AutoTokenizer.from_pretrained('/monster/data/model/glm-4-9b-chat-hf'), '<|endoftext|>') + ( + "/monster/data/model/Llama-3.2-1B-Instruct", + "<|reserved_special_token_0|>", + ["<|reserved_special_token_0|>"], + ), + ("/monster/data/model/Phi-3-mini-4k-instruct", ""), + ("/monster/data/model/Llama-3.2-1B-Instruct", "<|finetune_right_pad_id|>"), + ("/monster/data/model/Qwen2.5-0.5B-Instruct", "<|fim_pad|>"), + ("/monster/data/model/Qwen2-VL-2B-Instruct", "<|vision_pad|>"), + ("/monster/data/model/gemma-2-9b", ""), + ("/monster/data/model/Hymba-1.5B-Instruct", "", None, True), + ("/monster/data/model/Mistral-7B-Instruct-v0.2", ""), + ("/monster/data/model/Yi-Coder-1.5B-Chat", ""), + ( + AutoTokenizer.from_pretrained("/monster/data/model/glm-4-9b-chat-hf"), + "<|endoftext|>", + ), ] ) - def test_pad_token(self, - tokenizer_or_path: str, - expect_pad_token: str, - pad_tokens: Optional[List[Union[str, int]]] = None, - trust_remote: bool = False - ): + def test_pad_token( + self, + tokenizer_or_path: str, + expect_pad_token: str, + pad_tokens: Optional[List[Union[str, int]]] = None, + trust_remote: bool = False, + ): tokenicer = Tokenicer.load(tokenizer_or_path, trust_remote_code=trust_remote) if pad_tokens is not None: @@ -51,5 +59,5 @@ def test_pad_token(self, self.assertEqual( tokenicer.tokenizer.pad_token, expect_pad_token, - msg=f"Expected pad_token: `{expect_pad_token}`, actual=`{tokenicer.tokenizer.pad_token}`." 
- ) \ No newline at end of file + msg=f"Expected pad_token: `{expect_pad_token}`, actual=`{tokenicer.tokenizer.pad_token}`.", + ) diff --git a/tests/test_tokenicer_forward.py b/tests/test_tokenicer_forward.py index 39c34a7..6bbb455 100644 --- a/tests/test_tokenicer_forward.py +++ b/tests/test_tokenicer_forward.py @@ -18,56 +18,56 @@ from parameterized import parameterized import unittest -class TestTokenicer(unittest.TestCase): +class TestTokenicer(unittest.TestCase): @classmethod def setUpClass(self): self.pretrained_model_id = "/monster/data/model/Qwen2.5-0.5B-Instruct/" self.tokenizer = Tokenicer.load(self.pretrained_model_id) - self.example = 'Test Case String' + self.example = "Test Case String" self.expect_input_ids = [2271, 11538, 923] def test_tokenicer_func(self): - input_ids = self.tokenizer(self.example)['input_ids'] + input_ids = self.tokenizer(self.example)["input_ids"] self.assertEqual( input_ids, self.expect_input_ids, - msg=f"Expected input_ids=`{self.expect_input_ids}`, actual=`{input_ids}`." + msg=f"Expected input_ids=`{self.expect_input_ids}`, actual=`{input_ids}`.", ) @parameterized.expand( [ - ('eos_token', "<|im_end|>"), - ('pad_token', "<|fim_pad|>"), - ('vocab_size', 151643) + ("eos_token", "<|im_end|>"), + ("pad_token", "<|fim_pad|>"), + ("vocab_size", 151643), ] ) def test_tokenicer_property(self, property, expect_token): - if property == 'eos_token': + if property == "eos_token": result = self.tokenizer.eos_token - elif property == 'pad_token': + elif property == "pad_token": result = self.tokenizer.pad_token - elif property == 'vocab_size': + elif property == "vocab_size": result = self.tokenizer.vocab_size self.assertEqual( result, expect_token, - msg=f"Expected {property}: `{expect_token}`, actual=`{result}`." + msg=f"Expected {property}: `{expect_token}`, actual=`{result}`.", ) def test_tokenicer_encode(self): - input_ids = self.tokenizer.encode(self.example, add_special_tokens=False) - self.assertEqual( - input_ids, - self.expect_input_ids, - msg=f"Expected input_ids: `{self.expect_input_ids}`, actual=`{input_ids}`." - ) + input_ids = self.tokenizer.encode(self.example, add_special_tokens=False) + self.assertEqual( + input_ids, + self.expect_input_ids, + msg=f"Expected input_ids: `{self.expect_input_ids}`, actual=`{input_ids}`.", + ) def test_tokenicer_decode(self): example = self.tokenizer.decode(self.expect_input_ids, skip_special_tokens=True) self.assertEqual( self.example, example, - msg=f"Expected example: `{self.example}`, actual=`{example}`." 
- ) \ No newline at end of file + msg=f"Expected example: `{self.example}`, actual=`{example}`.", + ) diff --git a/tests/test_validate.py b/tests/test_validate.py index a59c899..29336ee 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -22,7 +22,6 @@ class TestValidate(unittest.TestCase): - def test_validate(self): model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct" tokenicer = Tokenicer.load(model_path) @@ -31,12 +30,10 @@ def test_validate(self): tokenicer.save(tmpdir) validate_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME) result = os.path.isfile(validate_json_path) - self.assertTrue(result, f"Save validate file failed: {validate_json_path} does not exist.") + self.assertTrue( + result, + f"Save validate file failed: {validate_json_path} does not exist.", + ) validate = tokenicer.validate(tmpdir) self.assertTrue(validate, f"Expected validate='True' but got '{validate}'.") - - - - - diff --git a/tokenicer/config.py b/tokenicer/config.py index b147a1f..c0336d1 100644 --- a/tokenicer/config.py +++ b/tokenicer/config.py @@ -45,6 +45,7 @@ class ValidateMeta: def __post_init__(self): if self.validator is None: from .version import __version__ + self.validator = f"tokenicer:{__version__}" if self.uri is None: @@ -58,7 +59,7 @@ class ValidateConfig: def __post_init__(self): if self.meta is None: - self.meta = ValidateMeta() + self.meta = ValidateMeta() if self.data is None: self.data = [] @@ -66,21 +67,16 @@ def __post_init__(self): def to_dict(self): dataset_dict = [ { - 'format': data.format.value, - 'input': data.input, - 'output': data.output, - } for data in self.data + "format": data.format.value, + "input": data.input, + "output": data.output, + } + for data in self.data ] - meta_dict = { - 'validator': self.meta.validator, - 'uri': self.meta.uri - } + meta_dict = {"validator": self.meta.validator, "uri": self.meta.uri} - return { - 'meta': meta_dict, - 'data': dataset_dict - } + return {"meta": meta_dict, "data": dataset_dict} @classmethod def from_dict(cls, data: Dict): diff --git a/tokenicer/const.py b/tokenicer/const.py index 2aef2f0..51403eb 100644 --- a/tokenicer/const.py +++ b/tokenicer/const.py @@ -17,62 +17,62 @@ from collections import namedtuple DEFAULT_PAD_TOKENS = [ - "<|finetune_right_pad_id|>", - "<|pad|>", - "", - "<|unk|>", - "" + "<|finetune_right_pad_id|>", + "<|pad|>", + "", + "<|unk|>", + "", ] TOKEN_TUPLE = namedtuple("TokenTuple", ["token", "token_id"]) MODEL_PAD_TOKEN_MAP = { - "llama": TOKEN_TUPLE(token='<|finetune_right_pad_id|>', token_id=128004), - "qwen2_5_vl": TOKEN_TUPLE(token='<|vision_pad|>', token_id=151654), - "qwen2_vl": TOKEN_TUPLE(token='<|vision_pad|>', token_id=151654), - "qwen2": TOKEN_TUPLE(token='<|fim_pad|>', token_id=151662), - "deepseek_v3": TOKEN_TUPLE(token='<|▁pad▁|>', token_id=2), - "mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1) + "llama": TOKEN_TUPLE(token="<|finetune_right_pad_id|>", token_id=128004), + "qwen2_5_vl": TOKEN_TUPLE(token="<|vision_pad|>", token_id=151654), + "qwen2_vl": TOKEN_TUPLE(token="<|vision_pad|>", token_id=151654), + "qwen2": TOKEN_TUPLE(token="<|fim_pad|>", token_id=151662), + "deepseek_v3": TOKEN_TUPLE(token="<|▁pad▁|>", token_id=2), + "mpt": TOKEN_TUPLE(token="<|padding|>", token_id=1), } VALIDATE_JSON_FILE_NAME = "tokenizer_validate.json" VALIDATE_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False} VALIDATE_DATASETS = [ - # English - "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", - "Certainly! 
A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", - "Let's do it:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).", - "Let's break it down:\n\n1. **Balancing the chemical equation:** The unbalanced equation is: \n H₂ + O₂ → H₂O. To balance it, we need 2 molecules of H₂ and 1 molecule of O₂ to form 2 molecules of H₂O: \n **2H₂ + O₂ → 2H₂O.**\n\n2. **Area of a circle:** The formula for the area of a circle is \( A = \pi r^2 \). With a radius of 5.7 cm, the area is approximately: \n \( A = 3.14159 \times (5.7)^2 = 102.041 \, \text{cm}^2.\)\n\n3. **Molar mass of NaCl:** Sodium chloride (NaCl) consists of one sodium (Na) atom and one chlorine (Cl) atom. The atomic masses are approximately: \n Na = 22.99 g/mol, Cl = 35.45 g/mol. So, the molar mass of NaCl is: \n **22.99 g/mol + 35.45 g/mol = 58.44 g/mol.**", - # Simplified Chinese - "在一个清晨,阳光透过窗帘缝隙洒在床单上,空气里弥漫着刚煮好的咖啡香。街道还很安静,偶尔有几只鸟儿在枝头跳跃。", - "2025年,科技的发展速度令人惊叹!\n量子计算机的计算能力已达到10¹⁰次操作每秒,\n而ChatGPT模型的推理速度是传统计算机的100倍以上。\n公式E=mc²揭示了质量和能量的关系。\n今天的任务包括:\n1. 完成项目报告\n2. 参加9:00的会议\n3. 下午2:00开始的代码审查\n别忘了,创新与效率是成功的关键!", - # Traditional Chinese - "2025年,科技的發展速度讓人驚訝!\n量子電腦的計算能力已達到 10¹⁰ 次操作每秒,\n而ChatGPT模型的推理速度是傳統電腦的100倍以上。\n例如,愛因斯坦的著名公式 E = mc²,\n揭示了質量和能量之間的關係。\n化學中,水的化學式 H₂O 代表著每個分子包含兩個氫原子和一個氧原子。\n今日的工作清單如下:\n1. 完成數學模型的推導:x² + 3x - 4 = 0\n2. 實驗室研究化學反應:2H₂ + O₂ → 2H₂O\n3. 進行下午3:00的會議\n每一步,都是知識積累的過程。", - # French - "Le matin, lorsque le soleil se lève lentement à l'horizon, la ville semble encore endormie. Les rues sont calmes, seules quelques personnes marchent rapidement pour commencer leur journée. L'air est frais, et les arbres, bien que dépouillés de leurs feuilles en hiver, semblent toujours veiller sur la ville. J'aime prendre un moment pour observer ce silence paisible avant que le bruit de la journée ne commence à envahir l'espace. Parfois, il suffit de quelques instants pour se reconnecter à soi-même et à l'instant présent.", - # German - "In der modernen Softwareentwicklung ist es wichtig, effizienten Code zu schreiben. Zum Beispiel kann ein einfacher `for`-Loop in Python wie folgt aussehen: ```python\nfor i in range(10):\n print(i)\n``` Dieser Code gibt die Zahlen von 0 bis 9 aus. Es ist auch entscheidend, den Code so zu optimieren, dass er sowohl lesbar als auch schnell ist. Ein gut strukturierter Code trägt zu einer besseren Wartbarkeit bei und reduziert die Wahrscheinlichkeit von Fehlern.", - # Spanish - "# Este es un ejemplo de código en Python\ndef saludar(nombre):\n print(f\"¡Hola, {nombre}!\")\n\n# Llamada a la función\nsaludar(\"Juan\")", - # Arabic - "الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة.", - # Russian - "Привет! Как дела? Я рад познакомиться с тобой. Надеюсь, у тебя хороший день!", - # Danish - "Danmark er et smukt land med en rig kultur og historie. Det er kendt for sine maleriske landskaber, hyggelige byer og venlige mennesker. 
København, hovedstaden, er en moderne metropol, der samtidig bevarer sin historiske charme. Danmark har også en stærk tradition for bæredygtighed og innovation.", - # Portuguese - "Hoje está um dia lindo, perfeito para um passeio no parque.", - # Indonesian - "Selamat pagi! Apa kabar? Saya harap hari Anda menyenankan. Jika ada sesuatu yang bisa saya bantu, silakan beri tahu saya.", - # Italian - "La cucina italiana è famosa in tutto il mondo per la sua varietà e i suoi sapori deliziosi. Ogni regione ha le sue specialità uniche, ma piatti come la pasta, la pizza e il gelato sono amati da tutti. Mangiare in Italia non è solo un pasto, ma un'esperienza sociale che coinvolge amici e familiari.", - # Vietnamese - "Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?", - # Polish - "Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!", - # Japanese - "今日はとても良い天気ですね。朝から青空が広がっていて、散歩に出かけるのにぴったりな日です。最近、忙しくてなかなか外に出る時間がなかったので、今日はゆっくりと自然の中でリラックスしたいと思っています。", - # Korean - "오늘은 정말 좋은 날씨네요. 아침부터 맑은 하늘이 펼쳐져 있고, 산책을 하기에 딱 좋은 날이에요. 요즘 바빠서 밖에 나갈 시간이 없었는데, 오늘은 자연 속에서 여유롭게 시간을 보내고 싶어요." -] \ No newline at end of file + # English + "Sure! I'd be happy to help. What kind of writing prompt are you looking for?", + "Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'", + "Let's do it:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).", + "Let's break it down:\n\n1. **Balancing the chemical equation:** The unbalanced equation is: \n H₂ + O₂ → H₂O. To balance it, we need 2 molecules of H₂ and 1 molecule of O₂ to form 2 molecules of H₂O: \n **2H₂ + O₂ → 2H₂O.**\n\n2. **Area of a circle:** The formula for the area of a circle is \( A = \pi r^2 \). With a radius of 5.7 cm, the area is approximately: \n \( A = 3.14159 \times (5.7)^2 = 102.041 \, \text{cm}^2.\)\n\n3. **Molar mass of NaCl:** Sodium chloride (NaCl) consists of one sodium (Na) atom and one chlorine (Cl) atom. The atomic masses are approximately: \n Na = 22.99 g/mol, Cl = 35.45 g/mol. So, the molar mass of NaCl is: \n **22.99 g/mol + 35.45 g/mol = 58.44 g/mol.**", + # Simplified Chinese + "在一个清晨,阳光透过窗帘缝隙洒在床单上,空气里弥漫着刚煮好的咖啡香。街道还很安静,偶尔有几只鸟儿在枝头跳跃。", + "2025年,科技的发展速度令人惊叹!\n量子计算机的计算能力已达到10¹⁰次操作每秒,\n而ChatGPT模型的推理速度是传统计算机的100倍以上。\n公式E=mc²揭示了质量和能量的关系。\n今天的任务包括:\n1. 完成项目报告\n2. 参加9:00的会议\n3. 下午2:00开始的代码审查\n别忘了,创新与效率是成功的关键!", + # Traditional Chinese + "2025年,科技的發展速度讓人驚訝!\n量子電腦的計算能力已達到 10¹⁰ 次操作每秒,\n而ChatGPT模型的推理速度是傳統電腦的100倍以上。\n例如,愛因斯坦的著名公式 E = mc²,\n揭示了質量和能量之間的關係。\n化學中,水的化學式 H₂O 代表著每個分子包含兩個氫原子和一個氧原子。\n今日的工作清單如下:\n1. 完成數學模型的推導:x² + 3x - 4 = 0\n2. 實驗室研究化學反應:2H₂ + O₂ → 2H₂O\n3. 進行下午3:00的會議\n每一步,都是知識積累的過程。", + # French + "Le matin, lorsque le soleil se lève lentement à l'horizon, la ville semble encore endormie. Les rues sont calmes, seules quelques personnes marchent rapidement pour commencer leur journée. L'air est frais, et les arbres, bien que dépouillés de leurs feuilles en hiver, semblent toujours veiller sur la ville. 
J'aime prendre un moment pour observer ce silence paisible avant que le bruit de la journée ne commence à envahir l'espace. Parfois, il suffit de quelques instants pour se reconnecter à soi-même et à l'instant présent.", + # German + "In der modernen Softwareentwicklung ist es wichtig, effizienten Code zu schreiben. Zum Beispiel kann ein einfacher `for`-Loop in Python wie folgt aussehen: ```python\nfor i in range(10):\n print(i)\n``` Dieser Code gibt die Zahlen von 0 bis 9 aus. Es ist auch entscheidend, den Code so zu optimieren, dass er sowohl lesbar als auch schnell ist. Ein gut strukturierter Code trägt zu einer besseren Wartbarkeit bei und reduziert die Wahrscheinlichkeit von Fehlern.", + # Spanish + '# Este es un ejemplo de código en Python\ndef saludar(nombre):\n print(f"¡Hola, {nombre}!")\n\n# Llamada a la función\nsaludar("Juan")', + # Arabic + "الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة.", + # Russian + "Привет! Как дела? Я рад познакомиться с тобой. Надеюсь, у тебя хороший день!", + # Danish + "Danmark er et smukt land med en rig kultur og historie. Det er kendt for sine maleriske landskaber, hyggelige byer og venlige mennesker. København, hovedstaden, er en moderne metropol, der samtidig bevarer sin historiske charme. Danmark har også en stærk tradition for bæredygtighed og innovation.", + # Portuguese + "Hoje está um dia lindo, perfeito para um passeio no parque.", + # Indonesian + "Selamat pagi! Apa kabar? Saya harap hari Anda menyenankan. Jika ada sesuatu yang bisa saya bantu, silakan beri tahu saya.", + # Italian + "La cucina italiana è famosa in tutto il mondo per la sua varietà e i suoi sapori deliziosi. Ogni regione ha le sue specialità uniche, ma piatti come la pasta, la pizza e il gelato sono amati da tutti. Mangiare in Italia non è solo un pasto, ma un'esperienza sociale che coinvolge amici e familiari.", + # Vietnamese + "Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?", + # Polish + "Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!", + # Japanese + "今日はとても良い天気ですね。朝から青空が広がっていて、散歩に出かけるのにぴったりな日です。最近、忙しくてなかなか外に出る時間がなかったので、今日はゆっくりと自然の中でリラックスしたいと思っています。", + # Korean + "오늘은 정말 좋은 날씨네요. 아침부터 맑은 하늘이 펼쳐져 있고, 산책을 하기에 딱 좋은 날이에요. 
요즘 바빠서 밖에 나갈 시간이 없었는데, 오늘은 자연 속에서 여유롭게 시간을 보내고 싶어요.", +] diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py index 44a588a..02cdfff 100644 --- a/tokenicer/tokenicer.py +++ b/tokenicer/tokenicer.py @@ -31,11 +31,17 @@ class Tokenicer: model_config = None @classmethod - def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], strict: bool = False, pad_tokens: Optional[List[Union[str, int]]] = None, **kwargs): + def load( + cls, + pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], + strict: bool = False, + pad_tokens: Optional[List[Union[str, int]]] = None, + **kwargs, + ): if pretrained_model_name_or_path is None: raise ValueError("`pretrained_model_name_or_path` cannot be `None`.") - trust_remote_code = kwargs.get('trust_remote_code', False) + trust_remote_code = kwargs.get("trust_remote_code", False) tokenicer = cls() @@ -45,23 +51,27 @@ def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase] tokenicer.tokenizer = tokenizer path = config_path(tokenizer) elif isinstance(pretrained_model_name_or_path, str): - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, **kwargs + ) if isinstance(tokenizer, PreTrainedTokenizerBase): tokenicer.tokenizer = tokenizer path = pretrained_model_name_or_path else: ValueError( - f"Failed to initialize `tokenizer`: please ensure that the `pretrained_model_name_or_path` parameter is set correctly.") + "Failed to initialize `tokenizer`: please ensure that the `pretrained_model_name_or_path` parameter is set correctly." + ) else: raise ValueError( - f"Unsupported `pretrained_model_name_or_path` type: Expected `str` or `PreTrainedTokenizerBase`, actual = `{type(pretrained_model_name_or_path)}`.") + f"Unsupported `pretrained_model_name_or_path` type: Expected `str` or `PreTrainedTokenizerBase`, actual = `{type(pretrained_model_name_or_path)}`." + ) tokenicer.model_config = auto_config(path, trust_remote_code) if tokenicer.model_config is None: logger.warning( - f"Auto model config retrieval from `pretrained_model_name_or_path` failed. " - f"Please pass a valid `model_or_path` argument to `auto_assign_pad_token()`.", + "Auto model config retrieval from `pretrained_model_name_or_path` failed. " + "Please pass a valid `model_or_path` argument to `auto_assign_pad_token()`.", ) tokenicer.auto_fix_pad_token(strict=strict, pad_tokens=pad_tokens) @@ -81,38 +91,48 @@ def auto_fix_pad_token( model_config = None if model_or_path is not None: if isinstance(model_or_path, str): - model_config = auto_config(model_or_path, self.tokenizer.trust_remote_code) + model_config = auto_config( + model_or_path, self.tokenizer.trust_remote_code + ) elif isinstance(model_or_path, PreTrainedModel): model_config = getattr(model_or_path, "config", None) else: raise ValueError( - f"Unsupported `model_or_path` type: Expected `str` or `PreTrainedModel`, actual = `{type(model_or_path)}`.") + f"Unsupported `model_or_path` type: Expected `str` or `PreTrainedModel`, actual = `{type(model_or_path)}`." + ) if model_config is None: - raise ValueError("Can not retrieve config from the provided `model_or_path`.") + raise ValueError( + "Can not retrieve config from the provided `model_or_path`." + ) else: if self.model_config is not None: model_config = self.model_config else: raise ValueError( - f"Auto model config retrieval from `pretrained_model_name_or_path` failed. 
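    # Aside (illustrative, not part of this patch): the f-prefixes were dropped
    # because nothing is interpolated; Python concatenates adjacent string
    # literals at compile time, so the quoted lines around this note still form
    # a single warning message. Standalone illustration:
    msg = ("Auto model config retrieval failed. "
           "Please pass a valid `model_or_path`.")
    assert "failed. Please" in msg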
" - f"Please pass a valid `model_or_path` argument to `auto_assign_pad_token()`.", - ) + "Auto model config retrieval from `pretrained_model_name_or_path` failed. " + "Please pass a valid `model_or_path` argument to `auto_assign_pad_token()`.", + ) self.auto_fix_model_config(model_config) pad_token_id = model_config.pad_token_id - if pad_token_id is None or pad_token_id in [model_config.bos_token_id, model_config.eos_token_id]: - pad_token_id = self._auto_map_pad_token(model_config=model_config, pad_tokens=pad_tokens) + if pad_token_id is None or pad_token_id in [ + model_config.bos_token_id, + model_config.eos_token_id, + ]: + pad_token_id = self._auto_map_pad_token( + model_config=model_config, pad_tokens=pad_tokens + ) if not strict: if pad_token_id is None and self.tokenizer.eos_token_id is not None: pad_token_id = self.tokenizer.eos_token_id logger.warning( - f"Auto model config unable to fix `pad_token`, Use tokenizer.eos_token as pad_token" - f"pad_token = eos_token, There may be problems with the model during training or inference." - f"It is recommended that you manually pass a `pad_tokens` to `load()`", + "Auto model config unable to fix `pad_token`, Use tokenizer.eos_token as pad_token" + "pad_token = eos_token, There may be problems with the model during training or inference." + "It is recommended that you manually pass a `pad_tokens` to `load()`", ) if pad_token_id is None: @@ -123,7 +143,9 @@ def auto_fix_pad_token( self.tokenizer.pad_token_id = pad_token_id self.tokenizer.pad_token = self.tokenizer.decode([pad_token_id]) - logger.info(f"Auto fixed pad_token_id={pad_token_id} (token='{self.tokenizer.pad_token}').") + logger.info( + f"Auto fixed pad_token_id={pad_token_id} (token='{self.tokenizer.pad_token}')." + ) def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]: pad_token_id = None @@ -135,7 +157,10 @@ def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]: pad_token_id = candidate_id(pad_tokens, vocab) # Match MODEL_PAD_TOKEN_MAP to get pad token - if pad_token_id is None and MODEL_PAD_TOKEN_MAP.get(model_config.model_type, None) is not None: + if ( + pad_token_id is None + and MODEL_PAD_TOKEN_MAP.get(model_config.model_type, None) is not None + ): token_tuple = MODEL_PAD_TOKEN_MAP.get(model_config.model_type) pad_token = token_tuple.token token_id = vocab.get(pad_token, None) @@ -148,7 +173,10 @@ def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]: # Use eos_token as pad token if pad_token_id is None: - if isinstance(model_config.eos_token_id, list) and model_config.eos_token_id: + if ( + isinstance(model_config.eos_token_id, list) + and model_config.eos_token_id + ): pad_token_id = model_config.eos_token_id[0] else: pad_token_id = model_config.eos_token_id @@ -156,29 +184,47 @@ def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]: return pad_token_id def auto_fix_model_config(self, model_config): - if model_config.bos_token_id is None and self.tokenizer.bos_token_id is not None: + if ( + model_config.bos_token_id is None + and self.tokenizer.bos_token_id is not None + ): model_config.bos_token = self.tokenizer.bos_token model_config.bos_token_id = self.tokenizer.bos_token_id - if model_config.eos_token_id is None and self.tokenizer.eos_token_id is not None: + if ( + model_config.eos_token_id is None + and self.tokenizer.eos_token_id is not None + ): model_config.eos_token = self.tokenizer.eos_token model_config.eos_token_id = self.tokenizer.eos_token_id - def save(self, save_dir: 
Union[str, os.PathLike], use_chat_template: bool = True) -> str:
-        return _save(save_dir=save_dir, tokenizer=self.tokenizer, use_chat_template=use_chat_template)
+    def save(
+        self, save_dir: Union[str, os.PathLike], use_chat_template: bool = True
+    ) -> str:
+        return _save(
+            save_dir=save_dir,
+            tokenizer=self.tokenizer,
+            use_chat_template=use_chat_template,
+        )
 
     def validate(self, save_dir: Union[str, os.PathLike] = None) -> bool:
         return _validate(self.tokenizer, save_dir=save_dir)
 
-    def save_pretrained(self, save_directory: Union[str, os.PathLike], use_chat_template: bool = True, **kwargs,) -> Tuple[str]:
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        use_chat_template: bool = True,
+        **kwargs,
+    ) -> Tuple[str]:
         self.save(save_dir=save_directory, use_chat_template=use_chat_template)
         return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs)
 
     def __getattr__(self, name):
         if hasattr(self.tokenizer, name):
             return getattr(self.tokenizer, name)
-        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
 
     def __call__(self, data, **kwargs):
         return self.tokenizer(data, **kwargs)
-
diff --git a/tokenicer/util.py b/tokenicer/util.py
index 596afd2..57e1c15 100644
--- a/tokenicer/util.py
+++ b/tokenicer/util.py
@@ -20,7 +20,9 @@
 from transformers import AutoConfig, PretrainedConfig
 
 
-def candidate_ids(token_list: List[Union[str, int]], vocab: dict) -> List[Optional[int]]:
+def candidate_ids(
+    token_list: List[Union[str, int]], vocab: dict
+) -> List[Optional[int]]:
     token_ids = []
     for item in token_list:
         if isinstance(item, str):
diff --git a/tokenicer/validate.py b/tokenicer/validate.py
index 05e2ff6..551058f 100644
--- a/tokenicer/validate.py
+++ b/tokenicer/validate.py
@@ -27,29 +27,33 @@
 def _validate_file_exist(tokenizer):
     path = config_path(tokenizer)
     if path is None:
-        raise ValueError("Can not retrieve config path from the provided `pretrained_model_name_or_path`.")
+        raise ValueError(
+            "Cannot retrieve config path from the provided `pretrained_model_name_or_path`." 
+        )
 
     validate_json_path = os.path.join(path, VALIDATE_JSON_FILE_NAME)
     return isfile(validate_json_path), validate_json_path
 
 
 def _save(
-        save_dir: Union[str, os.PathLike],
-        tokenizer: PreTrainedTokenizerBase,
-        use_chat_template: bool = True
-    ):
+    save_dir: Union[str, os.PathLike],
+    tokenizer: PreTrainedTokenizerBase,
+    use_chat_template: bool = True,
+) -> str:
     os.makedirs(save_dir, exist_ok=True)
 
     validate_json_path = os.path.join(save_dir, VALIDATE_JSON_FILE_NAME)
     exist = isfile(validate_json_path)
     if exist:
         import logging
+
         logger = logging.getLogger(__name__)
         logger.warning(f"Validate file:{validate_json_path} already exists.")
         return validate_json_path
 
     if use_chat_template and tokenizer.chat_template is None:
         import logging
+
         logger = logging.getLogger(__name__)
         logger.warning("Tokenizer does not support chat template.")
         use_chat_template = False
@@ -76,13 +80,16 @@ def _save(
 
     validate_dic = ValidateConfig(data=results).to_dict()
 
-    with open(validate_json_path, 'w', encoding='utf-8') as f:
+    with open(validate_json_path, "w", encoding="utf-8") as f:
         json.dump(validate_dic, f, indent=4)
-        f.write('\n')
+        f.write("\n")
 
     return validate_json_path
 
-def _validate(tokenizer: PreTrainedTokenizerBase, save_dir: Optional[Union[str, os.PathLike]] = None) -> bool:
+def _validate(
+    tokenizer: PreTrainedTokenizerBase,
+    save_dir: Optional[Union[str, os.PathLike]] = None,
+) -> bool:
     exist = False
 
     if save_dir is not None:
@@ -92,19 +99,25 @@ def _validate(tokenizer: PreTrainedTokenizerBase, save_dir: Optional[Union[str,
     if not exist:
         exist, validate_json_path = _validate_file_exist(tokenizer)
         if not exist:
-            raise ValueError("Validate file does not exist, please call the `save()` API first.")
+            raise ValueError(
+                "Validate file does not exist; please call the `save()` API first."
+            )
 
-    with open(validate_json_path, 'r', encoding='utf-8') as f:
+    with open(validate_json_path, "r", encoding="utf-8") as f:
         data = json.loads(f.read())
 
     config = ValidateConfig.from_dict(data)
 
     if config is None or len(config.data) == 0:
-        raise ValueError(f"Init validate data failed, please check {validate_json_path}.")
+        raise ValueError(
+            f"Failed to initialize validate data; please check {validate_json_path}." 
+        )
 
     for data in config.data:
         input = data.input
-        tokenized = tokenizer.encode_plus(input, **VALIDATE_ENCODE_PARAMS)["input_ids"].tolist()[0]
+        tokenized = tokenizer.encode_plus(input, **VALIDATE_ENCODE_PARAMS)[
+            "input_ids"
+        ].tolist()[0]
 
         if data.output != tokenized:
             return False

From 893d8e93fca4ecdf7d19f481b4976719655bd1ab Mon Sep 17 00:00:00 2001
From: CL-ModelCloud
Date: Mon, 3 Mar 2025 07:32:02 +0000
Subject: [PATCH 32/32] code update

---
 tokenicer/tokenicer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tokenicer/tokenicer.py b/tokenicer/tokenicer.py
index 74abb25..6b68ce2 100644
--- a/tokenicer/tokenicer.py
+++ b/tokenicer/tokenicer.py
@@ -198,11 +198,11 @@ def validate(self, save_dir: Union[str, os.PathLike] = None) -> bool:
         return _validate(self.tokenizer, save_dir=save_dir)
 
     def save_pretrained(
-            self,
-            save_directory: Union[str, os.PathLike],
-            use_chat_template: bool = True,
-            **kwargs,
-    ) -> Tuple[str]:
+        self,
+        save_directory: Union[str, os.PathLike],
+        use_chat_template: bool = True,
+        **kwargs,
+    ) -> Tuple[str]:
         self.save(save_dir=save_directory, use_chat_template=use_chat_template)
         return self.tokenizer.save_pretrained(save_directory=save_directory, **kwargs)
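
Reviewer note: the series above converges on a save/validate round-trip for catching tokenizer regressions. Below is a minimal usage sketch based on the `load`, `save_pretrained`, and `validate` signatures in these patches; the model id and save directory are illustrative placeholders, not values taken from the series.

```python
# Illustrative sketch, assuming the tokenicer package from this patch series
# is installed. The model id and directory below are placeholder assumptions.
from tokenicer.tokenicer import Tokenicer

# load() wraps AutoTokenizer.from_pretrained() and auto-fixes pad_token
# using the model config and MODEL_PAD_TOKEN_MAP.
tokenicer = Tokenicer.load("Qwen/Qwen2.5-0.5B-Instruct")

# save_pretrained() saves the tokenizer files and, via save(), also writes a
# validate JSON holding reference encodings of the multilingual prompts
# (plus chat-template variants when the tokenizer defines a chat template).
save_dir = "./qwen-tokenizer-validated"
tokenicer.save_pretrained(save_dir)

# validate() re-encodes the stored inputs with the current tokenizer and
# compares the token ids against the saved reference; any drift (vocab edit,
# normalizer change, different pad/eos fix-up) makes it return False.
assert tokenicer.validate(save_dir)
```

The design intent reflected in `_save`/`_validate` is that tokenizer drift surfaces as a single boolean check at load time rather than as silent training or inference skew.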