diff --git a/examples/How_to_count_tokens_with_tiktoken.ipynb b/examples/How_to_count_tokens_with_tiktoken.ipynb
index 19ce768921..b21cb24f61 100644
--- a/examples/How_to_count_tokens_with_tiktoken.ipynb
+++ b/examples/How_to_count_tokens_with_tiktoken.ipynb
@@ -197,16 +197,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
-    "    \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
-    "    encoding = tiktoken.get_encoding(encoding_name)\n",
-    "    num_tokens = len(encoding.encode(string))\n",
-    "    return num_tokens"
-   ]
+   "source": "# Import the num_tokens_from_string function from our utility module.\n# This assumes the notebook is run from the examples/ directory, where utils/ lives.\nfrom utils.token_counting_utils import num_tokens_from_string"
   },
   {
    "cell_type": "code",
@@ -460,54 +454,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "def num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\"):\n",
-    "    \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n",
-    "    try:\n",
-    "        encoding = tiktoken.encoding_for_model(model)\n",
-    "    except KeyError:\n",
-    "        print(\"Warning: model not found. Using o200k_base encoding.\")\n",
-    "        encoding = tiktoken.get_encoding(\"o200k_base\")\n",
-    "    if model in {\n",
-    "        \"gpt-3.5-turbo-0125\",\n",
-    "        \"gpt-4-0314\",\n",
-    "        \"gpt-4-32k-0314\",\n",
-    "        \"gpt-4-0613\",\n",
-    "        \"gpt-4-32k-0613\",\n",
-    "        \"gpt-4o-mini-2024-07-18\",\n",
-    "        \"gpt-4o-2024-08-06\"\n",
-    "        }:\n",
-    "        tokens_per_message = 3\n",
-    "        tokens_per_name = 1\n",
-    "    elif \"gpt-3.5-turbo\" in model:\n",
-    "        print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.\")\n",
-    "        return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0125\")\n",
-    "    elif \"gpt-4o-mini\" in model:\n",
-    "        print(\"Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\")\n",
-    "        return num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\")\n",
-    "    elif \"gpt-4o\" in model:\n",
-    "        print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
-    "        return num_tokens_from_messages(messages, model=\"gpt-4o-2024-08-06\")\n",
-    "    elif \"gpt-4\" in model:\n",
-    "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
-    "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
-    "    else:\n",
-    "        raise NotImplementedError(\n",
-    "            f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
-    "        )\n",
-    "    num_tokens = 0\n",
-    "    for message in messages:\n",
-    "        num_tokens += tokens_per_message\n",
-    "        for key, value in message.items():\n",
-    "            num_tokens += len(encoding.encode(value))\n",
-    "            if key == \"name\":\n",
-    "                num_tokens += tokens_per_name\n",
-    "    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>\n",
-    "    return num_tokens\n"
-   ]
+   "source": "# Import the unified token counting function from the shared utility module.\n# This assumes the notebook is run from the examples/ directory, where utils/ lives.\nfrom utils.token_counting_utils import num_tokens_from_messages\n\n# num_tokens_from_messages supports the models used in this notebook, including:\n# - gpt-3.5-turbo variants\n# - gpt-4 variants\n# - gpt-4o and gpt-4o-mini variants"
   },
   {
    "cell_type": "code",
@@ -811,4 +761,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/examples/How_to_format_inputs_to_ChatGPT_models.ipynb b/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
index d08fb9cc7f..ccafc368c4 100644
--- a/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
+++ b/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
@@ -508,53 +508,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "import tiktoken\n",
-    "\n",
-    "\n",
-    "def num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\"):\n",
-    "    \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n",
-    "    try:\n",
-    "        encoding = tiktoken.encoding_for_model(model)\n",
-    "    except KeyError:\n",
-    "        print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
-    "        encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
-    "    if model in {\n",
-    "        \"gpt-3.5-turbo-0613\",\n",
-    "        \"gpt-3.5-turbo-16k-0613\",\n",
-    "        \"gpt-4-0314\",\n",
-    "        \"gpt-4-32k-0314\",\n",
-    "        \"gpt-4-0613\",\n",
-    "        \"gpt-4-32k-0613\",\n",
-    "        }:\n",
-    "        tokens_per_message = 3\n",
-    "        tokens_per_name = 1\n",
-    "    elif model == \"gpt-3.5-turbo-0301\":\n",
-    "        tokens_per_message = 4  # every message follows <|start|>{role/name}\\n{content}<|end|>\\n\n",
-    "        tokens_per_name = -1  # if there's a name, the role is omitted\n",
-    "    elif \"gpt-3.5-turbo\" in model:\n",
-    "        print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\")\n",
-    "        return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\")\n",
-    "    elif \"gpt-4\" in model:\n",
-    "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
-    "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
-    "    else:\n",
-    "        raise NotImplementedError(\n",
-    "            f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
-    "        )\n",
-    "    num_tokens = 0\n",
-    "    for message in messages:\n",
-    "        num_tokens += tokens_per_message\n",
-    "        for key, value in message.items():\n",
-    "            num_tokens += len(encoding.encode(value))\n",
-    "            if key == \"name\":\n",
-    "                num_tokens += tokens_per_name\n",
-    "    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>\n",
-    "    return num_tokens\n"
-   ]
+   "source": "# Import the unified token counting function from the shared utility module.\n# This assumes the notebook is run from the examples/ directory, where utils/ lives.\nfrom utils.token_counting_utils import num_tokens_from_messages\n\n# num_tokens_from_messages supports the models used in this notebook, including:\n# - gpt-3.5-turbo variants\n# - gpt-4 variants\n# - gpt-4o and gpt-4o-mini variants"
   },
   {
    "cell_type": "code",
@@ -678,4 +635,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/examples/utils/token_counting_utils.py b/examples/utils/token_counting_utils.py
new file mode 100644
index 0000000000..d4bf54beb0
--- /dev/null
+++ b/examples/utils/token_counting_utils.py
@@ -0,0 +1,129 @@
+"""
+Utility functions for counting tokens used by OpenAI models.
+
+This module provides functions to estimate the number of tokens that will be
+used by various OpenAI models when processing messages.
+"""
+
+import tiktoken
+
+
+def num_tokens_from_messages(messages, model="gpt-4o-mini"):
+    """
+    Return the number of tokens used by a list of messages.
+
+    Args:
+        messages: List of message dictionaries with 'role' and 'content' keys
+        model: Model name string (e.g., "gpt-4", "gpt-3.5-turbo", "gpt-4o-mini")
+
+    Returns:
+        int: Estimated number of tokens used by the messages
+
+    Note:
+        Token counts are estimates and may vary slightly from actual API usage.
+        The exact token counting method may change between model versions.
+    """
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        print(f"Warning: model {model} not found. Using o200k_base encoding.")
+        encoding = tiktoken.get_encoding("o200k_base")
+
+    # Models that use o200k_base encoding
+    if model in {
+        "gpt-4o",
+        "gpt-4o-2024-05-13",
+        "gpt-4o-2024-08-06",
+        "gpt-4o-mini",
+        "gpt-4o-mini-2024-07-18",
+    }:
+        encoding = tiktoken.get_encoding("o200k_base")
+    # Models that use cl100k_base encoding
+    elif model in {
+        "gpt-3.5-turbo-0125",
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-16k-0613",
+        "gpt-4-0314",
+        "gpt-4-32k-0314",
+        "gpt-4-0613",
+        "gpt-4-32k-0613",
+    }:
+        encoding = tiktoken.get_encoding("cl100k_base")
+
+    # Set tokens per message and per name based on model
+    if model in {
+        "gpt-3.5-turbo-0125",
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-16k-0613",
+        "gpt-4-0314",
+        "gpt-4-32k-0314",
+        "gpt-4-0613",
+        "gpt-4-32k-0613",
+        "gpt-4o-mini-2024-07-18",
+        "gpt-4o-mini",
+        "gpt-4o-2024-08-06",
+        "gpt-4o",
+    }:
+        tokens_per_message = 3
+        tokens_per_name = 1
+    elif model == "gpt-3.5-turbo-0301":
+        # Special handling for gpt-3.5-turbo-0301
+        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        tokens_per_name = -1  # if there's a name, the role is omitted
+    # Handle base model names that may update over time
+    elif "gpt-3.5-turbo" in model:
+        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.")
+        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125")
+    elif "gpt-4o-mini" in model:
+        print("Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.")
+        return num_tokens_from_messages(messages, model="gpt-4o-mini-2024-07-18")
+    elif "gpt-4o" in model:
+        print("Warning: gpt-4o may update over time. Returning num tokens assuming gpt-4o-2024-08-06.")
+        return num_tokens_from_messages(messages, model="gpt-4o-2024-08-06")
+    elif "gpt-4" in model:
+        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
+        return num_tokens_from_messages(messages, model="gpt-4-0613")
+    else:
+        raise NotImplementedError(
+            f"num_tokens_from_messages() is not implemented for model {model}. "
+            f"See https://github.com/openai/openai-python/blob/main/chatml.md "
+            f"for information on how messages are converted to tokens."
+        )
+
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+        for key, value in message.items():
+            num_tokens += len(encoding.encode(value))
+            if key == "name":
+                num_tokens += tokens_per_name
+    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
+    return num_tokens
+
+
+def num_tokens_from_string(string: str, encoding_name: str) -> int:
+    """
+    Returns the number of tokens in a text string using the specified encoding.
+
+    Args:
+        string: The text string to tokenize
+        encoding_name: The name of the encoding to use (e.g., "cl100k_base", "o200k_base")
+
+    Returns:
+        int: Number of tokens in the string
+    """
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
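+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (requires tiktoken to be installed). The example
+    # messages below are illustrative only; printed counts may vary with the
+    # installed tiktoken version and the exact model snapshot.
+    example_messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "How many tokens will this request use?"},
+    ]
+    print(num_tokens_from_string("tiktoken is great!", "o200k_base"))
+    print(num_tokens_from_messages(example_messages, model="gpt-4o-mini-2024-07-18"))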