diff --git a/Package.swift b/Package.swift index 69fbc17e..caa95675 100644 --- a/Package.swift +++ b/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version: 5.9 +// swift-tools-version: 6.1 // The swift-tools-version declares the minimum version of Swift required to build this package. import PackageDescription @@ -10,34 +10,72 @@ let swiftSettings: [SwiftSetting] = [ let package = Package( name: "swift-transformers", - platforms: [.iOS(.v16), .macOS(.v13)], + platforms: [.iOS(.v17), .macOS(.v14)], products: [ + .library(name: "Hub", targets: ["Hub"]), + // ^ Hub client library + .library(name: "Tokenizers", targets: ["Tokenizers"]), + // ^ Tokenizers with optional chat template support via traits .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), + // ^ Everything, including Core ML inference .executable(name: "transformers", targets: ["TransformersCLI"]), .executable(name: "hub-cli", targets: ["HubCLI"]), ], + traits: [ + .trait( + name: "ChatTemplates", + description: + "Enables chat template support with Jinja templating engine (Swift 6.1+ only)" + ), + ], dependencies: [ - .package(url: "https://github.com/apple/swift-argument-parser.git", .upToNextMinor(from: "1.4.0")), + .package( + url: "https://github.com/apple/swift-argument-parser.git", .upToNextMinor(from: "1.4.0") + ), .package(url: "https://github.com/johnmai-dev/Jinja", .upToNextMinor(from: "1.2.1")), ], targets: [ .executableTarget( name: "TransformersCLI", dependencies: [ - "Models", "Generation", "Tokenizers", - .product(name: "ArgumentParser", package: "swift-argument-parser"), + "Models", .product(name: "ArgumentParser", package: "swift-argument-parser"), + ] + ), + .executableTarget( + name: "HubCLI", + dependencies: [ + "Hub", .product(name: "ArgumentParser", package: "swift-argument-parser"), ] ), - .executableTarget(name: "HubCLI", dependencies: ["Hub", .product(name: "ArgumentParser", package: "swift-argument-parser")]), - .target(name: "Hub", resources: [.process("FallbackConfigs")], swiftSettings: swiftSettings), - .target(name: "Tokenizers", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]), + .target( + name: "Hub", resources: [.process("FallbackConfigs")], swiftSettings: swiftSettings + ), + .target( + name: "Tokenizers", + dependencies: [ + "Hub", + .product( + name: "Jinja", package: "Jinja", condition: .when(traits: ["ChatTemplates"]) + ), + ], + swiftSettings: swiftSettings + ), .target(name: "TensorUtils"), .target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]), .target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]), - .testTarget(name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], resources: [.process("Resources"), .process("Vocabs")]), - .testTarget(name: "HubTests", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")], swiftSettings: swiftSettings), + .testTarget( + name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], + resources: [.process("Resources"), .process("Vocabs")] + ), + .testTarget( + name: "HubTests", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")], + swiftSettings: swiftSettings + ), .testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]), - .testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils", "Models", "Hub"], resources: [.process("Resources")]), + .testTarget( + name: "TensorUtilsTests", dependencies: ["TensorUtils", "Models", "Hub"], + resources: [.process("Resources")] + ), .testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]), .testTarget(name: "PostProcessorTests", dependencies: ["Tokenizers", "Hub"]), ] diff --git a/Package@swift-5.9.swift b/Package@swift-5.9.swift new file mode 100644 index 00000000..c650380e --- /dev/null +++ b/Package@swift-5.9.swift @@ -0,0 +1,46 @@ +// swift-tools-version: 5.9 +// The swift-tools-version declares the minimum version of Swift required to build this package. + +import PackageDescription + +/// Define the strict concurrency settings to be applied to all targets. +let swiftSettings: [SwiftSetting] = [ + .enableExperimentalFeature("StrictConcurrency"), +] + +let package = Package( + name: "swift-transformers", + platforms: [.iOS(.v16), .macOS(.v13)], + products: [ + .library(name: "Hub", targets: ["Hub"]), + // ^ Hub client library + .library(name: "Tokenizers", targets: ["Tokenizers"]), + // ^ Tokenizers library with chat template support + .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), + // ^ Everything, including Core ML inference + .executable(name: "transformers", targets: ["TransformersCLI"]), + .executable(name: "hub-cli", targets: ["HubCLI"]), + ], + dependencies: [ + .package(url: "https://github.com/apple/swift-argument-parser.git", .upToNextMinor(from: "1.4.0")), + .package(url: "https://github.com/johnmai-dev/Jinja", .upToNextMinor(from: "1.2.1")), + ], + targets: [ + .executableTarget( + name: "TransformersCLI", + dependencies: ["Models", .product(name: "ArgumentParser", package: "swift-argument-parser")] + ), + .executableTarget(name: "HubCLI", dependencies: ["Hub", .product(name: "ArgumentParser", package: "swift-argument-parser")]), + .target(name: "Hub", resources: [.process("FallbackConfigs")], swiftSettings: swiftSettings), + .target(name: "Tokenizers", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]), + .target(name: "TensorUtils"), + .target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]), + .target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]), + .testTarget(name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], resources: [.process("Resources"), .process("Vocabs")]), + .testTarget(name: "HubTests", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")], swiftSettings: swiftSettings), + .testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]), + .testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils", "Models", "Hub"], resources: [.process("Resources")]), + .testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]), + .testTarget(name: "PostProcessorTests", dependencies: ["Tokenizers", "Hub"]), + ] +) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 1f6607c2..8a4807a8 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -7,7 +7,10 @@ import Foundation import Hub + +#if canImport(Jinja) import Jinja +#endif public typealias Message = [String: Any] public typealias ToolSpec = [String: Any] @@ -26,23 +29,48 @@ public enum TokenizerError: LocalizedError { public var errorDescription: String? { switch self { case .missingConfig: - String(localized: "Tokenizer configuration is missing.", comment: "Error when tokenizer config cannot be found") + String( + localized: "Tokenizer configuration is missing.", + comment: "Error when tokenizer config cannot be found" + ) case .missingTokenizerClassInConfig: - String(localized: "The tokenizer class is not specified in the configuration.", comment: "Error when tokenizer_class is missing in config") + String( + localized: "The tokenizer class is not specified in the configuration.", + comment: "Error when tokenizer_class is missing in config" + ) case let .unsupportedTokenizer(name): - String(localized: "The tokenizer type '\(name)' is not supported.", comment: "Error when tokenizer type is not supported") + String( + localized: "The tokenizer type '\(name)' is not supported.", + comment: "Error when tokenizer type is not supported" + ) case .missingVocab: - String(localized: "Vocabulary file is missing from the tokenizer configuration.", comment: "Error when vocab file is missing") + String( + localized: "Vocabulary file is missing from the tokenizer configuration.", + comment: "Error when vocab file is missing" + ) case .malformedVocab: - String(localized: "The vocabulary file is malformed or corrupted.", comment: "Error when vocab file is malformed") + String( + localized: "The vocabulary file is malformed or corrupted.", + comment: "Error when vocab file is malformed" + ) case let .chatTemplate(message): - String(localized: "Chat template error: \(message)", comment: "Error with chat template") + String( + localized: "Chat template error: \(message)", comment: "Error with chat template" + ) case .missingChatTemplate: - String(localized: "This tokenizer does not have a chat template, and no template was passed.") + String( + localized: + "This tokenizer does not have a chat template, and no template was passed.") case let .tooLong(message): - String(localized: "Input is too long: \(message)", comment: "Error when input exceeds maximum length") + String( + localized: "Input is too long: \(message)", + comment: "Error when input exceeds maximum length" + ) case let .mismatchedConfig(message): - String(localized: "Tokenizer configuration mismatch: \(message)", comment: "Error when tokenizer configuration is inconsistent") + String( + localized: "Tokenizer configuration mismatch: \(message)", + comment: "Error when tokenizer configuration is inconsistent" + ) } } } @@ -70,7 +98,7 @@ public protocol TokenizingModel { } /// Helper - possibly to be moved somewhere else -func addedTokenAsString(_ addedToken: Config?) -> String? { +public func addedTokenAsString(_ addedToken: Config?) -> String? { guard let addedToken else { return nil } if let stringValue = addedToken.string() { return stringValue @@ -122,7 +150,9 @@ struct TokenizerModel { tokenizerConfig.unkToken.content.string() ?? tokenizerConfig.unkToken.string() } - static func from(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String: Int]) throws -> TokenizingModel { + static func from(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String: Int]) + throws -> TokenizingModel + { guard let tokenizerClassName = tokenizerConfig.tokenizerClass.string() else { throw TokenizerError.missingTokenizerClassInConfig } @@ -133,7 +163,9 @@ struct TokenizerModel { throw TokenizerError.unsupportedTokenizer(tokenizerName) } - return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens) + return try tokenizerClass.init( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens + ) } } @@ -178,7 +210,9 @@ public protocol Tokenizer { func applyChatTemplate(messages: [Message], tools: [ToolSpec]?) throws -> [Int] /// The appropriate chat template is selected from the tokenizer config - func applyChatTemplate(messages: [Message], tools: [ToolSpec]?, additionalContext: [String: Any]?) throws -> [Int] + func applyChatTemplate( + messages: [Message], tools: [ToolSpec]?, additionalContext: [String: Any]? + ) throws -> [Int] /// The chat template is provided as a string literal or specified by name func applyChatTemplate(messages: [Message], chatTemplate: ChatTemplateArgument) throws -> [Int] @@ -196,6 +230,14 @@ public protocol Tokenizer { tools: [ToolSpec]? ) throws -> [Int] + #if !canImport(Jinja) + @available( + *, + deprecated, + message: + "Chat template support requires Jinja dependency. In Swift 6.1+, enable the ChatTemplates trait in Package.swift. In Swift <6.1, Jinja is always included." + ) + #endif func applyChatTemplate( messages: [Message], // A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config. Normally this is not necessary. @@ -211,7 +253,15 @@ public protocol Tokenizer { extension Tokenizer { public var hasChatTemplate: Bool { false } - /// Call previous signature for backwards compatibility + // Call previous signature for backwards compatibility + #if !canImport(Jinja) + @available( + *, + deprecated, + message: + "Chat template support requires Jinja dependency. In Swift 6.1+, enable the ChatTemplates trait in Package.swift. In Swift <6.1, Jinja is always included." + ) + #endif func applyChatTemplate( messages: [Message], // A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config. Normally this is not necessary. @@ -224,7 +274,9 @@ extension Tokenizer { ) throws -> [Int] { if additionalContext == nil { try applyChatTemplate( - messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, + messages: messages, chatTemplate: chatTemplate, + addGenerationPrompt: addGenerationPrompt, truncation: truncation, + maxLength: maxLength, tools: tools ) } else { @@ -233,6 +285,31 @@ extension Tokenizer { } } +public extension Tokenizer { + func applyChatTemplate(messages: [Message]) throws -> [Int] { + try applyChatTemplate( + messages: messages, chatTemplate: nil, addGenerationPrompt: true, truncation: false, + maxLength: nil, tools: nil + ) + } + + func applyChatTemplate(messages: [Message], chatTemplate: ChatTemplateArgument) throws + -> [Int] + { + try applyChatTemplate( + messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true, + truncation: false, maxLength: nil, tools: nil + ) + } + + func applyChatTemplate(messages: [Message], chatTemplate: String) throws -> [Int] { + try applyChatTemplate( + messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true, + truncation: false, maxLength: nil, tools: nil + ) + } +} + public extension Tokenizer { func callAsFunction(_ text: String, addSpecialTokens: Bool = true) -> [Int] { encode(text: text, addSpecialTokens: addSpecialTokens) @@ -251,18 +328,8 @@ public extension Tokenizer { } } -let specialTokenAttributes: [String] = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens", -] - -public class PreTrainedTokenizer: Tokenizer { +/// open because we have to subclass from `TokenizersTemplates` +open class PreTrainedTokenizer: Tokenizer { let model: TokenizingModel public var bosToken: String? { model.bosToken } @@ -273,24 +340,28 @@ public class PreTrainedTokenizer: Tokenizer { public var unknownTokenId: Int? { model.unknownTokenId } public var fuseUnknownTokens: Bool { model.fuseUnknownTokens } - private let addedTokens: Set - private let specialTokens: [String: Int] - private let addedTokensRegex: NSRegularExpression? + let addedTokens: Set + let specialTokens: [String: Int] + let addedTokensRegex: NSRegularExpression? - private let preTokenizer: PreTokenizer? - private let normalizer: Normalizer? - private let postProcessor: PostProcessor? - private let decoder: Decoder? - private let tokenizerConfig: Config + let preTokenizer: PreTokenizer? + let normalizer: Normalizer? + let postProcessor: PostProcessor? + let decoder: Decoder? + public let tokenizerConfig: Config - private let cleanUpTokenizationSpaces: Bool + let cleanUpTokenizationSpaces: Bool public required init(tokenizerConfig: Config, tokenizerData: Config) throws { var addedTokens: [String: Int] = [:] var specialTokens: [String: Int] = [:] for addedToken in tokenizerData["addedTokens"].array(or: []) { - guard let id = addedToken["id"].integer() else { continue /* malformed: token with no id */ } - guard let content = addedToken.content.string() else { continue /* malformed: token with no content */ } + guard let id = addedToken["id"].integer() else { + continue /* malformed: token with no id */ + } + guard let content = addedToken.content.string() else { + continue /* malformed: token with no content */ + } addedTokens[content] = id if addedToken["special"].boolean(or: false) { @@ -300,14 +371,16 @@ public class PreTrainedTokenizer: Tokenizer { // Convert to tuples for easier access, then sort by length (descending) to avoid early partial matches // (https://github.com/xenova/transformers.js/commit/c305c3824f628f1f02806a6310bd3b18b0f7f8f5) - let unwrappedAddedTokens: [(content: String, prefix: Bool, suffix: Bool)] = (tokenizerData["addedTokens"].array(or: [])).compactMap { addedToken -> (String, Bool, Bool)? in - guard let content = addedToken.content.string() else { return nil } - let prefix = addedToken["lstrip"].boolean(or: false) - let suffix = addedToken["rstrip"].boolean(or: false) - return (content: content, prefix: prefix, suffix: suffix) - }.sorted { - $0.content.count > $1.content.count - } + let unwrappedAddedTokens: [(content: String, prefix: Bool, suffix: Bool)] = + (tokenizerData["addedTokens"].array(or: [])).compactMap { + addedToken -> (String, Bool, Bool)? in + guard let content = addedToken.content.string() else { return nil } + let prefix = addedToken["lstrip"].boolean(or: false) + let suffix = addedToken["rstrip"].boolean(or: false) + return (content: content, prefix: prefix, suffix: suffix) + }.sorted { + $0.content.count > $1.content.count + } // then concatenate into regular expression let addedTokensRegexString = unwrappedAddedTokens.map { @@ -325,11 +398,15 @@ public class PreTrainedTokenizer: Tokenizer { preTokenizer = PreTokenizerFactory.fromConfig(config: tokenizerData["preTokenizer"]) normalizer = NormalizerFactory.fromConfig(config: tokenizerData["normalizer"]) postProcessor = PostProcessorFactory.fromConfig(config: tokenizerData["postProcessor"]) - decoder = DecoderFactory.fromConfig(config: tokenizerData["decoder"], addedTokens: self.addedTokens) + decoder = DecoderFactory.fromConfig( + config: tokenizerData["decoder"], addedTokens: self.addedTokens + ) cleanUpTokenizationSpaces = tokenizerConfig.cleanUpTokenizationSpaces.boolean(or: true) self.tokenizerConfig = tokenizerConfig - model = try TokenizerModel.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens) + model = try TokenizerModel.from( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens + ) } func preTokenize(_ text: String, options: PreTokenizerOptions) -> [String] { @@ -372,7 +449,8 @@ public class PreTrainedTokenizer: Tokenizer { func fuseUnknown(_ tokens: [String]) -> [String] { guard fuseUnknownTokens else { return tokens } - let (fused, _) = tokens.reduce((fused: [String](), previousIsUnknown: false)) { result, token in + let (fused, _) = tokens.reduce((fused: [String](), previousIsUnknown: false)) { + result, token in var (fused, previousIsUnknown) = result let isUnknown = model.convertTokenToId(token) == model.unknownTokenId if isUnknown { @@ -387,20 +465,25 @@ public class PreTrainedTokenizer: Tokenizer { public func tokenize(text: String) -> [String] { // Take care of special tokens first - let sections: [String] = if let regex = addedTokensRegex { - text.split(by: regex) - } else { - [text] - } + let sections: [String] = + if let regex = addedTokensRegex { + text.split(by: regex) + } else { + [text] + } return sections.enumerated().map { section, x in if addedTokens.contains(x) { return [x] } - return preTokenize(normalize(x), options: section == 0 ? [.firstSection] : []).flatMap { model($0) } + return preTokenize(normalize(x), options: section == 0 ? [.firstSection] : []).flatMap { + model($0) + } }.flatMap { fuseUnknown($0) } } /// Main entry point public func encode(text: String, addSpecialTokens: Bool = true) -> [Int] { - postProcess(tokenize(text: text), addSpecialTokens: addSpecialTokens).map { model.convertTokenToId($0)! } + postProcess(tokenize(text: text), addSpecialTokens: addSpecialTokens).map { + model.convertTokenToId($0)! + } } public func encode(text: String) -> [Int] { @@ -436,15 +519,17 @@ public class PreTrainedTokenizer: Tokenizer { !tokenizerConfig.chatTemplate.isNull() } - public func applyChatTemplate(messages: [Message]) throws -> [Int] { + open func applyChatTemplate(messages: [Message]) throws -> [Int] { try applyChatTemplate(messages: messages, addGenerationPrompt: true) } - public func applyChatTemplate(messages: [Message], tools: [ToolSpec]? = nil) throws -> [Int] { + open func applyChatTemplate(messages: [Message], tools: [ToolSpec]? = nil) throws -> [Int] { try applyChatTemplate(messages: messages, addGenerationPrompt: true, tools: tools) } - public func applyChatTemplate(messages: [Message], tools: [ToolSpec]? = nil, additionalContext: [String: Any]? = nil) throws + open func applyChatTemplate( + messages: [Message], tools: [ToolSpec]? = nil, additionalContext: [String: Any]? = nil + ) throws -> [Int] { try applyChatTemplate( @@ -455,15 +540,21 @@ public class PreTrainedTokenizer: Tokenizer { ) } - public func applyChatTemplate(messages: [Message], chatTemplate: ChatTemplateArgument) throws -> [Int] { - try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true) + open func applyChatTemplate(messages: [Message], chatTemplate: ChatTemplateArgument) throws + -> [Int] + { + try applyChatTemplate( + messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true + ) } - public func applyChatTemplate(messages: [Message], chatTemplate: String) throws -> [Int] { - try applyChatTemplate(messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true) + open func applyChatTemplate(messages: [Message], chatTemplate: String) throws -> [Int] { + try applyChatTemplate( + messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true + ) } - public func applyChatTemplate( + open func applyChatTemplate( messages: [Message], chatTemplate: ChatTemplateArgument? = nil, addGenerationPrompt: Bool = false, @@ -472,12 +563,21 @@ public class PreTrainedTokenizer: Tokenizer { tools: [ToolSpec]? = nil ) throws -> [Int] { try applyChatTemplate( - messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, + messages: messages, chatTemplate: chatTemplate, + addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, tools: tools, additionalContext: nil ) } - public func applyChatTemplate( + #if !canImport(Jinja) + @available( + *, + deprecated, + message: + "Chat template support requires Jinja dependency. In Swift 6.1+, enable the ChatTemplates trait in Package.swift. In Swift <6.1, Jinja is always included." + ) + #endif + open func applyChatTemplate( messages: [Message], chatTemplate: ChatTemplateArgument? = nil, addGenerationPrompt: Bool = false, @@ -491,6 +591,7 @@ public class PreTrainedTokenizer: Tokenizer { tools: [ToolSpec]? = nil, additionalContext: [String: Any]? = nil ) throws -> [Int] { + #if canImport(Jinja) var selectedChatTemplate: String? if let chatTemplate, case let .literal(template) = chatTemplate { // Use chat template from argument @@ -501,7 +602,9 @@ public class PreTrainedTokenizer: Tokenizer { // If the config specifies a list of chat templates, convert them to a dictionary let templateDict = [String: String]( uniqueKeysWithValues: arrayValue.compactMap { item in - guard let name = item["name"].string(), let template = item["template"].string() else { + guard let name = item["name"].string(), + let template = item["template"].string() + else { return nil } return (name, template) @@ -511,9 +614,13 @@ public class PreTrainedTokenizer: Tokenizer { if let matchingDictEntry = templateDict[name] { selectedChatTemplate = matchingDictEntry } else { - throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config") + throw TokenizerError.chatTemplate( + "No chat template named \"\(name)\" was found in the tokenizer config" + ) } - } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { + } else if let tools, !tools.isEmpty, + let toolUseTemplate = templateDict["tool_use"] + { // Use tool use chat template from config selectedChatTemplate = toolUseTemplate } else if let defaultChatTemplate = templateDict["default"] { @@ -549,6 +656,17 @@ public class PreTrainedTokenizer: Tokenizer { } } + let specialTokenAttributes: [String] = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + for (key, value) in tokenizerConfig.dictionary(or: [:]) { if specialTokenAttributes.contains(key.string), !value.isNull() { if let stringValue = value.string() { @@ -574,6 +692,11 @@ public class PreTrainedTokenizer: Tokenizer { } return encodedTokens + #else + throw TokenizerError.chatTemplate( + "Chat template support requires Jinja dependency. In Swift 6.1+, enable the ChatTemplates trait in Package.swift. In Swift <6.1, Jinja is always included." + ) + #endif } } @@ -597,6 +720,16 @@ public extension AutoTokenizer { // Some tokenizer_class entries use a Fast suffix let tokenizerName = tokenizerClassName.replacingOccurrences(of: "Fast", with: "") + + #if canImport(Jinja) + // Check for template-enabled classes first + if let tokenizerClass = PreTrainedTokenizerTemplateClasses.tokenizerClasses[ + tokenizerName + ] { + return tokenizerClass + } + #endif + if let tokenizerClass = PreTrainedTokenizerClasses.tokenizerClasses[tokenizerName] { return tokenizerClass } @@ -606,7 +739,9 @@ public extension AutoTokenizer { static func from(tokenizerConfig: Config, tokenizerData: Config) throws -> Tokenizer { let tokenizerClass = tokenizerClass(for: tokenizerConfig) - return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + return try tokenizerClass.init( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData + ) } static func from( @@ -614,10 +749,14 @@ public extension AutoTokenizer { hubApi: HubApi = .shared ) async throws -> Tokenizer { let config = LanguageModelConfigurationFromHub(modelName: model, hubApi: hubApi) - guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } + guard let tokenizerConfig = try await config.tokenizerConfig else { + throw TokenizerError.missingConfig + } let tokenizerData = try await config.tokenizerData - return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + return try AutoTokenizer.from( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData + ) } static func from( @@ -625,10 +764,14 @@ public extension AutoTokenizer { hubApi: HubApi = .shared ) async throws -> Tokenizer { let config = LanguageModelConfigurationFromHub(modelFolder: modelFolder, hubApi: hubApi) - guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } + guard let tokenizerConfig = try await config.tokenizerConfig else { + throw TokenizerError.missingConfig + } let tokenizerData = try await config.tokenizerData - return try PreTrainedTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + return try PreTrainedTokenizer( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData + ) } } @@ -648,11 +791,15 @@ class T5Tokenizer: UnigramTokenizer { } // MARK: - PreTrainedTokenizer classes -let sentencePieceUnderline = "▁" +// These need to be public to be visible from the wrapper factory + +public let sentencePieceUnderline = "▁" /// Hack for Llama tokenizers, see https://github.com/huggingface/transformers/blob/bcb841f0073fcd7a4fb88ea8064313c17dcab04a/src/transformers/models/llama/tokenization_llama_fast.py#L181 /// Return updated config, or nil -func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) throws -> Config? { +public func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) throws + -> Config? +{ // If it's already a Template processor (instead of a ByteLevel one), assume it's correct let postProcessor = PostProcessorFactory.fromConfig(config: processorConfig) guard !(postProcessor is TemplateProcessing) else { return nil } @@ -688,7 +835,9 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) pair = pair + [["SpecialToken": ["id": eosToken!, "type_id": 1]]] } - let postProcessorConfig = Config(["type": PostProcessorType.TemplateProcessing.rawValue, "single": single, "pair": pair]) + let postProcessorConfig = Config([ + "type": PostProcessorType.TemplateProcessing.rawValue, "single": single, "pair": pair, + ]) return postProcessorConfig } @@ -702,11 +851,42 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer { if !isLegacy { _ = configDictionary.removeValue(forKey: "normalizer") configDictionary["pre_tokenizer"] = [ - "type": "Metaspace", "replacement": .init(sentencePieceUnderline), "add_prefix_space": true, "prepend_scheme": "first", + "type": "Metaspace", "replacement": .init(sentencePieceUnderline), + "add_prefix_space": true, "prepend_scheme": "first", + ] + } + + if let postProcessorConfig = try maybeUpdatePostProcessor( + tokenizerConfig: tokenizerConfig, processorConfig: tokenizerData["postProcessor"] + ) { + configDictionary["post_processor"] = .init(postProcessorConfig.dictionary(or: [:])) + } + + let updatedData = Config(configDictionary) + try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData) + } +} + +#if canImport(Jinja) +/// Template-enabled tokenizer classes +/// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions +class LlamaPreTrainedTokenizerWithTemplates: PreTrainedTokenizer { + let isLegacy: Bool + + required init(tokenizerConfig: Config, tokenizerData: Config) throws { + isLegacy = tokenizerConfig.legacy.boolean(or: true) + var configDictionary = tokenizerData.dictionary(or: [:]) + if !isLegacy { + _ = configDictionary.removeValue(forKey: "normalizer") + configDictionary["pre_tokenizer"] = [ + "type": "Metaspace", "replacement": .init(sentencePieceUnderline), + "add_prefix_space": true, "prepend_scheme": "first", ] } - if let postProcessorConfig = try maybeUpdatePostProcessor(tokenizerConfig: tokenizerConfig, processorConfig: tokenizerData["postProcessor"]) { + if let postProcessorConfig = try maybeUpdatePostProcessor( + tokenizerConfig: tokenizerConfig, processorConfig: tokenizerData["postProcessor"] + ) { configDictionary["post_processor"] = .init(postProcessorConfig.dictionary(or: [:])) } @@ -714,3 +894,11 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer { try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData) } } + +struct PreTrainedTokenizerTemplateClasses { + /// Template-enabled class overrides + static let tokenizerClasses: [String: PreTrainedTokenizer.Type] = [ + "LlamaTokenizer": LlamaPreTrainedTokenizerWithTemplates.self, + ] +} +#endif diff --git a/Sources/TransformersCLI/TransformersCLI.swift b/Sources/TransformersCLI/TransformersCLI.swift new file mode 100644 index 00000000..dca8427c --- /dev/null +++ b/Sources/TransformersCLI/TransformersCLI.swift @@ -0,0 +1,97 @@ +import ArgumentParser +import CoreML +import Foundation + +import Generation +import Models + +@main +struct TransformersCLI: AsyncParsableCommand { + static let configuration = CommandConfiguration( + abstract: "Run text generation on a Core ML language model", + version: "0.0.1" + ) + + @Argument(help: "Input text") + var prompt: String + + @Argument(help: "Path to Core ML mlpackage model") + var modelPath: String = "./model.mlpackage" + + @Option(help: "Maximum amount of tokens the model should generate") + var maxLength: Int = 50 + + @Option(help: "Compute units to load model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine}") + var computeUnits: ComputeUnits = .cpuAndGPU + + enum ComputeUnits: String, ExpressibleByArgument, CaseIterable { + case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine + var asMLComputeUnits: MLComputeUnits { + switch self { + case .all: .all + case .cpuAndGPU: .cpuAndGPU + case .cpuOnly: .cpuOnly + case .cpuAndNeuralEngine: .cpuAndNeuralEngine + } + } + } + + func generate(model: LanguageModel, config: GenerationConfig, prompt: String, printOutput: Bool = true) async { + var tokensReceived = 0 + var previousIndex: String.Index? = nil + let begin = Date() + do { + try await model.generate(config: config, prompt: prompt) { inProgressGeneration in + tokensReceived += 1 + let response = inProgressGeneration.replacingOccurrences(of: "\\n", with: "\n") + if printOutput { + print(response[(previousIndex ?? response.startIndex)...], terminator: "") + fflush(stdout) + } + previousIndex = response.endIndex + } + let completionTime = Date().timeIntervalSince(begin) + let tps = Double(tokensReceived) / completionTime + if printOutput { + print("") + print("\(tps.formatted("%.2f")) tokens/s, total time: \(completionTime.formatted("%.2f"))s") + } + } catch { + print("Error \(error)") + } + } + + func compile(at url: URL) throws -> URL { + #if os(watchOS) + fatalError("Model compilation is not supported on watchOS") + #else + if url.pathExtension == "mlmodelc" { return url } + print("Compiling model \(url)") + return try MLModel.compileModel(at: url) + #endif + } + + func run() async throws { + let url = URL(filePath: modelPath) + let compiledURL = try compile(at: url) + print("Loading model \(compiledURL)") + let model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits) + + // Using greedy generation for now + var config = model.defaultGenerationConfig + config.doSample = false + config.maxNewTokens = maxLength + + print("Warming up...") + await generate(model: model, config: config, prompt: prompt, printOutput: false) + + print("Generating") + await generate(model: model, config: config, prompt: prompt) + } +} + +extension Double { + func formatted(_ format: String) -> String { + String(format: "\(format)", self) + } +} diff --git a/Sources/TransformersCLI/main.swift b/Sources/TransformersCLI/main.swift deleted file mode 100644 index af4a83ea..00000000 --- a/Sources/TransformersCLI/main.swift +++ /dev/null @@ -1,109 +0,0 @@ -import ArgumentParser -import CoreML -import Foundation - -import Generation -import Models - -@available(iOS 16.2, macOS 13.1, *) -struct TransformersCLI: ParsableCommand { - static let configuration = CommandConfiguration( - abstract: "Run text generation on a Core ML language model", - version: "0.0.1" - ) - - @Argument(help: "Input text") - var prompt: String - - @Argument(help: "Path to Core ML mlpackage model") - var modelPath: String = "./model.mlpackage" - - @Option(help: "Maximum amount of tokens the model should generate") - var maxLength: Int = 50 - - @Option(help: "Compute units to load model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine}") - var computeUnits: ComputeUnits = .cpuAndGPU - - func generate(model: LanguageModel, config: GenerationConfig, prompt: String, printOutput: Bool = true) { - let semaphore = DispatchSemaphore(value: 0) - Task.init { [config] in - defer { semaphore.signal() } - var tokensReceived = 0 - var previousIndex: String.Index? = nil - let begin = Date() - do { - try await model.generate(config: config, prompt: prompt) { inProgressGeneration in - tokensReceived += 1 - let response = inProgressGeneration.replacingOccurrences(of: "\\n", with: "\n") - if printOutput { - print(response[(previousIndex ?? response.startIndex)...], terminator: "") - fflush(stdout) - } - previousIndex = response.endIndex - } - let completionTime = Date().timeIntervalSince(begin) - let tps = Double(tokensReceived) / completionTime - if printOutput { - print("") - print("\(tps.formatted("%.2f")) tokens/s, total time: \(completionTime.formatted("%.2f"))s") - } - } catch { - print("Error \(error)") - } - } - semaphore.wait() - } - - func compile(at url: URL) throws -> URL { - #if os(watchOS) - fatalError("Model compilation is not supported on watchOS") - #else - if url.pathExtension == "mlmodelc" { return url } - print("Compiling model \(url)") - return try MLModel.compileModel(at: url) - #endif - } - - func run() throws { - let url = URL(filePath: modelPath) - let compiledURL = try compile(at: url) - print("Loading model \(compiledURL)") - let model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits) - - // Using greedy generation for now - var config = model.defaultGenerationConfig - config.doSample = false - config.maxNewTokens = maxLength - - print("Warming up...") - generate(model: model, config: config, prompt: prompt, printOutput: false) - - print("Generating") - generate(model: model, config: config, prompt: prompt) - } -} - -@available(iOS 16.2, macOS 13.1, *) -enum ComputeUnits: String, ExpressibleByArgument, CaseIterable { - case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine - var asMLComputeUnits: MLComputeUnits { - switch self { - case .all: .all - case .cpuAndGPU: .cpuAndGPU - case .cpuOnly: .cpuOnly - case .cpuAndNeuralEngine: .cpuAndNeuralEngine - } - } -} - -if #available(iOS 16.2, macOS 13.1, *) { - TransformersCLI.main() -} else { - print("Unsupported OS") -} - -extension Double { - func formatted(_ format: String) -> String { - String(format: "\(format)", self) - } -} diff --git a/Tests/NormalizerTests/NormalizerTests.swift b/Tests/NormalizerTests/NormalizerTests.swift index ca69198b..e617be29 100644 --- a/Tests/NormalizerTests/NormalizerTests.swift +++ b/Tests/NormalizerTests/NormalizerTests.swift @@ -120,7 +120,7 @@ class NormalizerTests: XCTestCase { } func testStripAccents() { - let testCases: [(String, String)] = [ + let testCases = [ ("département", "departement"), ] diff --git a/Tests/PostProcessorTests/PostProcessorTests.swift b/Tests/PostProcessorTests/PostProcessorTests.swift index e106ac68..6f48eaf2 100644 --- a/Tests/PostProcessorTests/PostProcessorTests.swift +++ b/Tests/PostProcessorTests/PostProcessorTests.swift @@ -1,3 +1,5 @@ +import XCTest + @testable import Hub @testable import Tokenizers import XCTest @@ -7,61 +9,75 @@ class PostProcessorTests: XCTestCase { let testCases: [(Config, [String], [String]?, [String])] = [ // Should keep spaces; uneven spaces; ignore `addPrefixSpace`. ( - Config(["cls": ["[HEAD]", 0 as UInt], - "sep": ["[END]", 0 as UInt], - "trimOffset": false, - "addPrefixSpace": true]), + Config([ + "cls": ["[HEAD]", 0 as UInt], + "sep": ["[END]", 0 as UInt], + "trimOffset": false, + "addPrefixSpace": true, + ]), [" The", " sun", "sets ", " in ", " the ", "west"], nil, ["[HEAD]", " The", " sun", "sets ", " in ", " the ", "west", "[END]"] ), // Should leave only one space around each token. ( - Config(["cls": ["[START]", 0 as UInt], - "sep": ["[BREAK]", 0 as UInt], - "trimOffset": true, - "addPrefixSpace": true]), + Config([ + "cls": ["[START]", 0 as UInt], + "sep": ["[BREAK]", 0 as UInt], + "trimOffset": true, + "addPrefixSpace": true, + ]), [" The ", " sun", "sets ", " in ", " the ", "west"], nil, ["[START]", " The ", " sun", "sets ", " in ", " the ", "west", "[BREAK]"] ), // Should ignore empty tokens pair. ( - Config(["cls": ["[START]", 0 as UInt], - "sep": ["[BREAK]", 0 as UInt], - "trimOffset": true, - "addPrefixSpace": true]), + Config([ + "cls": ["[START]", 0 as UInt], + "sep": ["[BREAK]", 0 as UInt], + "trimOffset": true, + "addPrefixSpace": true, + ]), [" The ", " sun", "sets ", " in ", " the ", "west"], [], ["[START]", " The ", " sun", "sets ", " in ", " the ", "west", "[BREAK]"] ), // Should trim all whitespace. ( - Config(["cls": ["[CLS]", 0 as UInt], - "sep": ["[SEP]", 0 as UInt], - "trimOffset": true, - "addPrefixSpace": false]), + Config([ + "cls": ["[CLS]", 0 as UInt], + "sep": ["[SEP]", 0 as UInt], + "trimOffset": true, + "addPrefixSpace": false, + ]), [" The ", " sun", "sets ", " in ", " the ", "west"], nil, ["[CLS]", "The", "sun", "sets", "in", "the", "west", "[SEP]"] ), // Should add tokens. ( - Config(["cls": ["[CLS]", 0 as UInt], - "sep": ["[SEP]", 0 as UInt], - "trimOffset": true, - "addPrefixSpace": true]), + Config([ + "cls": ["[CLS]", 0 as UInt], + "sep": ["[SEP]", 0 as UInt], + "trimOffset": true, + "addPrefixSpace": true, + ]), [" The ", " sun", "sets ", " in ", " the ", "west"], [".", "The", " cat ", " is ", " sitting ", " on", "the ", "mat"], - ["[CLS]", " The ", " sun", "sets ", " in ", " the ", "west", "[SEP]", - "[SEP]", ".", "The", " cat ", " is ", " sitting ", " on", "the ", - "mat", "[SEP]"] + [ + "[CLS]", " The ", " sun", "sets ", " in ", " the ", "west", "[SEP]", + "[SEP]", ".", "The", " cat ", " is ", " sitting ", " on", "the ", + "mat", "[SEP]", + ] ), ( - Config(["cls": ["[CLS]", 0 as UInt], - "sep": ["[SEP]", 0 as UInt], - "trimOffset": true, - "addPrefixSpace": true]), + Config([ + "cls": ["[CLS]", 0 as UInt], + "sep": ["[SEP]", 0 as UInt], + "trimOffset": true, + "addPrefixSpace": true, + ]), [" 你 ", " 好 ", ","], [" 凯 ", " 蒂 ", "!"], ["[CLS]", " 你 ", " 好 ", ",", "[SEP]", "[SEP]", " 凯 ", " 蒂 ", "!", "[SEP]"] diff --git a/Tests/PreTokenizerTests/PreTokenizerTests.swift b/Tests/PreTokenizerTests/PreTokenizerTests.swift index cc84be2e..6fb30181 100644 --- a/Tests/PreTokenizerTests/PreTokenizerTests.swift +++ b/Tests/PreTokenizerTests/PreTokenizerTests.swift @@ -8,6 +8,8 @@ import Hub @testable import Tokenizers import XCTest +@testable import Tokenizers + class PreTokenizerTests: XCTestCase { func testWhitespacePreTokenizer() { let preTokenizer = WhitespacePreTokenizer(config: Config([String: Config]())) @@ -118,7 +120,10 @@ class PreTokenizerTests: XCTestCase { ) XCTAssertEqual( preTokenizer1.preTokenize(text: " Hey, friend, what's up? "), - [" ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", " ", "up?", " ", " "] + [ + " ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", + " ", "up?", " ", " ", + ] ) let preTokenizer2 = SplitPreTokenizer(config: Config(["pattern": ["Regex": "\\s"]])) @@ -132,10 +137,19 @@ class PreTokenizerTests: XCTestCase { ) XCTAssertEqual( preTokenizer2.preTokenize(text: " Hey, friend, what's up? "), - [" ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", " ", "up?", " ", " "] + [ + " ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", + " ", "up?", " ", " ", + ] ) - let preTokenizer3 = SplitPreTokenizer(config: Config(["pattern": ["Regex": "(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"], "invert": true])) + let preTokenizer3 = SplitPreTokenizer( + config: Config([ + "pattern": [ + "Regex": + "(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + ], "invert": true, + ])) XCTAssertEqual( preTokenizer3.preTokenize(text: "Hello"), ["Hello"] @@ -154,17 +168,19 @@ class PreTokenizerTests: XCTestCase { /// https://github.com/huggingface/tokenizers/pull/1357 func testMetaspacePreTokenizer() { // Prepend "always" - let preTokenizer = MetaspacePreTokenizer(config: Config([ - "add_prefix_space": true, - "replacement": "▁", - "prepend_scheme": "always", - ])) + let preTokenizer = MetaspacePreTokenizer( + config: Config([ + "add_prefix_space": true, + "replacement": "▁", + "prepend_scheme": "always", + ])) // TODO: different sections on let text = "Hey my friend how▁are you" - let tokens = text - .split(by: "", includeSeparators: true) - .flatMap { preTokenizer.preTokenize(text: $0) } + let tokens = + text + .split(by: "", includeSeparators: true) + .flatMap { preTokenizer.preTokenize(text: $0) } XCTAssertEqual( tokens, diff --git a/Tests/TokenizersTests/BertTokenizerTests.swift b/Tests/TokenizersTests/BertTokenizerTests.swift index a98f5092..14b5467b 100644 --- a/Tests/TokenizersTests/BertTokenizerTests.swift +++ b/Tests/TokenizersTests/BertTokenizerTests.swift @@ -6,6 +6,8 @@ // Copyright © 2019 Hugging Face. All rights reserved. // +import XCTest + @testable import Hub @testable import Tokenizers import XCTest @@ -49,7 +51,9 @@ class BertTokenizerTests: XCTestCase { /// For each Squad question tokenized by python, check that we get the same output through the `BasicTokenizer` func testFullBasicTokenizer() { - let url = Bundle.module.url(forResource: "basic_tokenized_questions", withExtension: "json")! + let url = Bundle.module.url( + forResource: "basic_tokenized_questions", withExtension: "json" + )! let json = try! Data(contentsOf: url) let decoder = JSONDecoder() let sampleTokens = try! decoder.decode([[String]].self, from: json) @@ -102,7 +106,9 @@ class BertTokenizerTests: XCTestCase { func testChineseWithNumeralsTokenization() { let tokenizer = bertTokenizer let text = "2020年奥运会在东京举行。" - let expectedTokens = ["2020", "年", "[UNK]", "[UNK]", "会", "[UNK]", "[UNK]", "京", "[UNK]", "行", "。"] + let expectedTokens = [ + "2020", "年", "[UNK]", "[UNK]", "会", "[UNK]", "[UNK]", "京", "[UNK]", "行", "。", + ] let tokens = tokenizer.tokenize(text: text) XCTAssertEqual(tokens, expectedTokens) @@ -111,7 +117,9 @@ class BertTokenizerTests: XCTestCase { func testChineseWithSpecialTokens() { let tokenizer = bertTokenizer let text = "[CLS] 机器学习是未来。 [SEP]" - let expectedTokens = ["[CLS]", "[UNK]", "[UNK]", "学", "[UNK]", "[UNK]", "[UNK]", "[UNK]", "。", "[SEP]"] + let expectedTokens = [ + "[CLS]", "[UNK]", "[UNK]", "学", "[UNK]", "[UNK]", "[UNK]", "[UNK]", "。", "[SEP]", + ] let tokens = tokenizer.tokenize(text: text) XCTAssertEqual(tokens, expectedTokens) @@ -141,7 +149,10 @@ class BertTokenizerTests: XCTestCase { let tokenizer = bertTokenizer for question in questionTokens { - XCTAssertEqual(question.basic.joined(separator: " "), tokenizer.convertWordpieceToBasicTokenList(question.wordpiece)) + XCTAssertEqual( + question.basic.joined(separator: " "), + tokenizer.convertWordpieceToBasicTokenList(question.wordpiece) + ) } } @@ -178,10 +189,15 @@ class BertTokenizerTests: XCTestCase { } func testBertTokenizerAddedTokensRecognized() async throws { - let base: URL = FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first!.appending(component: "huggingface-tests") + let base: URL = FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first! + .appending(component: "huggingface-tests") let hubApi = HubApi(downloadBase: base) - let configuration = LanguageModelConfigurationFromHub(modelName: "google-bert/bert-base-uncased", hubApi: hubApi) - guard let tokenizerConfig = try await configuration.tokenizerConfig else { fatalError("missing tokenizer config") } + let configuration = LanguageModelConfigurationFromHub( + modelName: "google-bert/bert-base-uncased", hubApi: hubApi + ) + guard let tokenizerConfig = try await configuration.tokenizerConfig else { + fatalError("missing tokenizer config") + } let tokenizerData = try await configuration.tokenizerData let addedTokens = [ "[ROAD]": 60_001, @@ -192,7 +208,9 @@ class BertTokenizerTests: XCTestCase { "[INTERSECT]": 60_006, "[UNION]": 60_007, ] - let tokenizer = try BertTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens) + let tokenizer = try BertTokenizer( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens + ) for (token, idx) in addedTokens { XCTAssertEqual(tokenizer.convertTokenToId(token), idx) } diff --git a/Tests/TokenizersTests/DecoderTests.swift b/Tests/TokenizersTests/DecoderTests.swift index dbac6d4d..512c5938 100644 --- a/Tests/TokenizersTests/DecoderTests.swift +++ b/Tests/TokenizersTests/DecoderTests.swift @@ -8,13 +8,16 @@ import Hub @testable import Tokenizers import XCTest +@testable import Tokenizers + class DecoderTests: XCTestCase { /// https://github.com/huggingface/tokenizers/pull/1357 func testMetaspaceDecoder() { - let decoder = MetaspaceDecoder(config: Config([ - "add_prefix_space": true, - "replacement": "▁", - ])) + let decoder = MetaspaceDecoder( + config: Config([ + "add_prefix_space": true, + "replacement": "▁", + ])) let tokens = ["▁Hey", "▁my", "▁friend", "▁", "▁", "▁how", "▁are", "▁you"] let decoded = decoder.decode(tokens: tokens) @@ -34,7 +37,10 @@ class DecoderTests: XCTestCase { (["##auto", "##mat", "##ic", "transmission"], "##automatic transmission"), (["who", "do", "##n't", "does", "n't", "can't"], "who don't doesn't can't"), (["##un", "##believ", "##able", "##fa", "##ntastic"], "##unbelievablefantastic"), - (["this", "is", "un", "##believ", "##able", "fa", "##ntastic"], "this is unbelievable fantastic"), + ( + ["this", "is", "un", "##believ", "##able", "fa", "##ntastic"], + "this is unbelievable fantastic" + ), (["The", "##quick", "##brown", "fox"], "Thequickbrown fox"), ] diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index 93cf3ae1..67a5627f 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -7,6 +7,8 @@ // import Hub +import XCTest + @testable import Models @testable import Tokenizers import XCTest @@ -97,7 +99,10 @@ class GemmaTokenizerTests: TokenizerTests { class GemmaUnicodeTests: XCTestCase { func testGemmaVocab() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") + as? PreTrainedTokenizer + else { XCTFail() return } @@ -109,21 +114,32 @@ class GemmaUnicodeTests: XCTestCase { class PhiSimpleTests: XCTestCase { func testPhi4() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/phi-4") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/phi-4") + as? PreTrainedTokenizer + else { XCTFail() return } XCTAssertEqual(tokenizer.encode(text: "hello"), [15339]) XCTAssertEqual(tokenizer.encode(text: "hello world"), [15339, 1917]) - XCTAssertEqual(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>"), [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266]) + XCTAssertEqual( + tokenizer.encode( + text: + "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>" + ), [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266] + ) } } class LlamaPostProcessorOverrideTests: XCTestCase { /// Deepseek needs a post-processor override to add a bos token as in the reference implementation func testDeepSeek() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from( + pretrained: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B") as? PreTrainedTokenizer + else { XCTFail() return } @@ -132,7 +148,10 @@ class LlamaPostProcessorOverrideTests: XCTestCase { /// Some Llama tokenizers already use a bos-prepending Template post-processor func testLlama() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "coreml-projects/Llama-2-7b-chat-coreml") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from( + pretrained: "coreml-projects/Llama-2-7b-chat-coreml") as? PreTrainedTokenizer + else { XCTFail() return } @@ -150,7 +169,8 @@ class LocalFromPretrainedTests: XCTestCase { let hubApi = HubApi(downloadBase: downloadDestination) let downloadedTo = try await hubApi.snapshot(from: "pcuenq/gemma-tokenizer") - let tokenizer = try await AutoTokenizer.from(modelFolder: downloadedTo) as? PreTrainedTokenizer + let tokenizer = + try await AutoTokenizer.from(modelFolder: downloadedTo) as? PreTrainedTokenizer XCTAssertNotNil(tokenizer) try FileManager.default.removeItem(at: downloadDestination) @@ -159,7 +179,10 @@ class LocalFromPretrainedTests: XCTestCase { class BertDiacriticsTests: XCTestCase { func testBertCased() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "distilbert/distilbert-base-multilingual-cased") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from( + pretrained: "distilbert/distilbert-base-multilingual-cased") as? PreTrainedTokenizer + else { XCTFail() return } @@ -169,7 +192,11 @@ class BertDiacriticsTests: XCTestCase { } func testBertCasedResaved() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/distilbert-base-multilingual-cased-tokenizer") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from( + pretrained: "pcuenq/distilbert-base-multilingual-cased-tokenizer") + as? PreTrainedTokenizer + else { XCTFail() return } @@ -178,7 +205,10 @@ class BertDiacriticsTests: XCTestCase { } func testBertUncased() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from( + pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer + else { XCTFail() return } @@ -190,13 +220,22 @@ class BertDiacriticsTests: XCTestCase { XCTAssertEqual(tokenizer.tokenize(text: "Car"), ["car"]) XCTAssertEqual(tokenizer.tokenize(text: "€4"), ["€", "##4"]) - XCTAssertEqual(tokenizer.tokenize(text: "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test"), ["test", "$", "1", "r", "##2", "#", "3", "€", "##4", "£5", "¥", "##6", "[UNK]", "₹", "##8", "₱", "##9", "test"]) + XCTAssertEqual( + tokenizer.tokenize(text: "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test"), + [ + "test", "$", "1", "r", "##2", "#", "3", "€", "##4", "£5", "¥", "##6", "[UNK]", "₹", + "##8", "₱", "##9", "test", + ] + ) } } class BertSpacesTests: XCTestCase { func testEncodeDecode() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from( + pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer + else { XCTFail() return } @@ -214,14 +253,20 @@ class BertSpacesTests: XCTestCase { class RobertaTests: XCTestCase { func testEncodeDecode() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "FacebookAI/roberta-base") as? PreTrainedTokenizer else { + guard + let tokenizer = try await AutoTokenizer.from(pretrained: "FacebookAI/roberta-base") + as? PreTrainedTokenizer + else { XCTFail() return } XCTAssertEqual(tokenizer.tokenize(text: "l'eure"), ["l", "'", "e", "ure"]) XCTAssertEqual(tokenizer.encode(text: "l'eure"), [0, 462, 108, 242, 2407, 2]) - XCTAssertEqual(tokenizer.decode(tokens: tokenizer.encode(text: "l'eure"), skipSpecialTokens: true), "l'eure") + XCTAssertEqual( + tokenizer.decode(tokens: tokenizer.encode(text: "l'eure"), skipSpecialTokens: true), + "l'eure" + ) XCTAssertEqual(tokenizer.tokenize(text: "mąka"), ["m", "Ä", "ħ", "ka"]) XCTAssertEqual(tokenizer.encode(text: "mąka"), [0, 119, 649, 5782, 2348, 2]) @@ -232,11 +277,18 @@ class RobertaTests: XCTestCase { XCTAssertEqual(tokenizer.tokenize(text: "Who are you?"), ["Who", "Ġare", "Ġyou", "?"]) XCTAssertEqual(tokenizer.encode(text: "Who are you?"), [0, 12375, 32, 47, 116, 2]) - XCTAssertEqual(tokenizer.tokenize(text: " Who are you? "), ["ĠWho", "Ġare", "Ġyou", "?", "Ġ"]) + XCTAssertEqual( + tokenizer.tokenize(text: " Who are you? "), ["ĠWho", "Ġare", "Ġyou", "?", "Ġ"] + ) XCTAssertEqual(tokenizer.encode(text: " Who are you? "), [0, 3394, 32, 47, 116, 1437, 2]) - XCTAssertEqual(tokenizer.tokenize(text: "Who are you?"), ["", "Who", "Ġare", "Ġyou", "?", ""]) - XCTAssertEqual(tokenizer.encode(text: "Who are you?"), [0, 0, 12375, 32, 47, 116, 2, 2]) + XCTAssertEqual( + tokenizer.tokenize(text: "Who are you?"), + ["", "Who", "Ġare", "Ġyou", "?", ""] + ) + XCTAssertEqual( + tokenizer.encode(text: "Who are you?"), [0, 0, 12375, 32, 47, 116, 2, 2] + ) } } @@ -304,7 +356,9 @@ class TokenizerTester { throw TokenizerError.tokenizerConfigNotFound } let tokenizerData = try await configuration!.tokenizerData - _tokenizer = try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + _tokenizer = try AutoTokenizer.from( + tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData + ) } catch { XCTFail("Cannot load tokenizer: \(error)") } @@ -388,7 +442,7 @@ class TokenizerTester { class TokenizerTests: XCTestCase { /// Parallel testing in Xcode (when enabled) uses different processes, so this shouldn't be a problem - static var _tester: TokenizerTester? + nonisolated(unsafe) static var _tester: TokenizerTester? class var hubModelName: String? { nil } class var encodedSamplesFilename: String? { nil } @@ -396,7 +450,7 @@ class TokenizerTests: XCTestCase { /// Known id retrieved from Python, to verify it was parsed correctly class var unknownTokenId: Int? { nil } - static var downloadDestination: URL = { + nonisolated(unsafe) static var downloadDestination: URL = { let base = FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first! return base.appending(component: "huggingface-tests") }()