@@ -120,22 +120,21 @@ internal static (Dictionary<byte[], int>, Dictionary<string, int>, IReadOnlyDict
120120 {
121121 while ( ! reader . EndOfStream )
122122 {
123- string line = reader . ReadLine ( ) ;
123+ string ? line = reader . ReadLine ( ) ;
124124 if ( string . IsNullOrWhiteSpace ( line ) )
125125 {
126126 continue ;
127127 }
128128
129- var tokens = line . Split ( ' ' ) ;
130- if ( tokens . Length != 2 )
129+ int spaceIndex = line . IndexOf ( ' ' ) ;
130+ if ( spaceIndex <= 0 || spaceIndex >= line . Length - 1 || line . IndexOf ( ' ' , spaceIndex + 1 ) >= 0 )
131131 {
132132 throw new FormatException ( $ "Invalid format in the BPE encoder file stream") ;
133133 }
134134
135- byte [ ] tokenBytes = Convert . FromBase64String ( tokens [ 0 ] ) ;
136- int rank = 0 ;
135+ byte [ ] tokenBytes = Helpers . FromBase64String ( line , 0 , spaceIndex ) ;
137136
138- if ( int . TryParse ( tokens [ 1 ] , out rank ) )
137+ if ( Helpers . TryParseInt32 ( line , spaceIndex + 1 , out int rank ) )
139138 {
140139 encoder [ tokenBytes ] = rank ;
141140 decoder [ rank ] = tokenBytes ;
@@ -146,7 +145,7 @@ internal static (Dictionary<byte[], int>, Dictionary<string, int>, IReadOnlyDict
146145 }
147146 else
148147 {
149- throw new FormatException ( $ "Can't parse { tokens [ 1 ] } to integer") ;
148+ throw new FormatException ( $ "Can't parse { line . Substring ( spaceIndex ) } to integer") ;
150149 }
151150 }
152151 }
@@ -242,7 +241,7 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
242241 /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
243242 /// <param name="accumulatedIds">The list of accumulated Ids.</param>
244243 /// <returns>True if the operation succeeded, false otherwise.</returns>
245- public override bool TokenizeToIds ( string sequence , bool isSpecialToken , List < int > accumulatedIds )
244+ public override bool TokenizeToIds ( string sequence , bool isSpecialToken , IList < int > accumulatedIds )
246245 {
247246 if ( string . IsNullOrEmpty ( sequence ) )
248247 {
@@ -320,7 +319,7 @@ public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<in
320319 }
321320
322321 int [ ] idsToCache = BytePairEncoder . BytePairEncode ( Encoding . UTF8 . GetBytes ( token ) , _encoder ) ;
323- _cache . Add ( token , idsToCache . ToArray ( ) ) ;
322+ _cache . Add ( token , idsToCache ) ;
324323
325324 if ( idsToCache . Length == 1 )
326325 {
@@ -338,12 +337,12 @@ public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<in
338337 /// <returns>The mapped token of the Id.</returns>
339338 public override string ? IdToToken ( int id , bool skipSpecialTokens = false )
340339 {
341- if ( ! skipSpecialTokens && _specialTokensDecoder is not null && _specialTokensDecoder . TryGetValue ( id , out string token ) )
340+ if ( ! skipSpecialTokens && _specialTokensDecoder is not null && _specialTokensDecoder . TryGetValue ( id , out string ? token ) )
342341 {
343342 return token ;
344343 }
345344
346- if ( _decoder . TryGetValue ( id , out byte [ ] tokenBytes ) )
345+ if ( _decoder . TryGetValue ( id , out byte [ ] ? tokenBytes ) )
347346 {
348347 return Encoding . UTF8 . GetString ( tokenBytes ) ;
349348 }
@@ -363,11 +362,11 @@ public override bool TokenizeToIds(string sequence, bool isSpecialToken, List<in
363362
364363 foreach ( int id in ids )
365364 {
366- if ( _decoder . TryGetValue ( id , out byte [ ] tokenBytes ) )
365+ if ( _decoder . TryGetValue ( id , out byte [ ] ? tokenBytes ) )
367366 {
368367 utf8Bytes . AddRange ( tokenBytes ) ;
369368 }
370- else if ( useSpecialTokens && _specialTokensDecoder ! . TryGetValue ( id , out string token ) )
369+ else if ( useSpecialTokens && _specialTokensDecoder ! . TryGetValue ( id , out string ? token ) )
371370 {
372371 utf8Bytes . AddRange ( Encoding . UTF8 . GetBytes ( token ) ) ;
373372 }
0 commit comments