@@ -64,12 +64,16 @@ public sealed class Arguments : TransformInputBase
6464 public const string LoaderSignature = "CharToken" ;
6565 public const string UserName = "Character Tokenizer Transform" ;
6666
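67+ // True when the model must keep the behavior it was saved with under ver:0x00010001, i.e. vector inputs use the original <ETX><STX> getter instead of the UnitSeparator <US> getter.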
68+ private readonly bool _isSeparatorStartEnd ;
69+
// Builds the VersionInfo for this transform's serialized model (signature "CHARTOKN").
// Version 0x00010002 adds one bool byte after the _useMarkerChars byte: Save writes
// _isSeparatorStartEnd, and the loading constructor reads it only when
// ctx.Header.ModelVerReadable is not below 0x00010002.
// NOTE(review): verWeCanReadBack stays at 0x00010001, presumably so models written with the
// initial version remain loadable - confirm against the VersionInfo contract.
6770 private static VersionInfo GetVersionInfo ( )
6871 {
6972 return new VersionInfo (
7073 modelSignature : "CHARTOKN" ,
71- verWrittenCur : 0x00010001 , // Initial
72- verReadableCur : 0x00010001 ,
74+ // Previous value, kept for history: verWrittenCur: 0x00010001 (Initial)
75+ verWrittenCur : 0x00010002 , // Updated to use UnitSeparator <US> character instead of using <ETX><STX> for vector inputs.
76+ verReadableCur : 0x00010002 ,
7377 verWeCanReadBack : 0x00010001 ,
7478 loaderSignature : LoaderSignature ) ;
7579 }
@@ -84,6 +88,7 @@ private static VersionInfo GetVersionInfo()
8488 private volatile string _keyValuesStr ;
8589 private volatile int [ ] _keyValuesBoundaries ;
8690
91+ private const ushort UnitSeparator = 0x1f ;
8792 private const ushort TextStartMarker = 0x02 ;
8893 private const ushort TextEndMarker = 0x03 ;
8994 private const int TextMarkersCount = 2 ;
@@ -120,6 +125,8 @@ private CharTokenizeTransform(IHost host, ModelLoadContext ctx, IDataView input)
120125 // byte: _useMarkerChars value.
121126 _useMarkerChars = ctx . Reader . ReadBoolByte ( ) ;
122127
128+ _isSeparatorStartEnd = ctx . Header . ModelVerReadable < 0x00010002 || ctx . Reader . ReadBoolByte ( ) ;
129+
123130 _type = GetOutputColumnType ( ) ;
124131 SetMetadata ( ) ;
125132 }
@@ -145,6 +152,7 @@ public override void Save(ModelSaveContext ctx)
145152 // byte: _useMarkerChars value.
146153 SaveBase ( ctx ) ;
147154 ctx . Writer . WriteBoolByte ( _useMarkerChars ) ;
155+ ctx . Writer . WriteBoolByte ( _isSeparatorStartEnd ) ;
148156 }
149157
150158 protected override ColumnType GetColumnTypeCore ( int iinfo )
@@ -399,8 +407,8 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)
399407
400408 var getSrc = GetSrcGetter < VBuffer < DvText > > ( input , iinfo ) ;
401409 var src = default ( VBuffer < DvText > ) ;
402- return
403- ( ref VBuffer < ushort > dst ) =>
410+
411+ ValueGetter < VBuffer < ushort > > getterWithStartEndSep = ( ref VBuffer < ushort > dst ) =>
404412 {
405413 getSrc ( ref src ) ;
406414
@@ -438,6 +446,67 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)
438446
439447 dst = new VBuffer < ushort > ( len , values , dst . Indices ) ;
440448 } ;
449+
// Getter returned when _isSeparatorStartEnd is false. It flattens the source text vector into a
// single ushort sequence: optional TextStartMarker, then the characters of every non-empty item
// (with UnitSeparator written before each non-empty item whose index is > 0), then optional
// TextEndMarker. The result is stored in dst.
450+ ValueGetter < VBuffer < ushort > > getterWithUnitSep = ( ref VBuffer < ushort > dst ) =>
451+ {
452+ getSrc ( ref src ) ;
453+
// First pass: compute the exact output length so the buffer can be sized before writing.
454+ int len = 0 ;
455+
456+ for ( int i = 0 ; i < src . Count ; i ++ )
457+ {
458+ if ( src . Values [ i ] . HasChars )
459+ {
460+ len += src . Values [ i ] . Length ;
461+
462+ if ( i > 0 )
463+ len += 1 ; // one extra slot for the UnitSeparator written before this item in the second pass
464+ }
465+ }
466+
467+ if ( _useMarkerChars )
468+ len += TextMarkersCount ;
469+
// Reuse the array already held by dst when it is large enough; allocate only when it is too small.
470+ var values = dst . Values ;
471+ if ( len > 0 )
472+ {
473+ if ( Utils . Size ( values ) < len )
474+ values = new ushort [ len ] ;
475+
476+ int index = 0 ;
477+
478+ // VBuffer<DvText> can be a result of either concatenating text columns together
479+ // or application of word tokenizer before char tokenizer in TextTransform.
480+ //
481+ // Considering VBuffer<DvText> as a single text stream.
482+ // Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector.
483+ // Insert UnitSeparator before every non-empty piece of text whose index in the vector is greater than 0.
// NOTE(review): the separator test is on the item index, not on whether an earlier item was
// actually emitted, so an empty item at index 0 followed by a non-empty one yields a leading
// UnitSeparator - confirm this is intended.
484+ if ( _useMarkerChars )
485+ values [ index ++ ] = TextStartMarker ;
486+
// Second pass: must mirror the length computation above exactly (checked by the assert below).
487+ for ( int i = 0 ; i < src . Count ; i ++ )
488+ {
489+ if ( ! src . Values [ i ] . HasChars )
490+ continue ;
491+
492+ if ( i > 0 )
493+ values [ index ++ ] = UnitSeparator ;
494+
495+ for ( int ich = 0 ; ich < src . Values [ i ] . Length ; ich ++ )
496+ {
497+ values [ index ++ ] = src . Values [ i ] [ ich ] ;
498+ }
499+ }
500+
501+ if ( _useMarkerChars )
502+ values [ index ++ ] = TextEndMarker ;
503+
504+ Contracts . Assert ( index == len ) ;
505+ }
506+
// When len is 0 nothing was written; dst is rebuilt with length 0 around its existing arrays.
507+ dst = new VBuffer < ushort > ( len , values , dst . Indices ) ;
508+ } ;
509+ return _isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep ;
441510 }
442511 }
443512}
0 commit comments