1313#include "unicode.h"
1414#include "smb_common.h"
1515
16- /*
17- * smb_utf16_bytes() - how long will a string be after conversion?
18- * @from: pointer to input string
19- * @maxbytes: don't go past this many bytes of input string
20- * @codepage: destination codepage
21- *
22- * Walk a utf16le string and return the number of bytes that the string will
23- * be after being converted to the given charset, not including any null
24- * termination required. Don't walk past maxbytes in the source buffer.
25- *
26- * Return: string length after conversion
27- */
28- static int smb_utf16_bytes (const __le16 * from , int maxbytes ,
29- const struct nls_table * codepage )
30- {
31- int i ;
32- int charlen , outlen = 0 ;
33- int maxwords = maxbytes / 2 ;
34- char tmp [NLS_MAX_CHARSET_SIZE ];
35- __u16 ftmp ;
36-
37- for (i = 0 ; i < maxwords ; i ++ ) {
38- ftmp = get_unaligned_le16 (& from [i ]);
39- if (ftmp == 0 )
40- break ;
41-
42- charlen = codepage -> uni2char (ftmp , tmp , NLS_MAX_CHARSET_SIZE );
43- if (charlen > 0 )
44- outlen += charlen ;
45- else
46- outlen ++ ;
47- }
48-
49- return outlen ;
50- }
51-
5216/*
5317 * cifs_mapchar() - convert a host-endian char to proper char in codepage
5418 * @target: where converted character should be copied
55- * @src_char: 2 byte host-endian source character
19+ * @from: host-endian source string
5620 * @cp: codepage to which character should be converted
5721 * @mapchar: should character be mapped according to mapchars mount option?
5822 *
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
6327 * Return: string length after conversion
6428 */
6529static int
66- cifs_mapchar (char * target , const __u16 src_char , const struct nls_table * cp ,
30+ cifs_mapchar (char * target , const __u16 * from , const struct nls_table * cp ,
6731 bool mapchar )
6832{
6933 int len = 1 ;
34+ __u16 src_char ;
35+
36+ src_char = * from ;
7037
7138 if (!mapchar )
7239 goto cp_convert ;
@@ -104,12 +71,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
10471
10572cp_convert :
10673 len = cp -> uni2char (src_char , target , NLS_MAX_CHARSET_SIZE );
107- if (len <= 0 ) {
108- * target = '?' ;
109- len = 1 ;
110- }
74+ if (len <= 0 )
75+ goto surrogate_pair ;
11176
11277 goto out ;
78+
79+ surrogate_pair :
80+ /* convert SURROGATE_PAIR and IVS */
81+ if (strcmp (cp -> charset , "utf8" ))
82+ goto unknown ;
83+ len = utf16s_to_utf8s (from , 3 , UTF16_LITTLE_ENDIAN , target , 6 );
84+ if (len <= 0 )
85+ goto unknown ;
86+ return len ;
87+
88+ unknown :
89+ * target = '?' ;
90+ len = 1 ;
91+ goto out ;
92+ }
93+
94+ /*
95+ * smb_utf16_bytes() - compute converted string length
96+ * @from: pointer to input string
97+ * @maxbytes: input string length
98+ * @codepage: destination codepage
99+ *
100+ * Walk a utf16le string and return the number of bytes that the string will
101+ * be after being converted to the given charset, not including any null
102+ * termination required. Don't walk past maxbytes in the source buffer.
103+ *
104+ * Return: string length after conversion
105+ */
106+ static int smb_utf16_bytes (const __le16 * from , int maxbytes ,
107+ const struct nls_table * codepage )
108+ {
109+ int i , j ;
110+ int charlen , outlen = 0 ;
111+ int maxwords = maxbytes / 2 ;
112+ char tmp [NLS_MAX_CHARSET_SIZE ];
113+ __u16 ftmp [3 ];
114+
115+ for (i = 0 ; i < maxwords ; i ++ ) {
116+ ftmp [0 ] = get_unaligned_le16 (& from [i ]);
117+ if (ftmp [0 ] == 0 )
118+ break ;
119+ for (j = 1 ; j <= 2 ; j ++ ) {
120+ if (i + j < maxwords )
121+ ftmp [j ] = get_unaligned_le16 (& from [i + j ]);
122+ else
123+ ftmp [j ] = 0 ;
124+ }
125+
126+ charlen = cifs_mapchar (tmp , ftmp , codepage , 0 );
127+ if (charlen > 0 )
128+ outlen += charlen ;
129+ else
130+ outlen ++ ;
131+ }
132+
133+ return outlen ;
113134}
114135
115136/*
@@ -139,12 +160,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
139160static int smb_from_utf16 (char * to , const __le16 * from , int tolen , int fromlen ,
140161 const struct nls_table * codepage , bool mapchar )
141162{
142- int i , charlen , safelen ;
163+ int i , j , charlen , safelen ;
143164 int outlen = 0 ;
144165 int nullsize = nls_nullsize (codepage );
145166 int fromwords = fromlen / 2 ;
146167 char tmp [NLS_MAX_CHARSET_SIZE ];
147- __u16 ftmp ;
168+ __u16 ftmp [ 3 ]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
148169
149170 /*
150171 * because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
155176 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize );
156177
157178 for (i = 0 ; i < fromwords ; i ++ ) {
158- ftmp = get_unaligned_le16 (& from [i ]);
159- if (ftmp == 0 )
179+ ftmp [ 0 ] = get_unaligned_le16 (& from [i ]);
180+ if (ftmp [ 0 ] == 0 )
160181 break ;
182+ for (j = 1 ; j <= 2 ; j ++ ) {
183+ if (i + j < fromwords )
184+ ftmp [j ] = get_unaligned_le16 (& from [i + j ]);
185+ else
186+ ftmp [j ] = 0 ;
187+ }
161188
162189 /*
163190 * check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
172199 /* put converted char into 'to' buffer */
173200 charlen = cifs_mapchar (& to [outlen ], ftmp , codepage , mapchar );
174201 outlen += charlen ;
202+
203+ /*
204+ * charlen (=bytes of UTF-8 for 1 character)
205+ * 4bytes UTF-8(surrogate pair) is charlen=4
206+ * (4bytes UTF-16 code)
207+ * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
208+ * (2 UTF-8 pairs divided to 2 UTF-16 pairs)
209+ */
210+ if (charlen == 4 )
211+ i ++ ;
212+ else if (charlen >= 5 )
213+ /* 5-6bytes UTF-8 */
214+ i += 2 ;
175215 }
176216
177217 /* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
306346 char src_char ;
307347 __le16 dst_char ;
308348 wchar_t tmp ;
349+ wchar_t wchar_to [6 ]; /* UTF-16 */
350+ int ret ;
351+ unicode_t u ;
309352
310353 if (!mapchars )
311354 return smb_strtoUTF16 (target , source , srclen , cp );
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
348391 * if no match, use question mark, which at least in
349392 * some cases serves as wild card
350393 */
351- if (charlen < 1 ) {
352- dst_char = cpu_to_le16 (0x003f );
353- charlen = 1 ;
394+ if (charlen > 0 )
395+ goto ctoUTF16 ;
396+
397+ /* convert SURROGATE_PAIR */
398+ if (strcmp (cp -> charset , "utf8" ))
399+ goto unknown ;
400+ if (* (source + i ) & 0x80 ) {
401+ charlen = utf8_to_utf32 (source + i , 6 , & u );
402+ if (charlen < 0 )
403+ goto unknown ;
404+ } else
405+ goto unknown ;
406+ ret = utf8s_to_utf16s (source + i , charlen ,
407+ UTF16_LITTLE_ENDIAN ,
408+ wchar_to , 6 );
409+ if (ret < 0 )
410+ goto unknown ;
411+
412+ i += charlen ;
413+ dst_char = cpu_to_le16 (* wchar_to );
414+ if (charlen <= 3 )
415+ /* 1-3bytes UTF-8 to 2bytes UTF-16 */
416+ put_unaligned (dst_char , & target [j ]);
417+ else if (charlen == 4 ) {
418+ /*
419+ * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
420+ * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
421+ * (charlen=3+4 or 4+4)
422+ */
423+ put_unaligned (dst_char , & target [j ]);
424+ dst_char = cpu_to_le16 (* (wchar_to + 1 ));
425+ j ++ ;
426+ put_unaligned (dst_char , & target [j ]);
427+ } else if (charlen >= 5 ) {
428+ /* 5-6bytes UTF-8 to 6bytes UTF-16 */
429+ put_unaligned (dst_char , & target [j ]);
430+ dst_char = cpu_to_le16 (* (wchar_to + 1 ));
431+ j ++ ;
432+ put_unaligned (dst_char , & target [j ]);
433+ dst_char = cpu_to_le16 (* (wchar_to + 2 ));
434+ j ++ ;
435+ put_unaligned (dst_char , & target [j ]);
354436 }
437+ continue ;
438+
439+ unknown :
440+ dst_char = cpu_to_le16 (0x003f );
441+ charlen = 1 ;
355442 }
443+
444+ ctoUTF16 :
356445 /*
357446 * character may take more than one byte in the source string,
358447 * but will take exactly two bytes in the target string
0 commit comments