1414#include "uniupr.h"
1515#include "smb_common.h"
1616
17- /*
18- * smb_utf16_bytes() - compute the converted length of a UTF-16LE string
19- * @from: pointer to the little-endian UTF-16 input string
20- * @maxbytes: upper bound, in bytes, on how much of @from may be read
21- * @codepage: destination codepage; its uni2char() performs the conversion
22- *
23- * Walk at most maxbytes / 2 16-bit units of @from, stopping at the first zero
24- * unit, and add up the byte count uni2char() reports for each unit. A unit
25- * for which uni2char() returns <= 0 is counted as a single byte.
26- *
27- * Return: length in bytes after conversion, excluding any null terminator
28- */
29- static int smb_utf16_bytes (const __le16 * from , int maxbytes ,
30- const struct nls_table * codepage )
31- {
32- int i ;
33- int charlen , outlen = 0 ;
34- int maxwords = maxbytes / 2 ; /* input bound expressed in 16-bit units */
35- char tmp [NLS_MAX_CHARSET_SIZE ]; /* scratch output; only the length is kept */
36- __u16 ftmp ;
37-
38- for (i = 0 ; i < maxwords ; i ++ ) {
39- ftmp = get_unaligned_le16 (& from [i ]);
40- if (ftmp == 0 ) /* zero unit terminates the string */
41- break ;
42-
43- charlen = codepage -> uni2char (ftmp , tmp , NLS_MAX_CHARSET_SIZE );
44- if (charlen > 0 )
45- outlen += charlen ;
46- else
47- outlen ++ ; /* conversion failed: count one byte for this unit */
48- }
49-
50- return outlen ;
51- }
52-
5317/*
5418 * cifs_mapchar() - convert a host-endian char to proper char in codepage
5519 * @target: where converted character should be copied
56- * @src_char: 2 byte host-endian source character
20+ * @from: host-endian source string
5721 * @cp: codepage to which character should be converted
5822 * @mapchar: should character be mapped according to mapchars mount option?
5923 *
@@ -64,10 +28,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
6428 * Return: string length after conversion
6529 */
6630static int
67- cifs_mapchar (char * target , const __u16 src_char , const struct nls_table * cp ,
31+ cifs_mapchar (char * target , const __u16 * from , const struct nls_table * cp ,
6832 bool mapchar )
6933{
7034 int len = 1 ;
35+ __u16 src_char ;
36+
37+ src_char = * from ;
7138
7239 if (!mapchar )
7340 goto cp_convert ;
@@ -105,12 +72,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
10572
10673cp_convert :
10774 len = cp -> uni2char (src_char , target , NLS_MAX_CHARSET_SIZE );
108- if (len <= 0 ) {
109- * target = '?' ;
110- len = 1 ;
111- }
75+ if (len <= 0 )
76+ goto surrogate_pair ;
11277
11378 goto out ;
79+
80+ surrogate_pair :
81+ /* convert SURROGATE_PAIR and IVS */
82+ if (strcmp (cp -> charset , "utf8" ))
83+ goto unknown ;
84+ len = utf16s_to_utf8s (from , 3 , UTF16_LITTLE_ENDIAN , target , 6 );
85+ if (len <= 0 )
86+ goto unknown ;
87+ return len ;
88+
89+ unknown :
90+ * target = '?' ;
91+ len = 1 ;
92+ goto out ;
93+ }
94+
95+ /*
96+ * smb_utf16_bytes() - compute the converted length of a UTF-16LE string
97+ * @from: pointer to the little-endian UTF-16 input string
98+ * @maxbytes: upper bound, in bytes, on how much of @from may be read
99+ * @codepage: destination codepage, passed through to cifs_mapchar()
100+ *
101+ * Walk at most maxbytes / 2 16-bit units of @from, stopping at a zero unit, and
102+ * sum the lengths cifs_mapchar() returns for each unit plus the next two units.
103+ * NOTE(review): i advances one unit per pass even after a multi-unit result.
104+ *
105+ * Return: length in bytes after conversion, excluding any null terminator
106+ */
107+ static int smb_utf16_bytes (const __le16 * from , int maxbytes ,
108+ const struct nls_table * codepage )
109+ {
110+ int i , j ;
111+ int charlen , outlen = 0 ;
112+ int maxwords = maxbytes / 2 ; /* input bound expressed in 16-bit units */
113+ char tmp [NLS_MAX_CHARSET_SIZE ]; /* scratch output; only the length is kept */
114+ __u16 ftmp [3 ]; /* current unit plus up to two units of lookahead */
115+
116+ for (i = 0 ; i < maxwords ; i ++ ) {
117+ ftmp [0 ] = get_unaligned_le16 (& from [i ]);
118+ if (ftmp [0 ] == 0 ) /* zero unit terminates the string */
119+ break ;
120+ for (j = 1 ; j <= 2 ; j ++ ) {
121+ if (i + j < maxwords )
122+ ftmp [j ] = get_unaligned_le16 (& from [i + j ]);
123+ else
124+ ftmp [j ] = 0 ; /* never read lookahead past maxwords */
125+ }
126+
127+ charlen = cifs_mapchar (tmp , ftmp , codepage , 0 ); /* 0: mapchar disabled */
128+ if (charlen > 0 )
129+ outlen += charlen ;
130+ else
131+ outlen ++ ; /* non-positive length: count one byte for this unit */
132+ }
133+
134+ return outlen ;
114135}
115136
116137/*
@@ -140,12 +161,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
140161static int smb_from_utf16 (char * to , const __le16 * from , int tolen , int fromlen ,
141162 const struct nls_table * codepage , bool mapchar )
142163{
143- int i , charlen , safelen ;
164+ int i , j , charlen , safelen ;
144165 int outlen = 0 ;
145166 int nullsize = nls_nullsize (codepage );
146167 int fromwords = fromlen / 2 ;
147168 char tmp [NLS_MAX_CHARSET_SIZE ];
148- __u16 ftmp ;
169+ __u16 ftmp [ 3 ]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
149170
150171 /*
151172 * because the chars can be of varying widths, we need to take care
@@ -156,9 +177,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
156177 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize );
157178
158179 for (i = 0 ; i < fromwords ; i ++ ) {
159- ftmp = get_unaligned_le16 (& from [i ]);
160- if (ftmp == 0 )
180+ ftmp [ 0 ] = get_unaligned_le16 (& from [i ]);
181+ if (ftmp [ 0 ] == 0 )
161182 break ;
183+ for (j = 1 ; j <= 2 ; j ++ ) {
184+ if (i + j < fromwords )
185+ ftmp [j ] = get_unaligned_le16 (& from [i + j ]);
186+ else
187+ ftmp [j ] = 0 ;
188+ }
162189
163190 /*
164191 * check to see if converting this character might make the
@@ -173,6 +200,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
173200 /* put converted char into 'to' buffer */
174201 charlen = cifs_mapchar (& to [outlen ], ftmp , codepage , mapchar );
175202 outlen += charlen ;
203+
204+ /*
205+ * charlen is the number of UTF-8 bytes produced for one character.
206+ * A 4-byte UTF-8 sequence (surrogate pair) gives charlen == 4
207+ * and comes from 4 bytes (two words) of UTF-16 input.
208+ * A 7-8 byte UTF-8 IVS shows up as charlen == 3 + 4 or 4 + 4
209+ * (it is split into two UTF-8 sequences, one per UTF-16 part).
210+ */
211+ if (charlen == 4 )
212+ i ++ ;
213+ else if (charlen >= 5 )
214+ /* 5-6bytes UTF-8 */
215+ i += 2 ;
176216 }
177217
178218 /* properly null-terminate string */
@@ -307,6 +347,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
307347 char src_char ;
308348 __le16 dst_char ;
309349 wchar_t tmp ;
350+ wchar_t wchar_to [6 ]; /* UTF-16 */
351+ int ret ;
352+ unicode_t u ;
310353
311354 if (!mapchars )
312355 return smb_strtoUTF16 (target , source , srclen , cp );
@@ -349,11 +392,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
349392 * if no match, use question mark, which at least in
350393 * some cases serves as wild card
351394 */
352- if (charlen < 1 ) {
353- dst_char = cpu_to_le16 (0x003f );
354- charlen = 1 ;
395+ if (charlen > 0 )
396+ goto ctoUTF16 ;
397+
398+ /* convert SURROGATE_PAIR */
399+ if (strcmp (cp -> charset , "utf8" ))
400+ goto unknown ;
401+ if (* (source + i ) & 0x80 ) {
402+ charlen = utf8_to_utf32 (source + i , 6 , & u );
403+ if (charlen < 0 )
404+ goto unknown ;
405+ } else
406+ goto unknown ;
407+ ret = utf8s_to_utf16s (source + i , charlen ,
408+ UTF16_LITTLE_ENDIAN ,
409+ wchar_to , 6 );
410+ if (ret < 0 )
411+ goto unknown ;
412+
413+ i += charlen ;
414+ dst_char = cpu_to_le16 (* wchar_to );
415+ if (charlen <= 3 )
416+ /* 1-3bytes UTF-8 to 2bytes UTF-16 */
417+ put_unaligned (dst_char , & target [j ]);
418+ else if (charlen == 4 ) {
419+ /*
420+ * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
421+ * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
422+ * (charlen=3+4 or 4+4)
423+ */
424+ put_unaligned (dst_char , & target [j ]);
425+ dst_char = cpu_to_le16 (* (wchar_to + 1 ));
426+ j ++ ;
427+ put_unaligned (dst_char , & target [j ]);
428+ } else if (charlen >= 5 ) {
429+ /* 5-6bytes UTF-8 to 6bytes UTF-16 */
430+ put_unaligned (dst_char , & target [j ]);
431+ dst_char = cpu_to_le16 (* (wchar_to + 1 ));
432+ j ++ ;
433+ put_unaligned (dst_char , & target [j ]);
434+ dst_char = cpu_to_le16 (* (wchar_to + 2 ));
435+ j ++ ;
436+ put_unaligned (dst_char , & target [j ]);
355437 }
438+ continue ;
439+
440+ unknown :
441+ dst_char = cpu_to_le16 (0x003f );
442+ charlen = 1 ;
356443 }
444+
445+ ctoUTF16 :
357446 /*
358447 * character may take more than one byte in the source string,
359448 * but will take exactly two bytes in the target string
0 commit comments