Skip to content

Commit 7533f28

Browse files
namjaejeon authored and roxanan1996 committed
ksmbd: add support for surrogate pair conversion
BugLink: https://bugs.launchpad.net/bugs/2052406 [ Upstream commit 0c18031 ] ksmbd lacks support for converting filenames that contain surrogate pair characters. This triggers a "file or folder does not exist" error in the Windows client. [Steps to reproduce the bug] 1. Create files whose names contain a surrogate pair character: touch $(echo -e '\xf0\x9d\x9f\xa3') touch $(echo -e '\xf0\x9d\x9f\xa4') 2. Try to open these files in a ksmbd share through a Windows client. This patch updates the unicode functions to handle surrogate pairs (and IVS) during conversion. Reviewed-by: Marios Makassikis <[email protected]> Tested-by: Marios Makassikis <[email protected]> Signed-off-by: Namjae Jeon <[email protected]> Signed-off-by: Steve French <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]> Signed-off-by: Portia Stephens <[email protected]> Signed-off-by: Roxana Nicolescu <[email protected]>
1 parent 4b8ca9f commit 7533f28

File tree

1 file changed

+138
-49
lines changed

1 file changed

+138
-49
lines changed

fs/ksmbd/unicode.c

Lines changed: 138 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -14,46 +14,10 @@
1414
#include "uniupr.h"
1515
#include "smb_common.h"
1616

17-
/*
18-
* smb_utf16_bytes() - how long will a string be after conversion?
19-
* @from: pointer to input string
20-
* @maxbytes: don't go past this many bytes of input string
21-
* @codepage: destination codepage
22-
*
23-
* Walk a utf16le string and return the number of bytes that the string will
24-
* be after being converted to the given charset, not including any null
25-
* termination required. Don't walk past maxbytes in the source buffer.
26-
*
27-
* Return: string length after conversion
28-
*/
29-
static int smb_utf16_bytes(const __le16 *from, int maxbytes,
30-
const struct nls_table *codepage)
31-
{
32-
int i;
33-
int charlen, outlen = 0;
34-
int maxwords = maxbytes / 2;
35-
char tmp[NLS_MAX_CHARSET_SIZE];
36-
__u16 ftmp;
37-
38-
for (i = 0; i < maxwords; i++) {
39-
ftmp = get_unaligned_le16(&from[i]);
40-
if (ftmp == 0)
41-
break;
42-
43-
charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
44-
if (charlen > 0)
45-
outlen += charlen;
46-
else
47-
outlen++;
48-
}
49-
50-
return outlen;
51-
}
52-
5317
/*
5418
* cifs_mapchar() - convert a host-endian char to proper char in codepage
5519
* @target: where converted character should be copied
56-
* @src_char: 2 byte host-endian source character
20+
* @from: host-endian source string
5721
* @cp: codepage to which character should be converted
5822
* @mapchar: should character be mapped according to mapchars mount option?
5923
*
@@ -64,10 +28,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
6428
* Return: string length after conversion
6529
*/
6630
static int
67-
cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
31+
cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
6832
bool mapchar)
6933
{
7034
int len = 1;
35+
__u16 src_char;
36+
37+
src_char = *from;
7138

7239
if (!mapchar)
7340
goto cp_convert;
@@ -105,12 +72,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
10572

10673
cp_convert:
10774
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
108-
if (len <= 0) {
109-
*target = '?';
110-
len = 1;
111-
}
75+
if (len <= 0)
76+
goto surrogate_pair;
11277

11378
goto out;
79+
80+
surrogate_pair:
81+
/* convert SURROGATE_PAIR and IVS */
82+
if (strcmp(cp->charset, "utf8"))
83+
goto unknown;
84+
len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
85+
if (len <= 0)
86+
goto unknown;
87+
return len;
88+
89+
unknown:
90+
*target = '?';
91+
len = 1;
92+
goto out;
93+
}
94+
95+
/*
96+
* smb_utf16_bytes() - compute converted string length
97+
* @from: pointer to input string
98+
* @maxbytes: input string length
99+
* @codepage: destination codepage
100+
*
101+
* Walk a utf16le string and return the number of bytes that the string will
102+
* be after being converted to the given charset, not including any null
103+
* termination required. Don't walk past maxbytes in the source buffer.
104+
*
105+
* Return: string length after conversion
106+
*/
107+
static int smb_utf16_bytes(const __le16 *from, int maxbytes,
108+
const struct nls_table *codepage)
109+
{
110+
int i, j;
111+
int charlen, outlen = 0;
112+
int maxwords = maxbytes / 2;
113+
char tmp[NLS_MAX_CHARSET_SIZE];
114+
__u16 ftmp[3];
115+
116+
for (i = 0; i < maxwords; i++) {
117+
ftmp[0] = get_unaligned_le16(&from[i]);
118+
if (ftmp[0] == 0)
119+
break;
120+
for (j = 1; j <= 2; j++) {
121+
if (i + j < maxwords)
122+
ftmp[j] = get_unaligned_le16(&from[i + j]);
123+
else
124+
ftmp[j] = 0;
125+
}
126+
127+
charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
128+
if (charlen > 0)
129+
outlen += charlen;
130+
else
131+
outlen++;
132+
}
133+
134+
return outlen;
114135
}
115136

116137
/*
@@ -140,12 +161,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
140161
static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
141162
const struct nls_table *codepage, bool mapchar)
142163
{
143-
int i, charlen, safelen;
164+
int i, j, charlen, safelen;
144165
int outlen = 0;
145166
int nullsize = nls_nullsize(codepage);
146167
int fromwords = fromlen / 2;
147168
char tmp[NLS_MAX_CHARSET_SIZE];
148-
__u16 ftmp;
169+
__u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
149170

150171
/*
151172
* because the chars can be of varying widths, we need to take care
@@ -156,9 +177,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
156177
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
157178

158179
for (i = 0; i < fromwords; i++) {
159-
ftmp = get_unaligned_le16(&from[i]);
160-
if (ftmp == 0)
180+
ftmp[0] = get_unaligned_le16(&from[i]);
181+
if (ftmp[0] == 0)
161182
break;
183+
for (j = 1; j <= 2; j++) {
184+
if (i + j < fromwords)
185+
ftmp[j] = get_unaligned_le16(&from[i + j]);
186+
else
187+
ftmp[j] = 0;
188+
}
162189

163190
/*
164191
* check to see if converting this character might make the
@@ -173,6 +200,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
173200
/* put converted char into 'to' buffer */
174201
charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
175202
outlen += charlen;
203+
204+
/*
205+
* charlen (=bytes of UTF-8 for 1 character)
206+
* 4bytes UTF-8(surrogate pair) is charlen=4
207+
* (4bytes UTF-16 code)
208+
* 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
209+
* (2 UTF-8 pairs divided to 2 UTF-16 pairs)
210+
*/
211+
if (charlen == 4)
212+
i++;
213+
else if (charlen >= 5)
214+
/* 5-6bytes UTF-8 */
215+
i += 2;
176216
}
177217

178218
/* properly null-terminate string */
@@ -307,6 +347,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
307347
char src_char;
308348
__le16 dst_char;
309349
wchar_t tmp;
350+
wchar_t wchar_to[6]; /* UTF-16 */
351+
int ret;
352+
unicode_t u;
310353

311354
if (!mapchars)
312355
return smb_strtoUTF16(target, source, srclen, cp);
@@ -349,11 +392,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
349392
* if no match, use question mark, which at least in
350393
* some cases serves as wild card
351394
*/
352-
if (charlen < 1) {
353-
dst_char = cpu_to_le16(0x003f);
354-
charlen = 1;
395+
if (charlen > 0)
396+
goto ctoUTF16;
397+
398+
/* convert SURROGATE_PAIR */
399+
if (strcmp(cp->charset, "utf8"))
400+
goto unknown;
401+
if (*(source + i) & 0x80) {
402+
charlen = utf8_to_utf32(source + i, 6, &u);
403+
if (charlen < 0)
404+
goto unknown;
405+
} else
406+
goto unknown;
407+
ret = utf8s_to_utf16s(source + i, charlen,
408+
UTF16_LITTLE_ENDIAN,
409+
wchar_to, 6);
410+
if (ret < 0)
411+
goto unknown;
412+
413+
i += charlen;
414+
dst_char = cpu_to_le16(*wchar_to);
415+
if (charlen <= 3)
416+
/* 1-3bytes UTF-8 to 2bytes UTF-16 */
417+
put_unaligned(dst_char, &target[j]);
418+
else if (charlen == 4) {
419+
/*
420+
* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
421+
* 7-8bytes UTF-8(IVS) divided to 2 UTF-16
422+
* (charlen=3+4 or 4+4)
423+
*/
424+
put_unaligned(dst_char, &target[j]);
425+
dst_char = cpu_to_le16(*(wchar_to + 1));
426+
j++;
427+
put_unaligned(dst_char, &target[j]);
428+
} else if (charlen >= 5) {
429+
/* 5-6bytes UTF-8 to 6bytes UTF-16 */
430+
put_unaligned(dst_char, &target[j]);
431+
dst_char = cpu_to_le16(*(wchar_to + 1));
432+
j++;
433+
put_unaligned(dst_char, &target[j]);
434+
dst_char = cpu_to_le16(*(wchar_to + 2));
435+
j++;
436+
put_unaligned(dst_char, &target[j]);
355437
}
438+
continue;
439+
440+
unknown:
441+
dst_char = cpu_to_le16(0x003f);
442+
charlen = 1;
356443
}
444+
445+
ctoUTF16:
357446
/*
358447
* character may take more than one byte in the source string,
359448
* but will take exactly two bytes in the target string

0 commit comments

Comments
 (0)