Skip to content

Commit a869796

Browse files
[3.14] gh-63161: Add more tests for source encoding (GH-139440) (GH-139442)
(cherry picked from commit b2f5ad0) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 2fc69e8 commit a869796

File tree

2 files changed: +177 −20 lines changed

Lib/test/test_source_encoding.py

Lines changed: 95 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -172,6 +172,8 @@ def test_tokenizer_fstring_warning_in_first_line(self):
172172
os.unlink(TESTFN)
173173

174174

175+
BUFSIZ = 2**13
176+
175177
class AbstractSourceEncodingTest:
176178

177179
def test_default_coding(self):
@@ -184,14 +186,20 @@ def test_first_coding_line(self):
184186
self.check_script_output(src, br"'\xc3\u20ac'")
185187

186188
def test_second_coding_line(self):
187-
src = (b'#\n'
189+
src = (b'#!/usr/bin/python\n'
190+
b'#coding:iso8859-15\n'
191+
b'print(ascii("\xc3\xa4"))\n')
192+
self.check_script_output(src, br"'\xc3\u20ac'")
193+
194+
def test_second_coding_line_empty_first_line(self):
195+
src = (b'\n'
188196
b'#coding:iso8859-15\n'
189197
b'print(ascii("\xc3\xa4"))\n')
190198
self.check_script_output(src, br"'\xc3\u20ac'")
191199

192200
def test_third_coding_line(self):
193201
# Only first two lines are tested for a magic comment.
194-
src = (b'#\n'
202+
src = (b'#!/usr/bin/python\n'
195203
b'#\n'
196204
b'#coding:iso8859-15\n'
197205
b'print(ascii("\xc3\xa4"))\n')
@@ -209,13 +217,52 @@ def test_double_coding_same_line(self):
209217
b'print(ascii("\xc3\xa4"))\n')
210218
self.check_script_output(src, br"'\xc3\u20ac'")
211219

220+
def test_double_coding_utf8(self):
221+
src = (b'#coding:utf-8\n'
222+
b'#coding:latin1\n'
223+
b'print(ascii("\xc3\xa4"))\n')
224+
self.check_script_output(src, br"'\xe4'")
225+
226+
def test_long_first_coding_line(self):
227+
src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
228+
b'print(ascii("\xc3\xa4"))\n')
229+
self.check_script_output(src, br"'\xc3\u20ac'")
230+
231+
def test_long_second_coding_line(self):
232+
src = (b'#!/usr/bin/python\n'
233+
b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
234+
b'print(ascii("\xc3\xa4"))\n')
235+
self.check_script_output(src, br"'\xc3\u20ac'")
236+
237+
def test_long_coding_line(self):
238+
src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n'
239+
b'print(ascii("\xc3\xa4"))\n')
240+
self.check_script_output(src, br"'\xc3\u20ac'")
241+
242+
def test_long_coding_name(self):
243+
src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n'
244+
b'print(ascii("\xc3\xa4"))\n')
245+
self.check_script_output(src, br"'\xc3\xa4'")
246+
247+
def test_long_first_utf8_line(self):
248+
src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
249+
self.check_script_output(src, b'')
250+
src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
251+
self.check_script_output(src, b'')
252+
253+
def test_long_second_utf8_line(self):
254+
src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
255+
self.check_script_output(src, b'')
256+
src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
257+
self.check_script_output(src, b'')
258+
212259
def test_first_non_utf8_coding_line(self):
213260
src = (b'#coding:iso-8859-15 \xa4\n'
214261
b'print(ascii("\xc3\xa4"))\n')
215262
self.check_script_output(src, br"'\xc3\u20ac'")
216263

217264
def test_second_non_utf8_coding_line(self):
218-
src = (b'\n'
265+
src = (b'#!/usr/bin/python\n'
219266
b'#coding:iso-8859-15 \xa4\n'
220267
b'print(ascii("\xc3\xa4"))\n')
221268
self.check_script_output(src, br"'\xc3\u20ac'")
@@ -224,27 +271,56 @@ def test_utf8_bom(self):
224271
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
225272
self.check_script_output(src, br"'\xe4'")
226273

274+
def test_utf8_bom_utf8_comments(self):
275+
src = (b'\xef\xbb\xbf#\xc3\xa4\n'
276+
b'#\xc3\xa4\n'
277+
b'print(ascii("\xc3\xa4"))\n')
278+
self.check_script_output(src, br"'\xe4'")
279+
227280
def test_utf8_bom_and_utf8_coding_line(self):
228281
src = (b'\xef\xbb\xbf#coding:utf-8\n'
229282
b'print(ascii("\xc3\xa4"))\n')
230283
self.check_script_output(src, br"'\xe4'")
231284

285+
def test_utf8_non_utf8_comment_line_error(self):
286+
src = (b'#coding: utf8\n'
287+
b'#\n'
288+
b'#\xa4\n'
289+
b'raise RuntimeError\n')
290+
self.check_script_error(src,
291+
br"'utf-8' codec can't decode byte|"
292+
br"encoding problem: utf8")
293+
232294
def test_crlf(self):
233295
src = (b'print(ascii("""\r\n"""))\n')
234-
out = self.check_script_output(src, br"'\n'")
296+
self.check_script_output(src, br"'\n'")
235297

236298
def test_crcrlf(self):
237299
src = (b'print(ascii("""\r\r\n"""))\n')
238-
out = self.check_script_output(src, br"'\n\n'")
300+
self.check_script_output(src, br"'\n\n'")
239301

240302
def test_crcrcrlf(self):
241303
src = (b'print(ascii("""\r\r\r\n"""))\n')
242-
out = self.check_script_output(src, br"'\n\n\n'")
304+
self.check_script_output(src, br"'\n\n\n'")
243305

244306
def test_crcrcrlf2(self):
245307
src = (b'#coding:iso-8859-1\n'
246308
b'print(ascii("""\r\r\r\n"""))\n')
247-
out = self.check_script_output(src, br"'\n\n\n'")
309+
self.check_script_output(src, br"'\n\n\n'")
310+
311+
def test_nul_in_first_coding_line(self):
312+
src = (b'#coding:iso8859-15\x00\n'
313+
b'\n'
314+
b'\n'
315+
b'raise RuntimeError\n')
316+
self.check_script_error(src, br"source code (string )?cannot contain null bytes")
317+
318+
def test_nul_in_second_coding_line(self):
319+
src = (b'#!/usr/bin/python\n'
320+
b'#coding:iso8859-15\x00\n'
321+
b'\n'
322+
b'raise RuntimeError\n')
323+
self.check_script_error(src, br"source code (string )?cannot contain null bytes")
248324

249325

250326
class UTF8ValidatorTest(unittest.TestCase):
@@ -324,6 +400,10 @@ def check_script_output(self, src, expected):
324400
out = stdout.getvalue().encode('latin1')
325401
self.assertEqual(out.rstrip(), expected)
326402

403+
def check_script_error(self, src, expected):
404+
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
405+
exec(src)
406+
327407

328408
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
329409

@@ -335,6 +415,14 @@ def check_script_output(self, src, expected):
335415
res = script_helper.assert_python_ok(fn)
336416
self.assertEqual(res.out.rstrip(), expected)
337417

418+
def check_script_error(self, src, expected):
419+
with tempfile.TemporaryDirectory() as tmpd:
420+
fn = os.path.join(tmpd, 'test.py')
421+
with open(fn, 'wb') as fp:
422+
fp.write(src)
423+
res = script_helper.assert_python_failure(fn)
424+
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
425+
338426

339427
if __name__ == "__main__":
340428
unittest.main()

Lib/test/test_tokenize.py

Lines changed: 82 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1363,24 +1363,63 @@ def readline():
13631363

13641364
def test_no_bom_no_encoding_cookie(self):
13651365
lines = (
1366-
b'# something\n',
1366+
b'#!/home/\xc3\xa4/bin/python\n',
1367+
b'# something \xe2\x82\xac\n',
13671368
b'print(something)\n',
13681369
b'do_something(else)\n'
13691370
)
13701371
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
13711372
self.assertEqual(encoding, 'utf-8')
13721373
self.assertEqual(consumed_lines, list(lines[:2]))
13731374

1375+
def test_no_bom_no_encoding_cookie_first_line_error(self):
1376+
lines = (
1377+
b'#!/home/\xa4/bin/python\n\n',
1378+
b'print(something)\n',
1379+
b'do_something(else)\n'
1380+
)
1381+
with self.assertRaises(SyntaxError):
1382+
tokenize.detect_encoding(self.get_readline(lines))
1383+
1384+
def test_no_bom_no_encoding_cookie_second_line_error(self):
1385+
lines = (
1386+
b'#!/usr/bin/python\n',
1387+
b'# something \xe2\n',
1388+
b'print(something)\n',
1389+
b'do_something(else)\n'
1390+
)
1391+
with self.assertRaises(SyntaxError):
1392+
tokenize.detect_encoding(self.get_readline(lines))
1393+
13741394
def test_bom_no_cookie(self):
13751395
lines = (
1376-
b'\xef\xbb\xbf# something\n',
1396+
b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n',
13771397
b'print(something)\n',
13781398
b'do_something(else)\n'
13791399
)
13801400
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
13811401
self.assertEqual(encoding, 'utf-8-sig')
13821402
self.assertEqual(consumed_lines,
1383-
[b'# something\n', b'print(something)\n'])
1403+
[b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n'])
1404+
1405+
def test_bom_no_cookie_first_line_error(self):
1406+
lines = (
1407+
b'\xef\xbb\xbf#!/home/\xa4/bin/python\n',
1408+
b'print(something)\n',
1409+
b'do_something(else)\n'
1410+
)
1411+
with self.assertRaises(SyntaxError):
1412+
tokenize.detect_encoding(self.get_readline(lines))
1413+
1414+
def test_bom_no_cookie_second_line_error(self):
1415+
lines = (
1416+
b'\xef\xbb\xbf#!/usr/bin/python\n',
1417+
b'# something \xe2\n',
1418+
b'print(something)\n',
1419+
b'do_something(else)\n'
1420+
)
1421+
with self.assertRaises(SyntaxError):
1422+
tokenize.detect_encoding(self.get_readline(lines))
13841423

13851424
def test_cookie_first_line_no_bom(self):
13861425
lines = (
@@ -1456,27 +1495,58 @@ def test_cookie_second_line_noncommented_first_line(self):
14561495
expected = [b"print('\xc2\xa3')\n"]
14571496
self.assertEqual(consumed_lines, expected)
14581497

1459-
def test_cookie_second_line_commented_first_line(self):
1498+
def test_cookie_second_line_empty_first_line(self):
14601499
lines = (
1461-
b"#print('\xc2\xa3')\n",
1500+
b'\n',
14621501
b'# vim: set fileencoding=iso8859-15 :\n',
14631502
b"print('\xe2\x82\xac')\n"
14641503
)
14651504
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
14661505
self.assertEqual(encoding, 'iso8859-15')
1467-
expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1506+
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
14681507
self.assertEqual(consumed_lines, expected)
14691508

1470-
def test_cookie_second_line_empty_first_line(self):
1509+
def test_cookie_third_line(self):
14711510
lines = (
1472-
b'\n',
1473-
b'# vim: set fileencoding=iso8859-15 :\n',
1474-
b"print('\xe2\x82\xac')\n"
1511+
b'#!/home/\xc3\xa4/bin/python\n',
1512+
b'# something\n',
1513+
b'# vim: set fileencoding=ascii :\n',
1514+
b'print(something)\n',
1515+
b'do_something(else)\n'
1516+
)
1517+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1518+
self.assertEqual(encoding, 'utf-8')
1519+
self.assertEqual(consumed_lines, list(lines[:2]))
1520+
1521+
def test_double_coding_line(self):
1522+
# If the first line matches the second line is ignored.
1523+
lines = (
1524+
b'#coding:iso8859-15\n',
1525+
b'#coding:latin1\n',
1526+
b'print(something)\n'
14751527
)
14761528
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
14771529
self.assertEqual(encoding, 'iso8859-15')
1478-
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1479-
self.assertEqual(consumed_lines, expected)
1530+
self.assertEqual(consumed_lines, list(lines[:1]))
1531+
1532+
def test_double_coding_same_line(self):
1533+
lines = (
1534+
b'#coding:iso8859-15 coding:latin1\n',
1535+
b'print(something)\n'
1536+
)
1537+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1538+
self.assertEqual(encoding, 'iso8859-15')
1539+
self.assertEqual(consumed_lines, list(lines[:1]))
1540+
1541+
def test_double_coding_utf8(self):
1542+
lines = (
1543+
b'#coding:utf-8\n',
1544+
b'#coding:latin1\n',
1545+
b'print(something)\n'
1546+
)
1547+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1548+
self.assertEqual(encoding, 'utf-8')
1549+
self.assertEqual(consumed_lines, list(lines[:1]))
14801550

14811551
def test_latin1_normalization(self):
14821552
# See get_normal_name() in Parser/tokenizer/helpers.c.
@@ -1502,7 +1572,6 @@ def test_syntaxerror_latin1(self):
15021572
readline = self.get_readline(lines)
15031573
self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
15041574

1505-
15061575
def test_utf8_normalization(self):
15071576
# See get_normal_name() in Parser/tokenizer/helpers.c.
15081577
encodings = ("utf-8", "utf-8-mac", "utf-8-unix")

0 commit comments

Comments
 (0)