Skip to content

Commit 0d5a983

Browse files
authored
Merge pull request #6671 from cjerdonek/make-subprocess-error-non-ascii-cmd
Handle non-ascii commands in Python 2 in make_subprocess_output_error()
2 parents d641b54 + a6020e8 commit 0d5a983

File tree

4 files changed

+129
-22
lines changed

4 files changed

+129
-22
lines changed

src/pip/_internal/utils/compat.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
1616

1717
if MYPY_CHECK_RUNNING:
18-
from typing import Tuple, Text
18+
from typing import Optional, Text, Tuple, Union
1919

2020
try:
2121
import _ssl # noqa
@@ -83,18 +83,29 @@ def backslashreplace_decode_fn(err):
8383
backslashreplace_decode = "backslashreplace_decode"
8484

8585

86-
def console_to_str(data):
87-
# type: (bytes) -> Text
88-
"""Return a string, safe for output, of subprocess output.
86+
def str_to_display(data, desc=None):
87+
# type: (Union[bytes, Text], Optional[str]) -> Text
88+
"""
89+
For display or logging purposes, convert a bytes object (or text) to
90+
text (e.g. unicode in Python 2) safe for output.
8991
90-
We assume the data is in the locale preferred encoding.
91-
If it won't decode properly, we warn the user but decode as
92-
best we can.
92+
:param desc: An optional phrase describing the input data, for use in
93+
the log message if a warning is logged. Defaults to "Bytes object".
9394
94-
We also ensure that the output can be safely written to
95-
standard output without encoding errors.
95+
This function should never error out and so can take a best effort
96+
approach. It is okay to be lossy if needed since the return value is
97+
just for display.
98+
99+
We assume the data is in the locale preferred encoding. If it won't
100+
decode properly, we warn the user but decode as best we can.
101+
102+
We also ensure that the output can be safely written to standard output
103+
without encoding errors.
96104
"""
105+
if isinstance(data, text_type):
106+
return data
97107

108+
# Otherwise, data is a bytes object (str in Python 2).
98109
# First, get the encoding we assume. This is the preferred
99110
# encoding for the locale, unless that is not found, or
100111
# it is ASCII, in which case assume UTF-8
@@ -107,10 +118,10 @@ def console_to_str(data):
107118
try:
108119
decoded_data = data.decode(encoding)
109120
except UnicodeDecodeError:
110-
logger.warning(
111-
"Subprocess output does not appear to be encoded as %s",
112-
encoding,
113-
)
121+
if desc is None:
122+
desc = 'Bytes object'
123+
msg_format = '{} does not appear to be encoded as %s'.format(desc)
124+
logger.warning(msg_format, encoding)
114125
decoded_data = data.decode(encoding, errors=backslashreplace_decode)
115126

116127
# Make sure we can print the output, by encoding it to the output
@@ -138,6 +149,13 @@ def console_to_str(data):
138149
return decoded_data
139150

140151

152+
def console_to_str(data):
    # type: (bytes) -> Text
    """Convert subprocess output into text that is safe to display.

    This delegates to ``str_to_display()``, labelling the data as
    subprocess output for any warning logged while decoding.
    """
    description = 'Subprocess output'
    return str_to_display(data, desc=description)
157+
158+
141159
if sys.version_info >= (3,):
142160
def native_str(s, replace=False):
143161
# type: (str, bool) -> str

src/pip/_internal/utils/misc.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
write_delete_marker_file,
3636
)
3737
from pip._internal.utils.compat import (
38-
WINDOWS, console_to_str, expanduser, stdlib_pkgs,
38+
WINDOWS, console_to_str, expanduser, stdlib_pkgs, str_to_display,
3939
)
4040
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
4141

@@ -751,19 +751,25 @@ def make_subprocess_output_error(
751751
:param lines: A list of lines, each ending with a newline.
752752
"""
753753
command = format_command_args(cmd_args)
754+
# Convert `command` to text (unicode in Python 2) so we can use it as
755+
# an argument in the unicode format string below. This avoids
756+
# "UnicodeDecodeError: 'ascii' codec can't decode byte ..." in Python 2
757+
# when the formatted command contains a non-ascii character.
758+
command_display = str_to_display(command, desc='command bytes')
759+
754760
# We know the joined output value ends in a newline.
755761
output = ''.join(lines)
756762
msg = (
757-
# We need to mark this explicitly as a unicode string to avoid
758-
# "UnicodeEncodeError: 'ascii' codec can't encode character ..."
759-
# errors in Python 2 since e.g. `output` is a unicode string.
763+
# Use a unicode string to avoid "UnicodeEncodeError: 'ascii'
764+
# codec can't encode character ..." in Python 2 when a format
765+
# argument (e.g. `output`) has a non-ascii character.
760766
u'Command errored out with exit status {exit_status}:\n'
761-
' command: {command}\n'
767+
' command: {command_display}\n'
762768
' cwd: {cwd}\n'
763769
'Complete output ({line_count} lines):\n{output}{divider}'
764770
).format(
765771
exit_status=exit_status,
766-
command=command,
772+
command_display=command_display,
767773
cwd=cwd,
768774
line_count=len(lines),
769775
output=output,

tests/unit/test_compat.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
# -*- coding: utf-8 -*-
2+
13
import locale
24
import os
35

46
import pytest
57

68
import pip._internal.utils.compat as pip_compat
79
from pip._internal.utils.compat import (
8-
console_to_str, expanduser, get_path_uid, native_str,
10+
console_to_str, expanduser, get_path_uid, native_str, str_to_display,
911
)
1012

1113

@@ -45,6 +47,58 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
4547
get_path_uid(fs)
4648

4749

50+
@pytest.mark.parametrize('data, expected', [
    # Native str input.
    ('abc', u'abc'),
    # Text (unicode in Python 2) input.
    (u'abc', u'abc'),
    # Text input containing a non-ascii character.
    (u'déf', u'déf'),
])
def test_str_to_display(data, expected):
    """Check str_to_display() against the parametrized str and text inputs."""
    result = str_to_display(data)
    assert result == expected, (
        # Report the locale encoding to make failures easier to diagnose.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )
63+
64+
65+
@pytest.mark.parametrize('data, encoding, expected', [
    # Test str input with non-ascii characters.
    ('déf', 'utf-8', u'déf'),
    # Test bytes input with non-ascii characters:
    (u'déf'.encode('utf-8'), 'utf-8', u'déf'),
    # Test a Windows encoding.
    (u'déf'.encode('cp1252'), 'cp1252', u'déf'),
    # Test a Windows encoding with incompatibly encoded text. The two
    # utf-8 bytes of the accented character (0xC3 0xA9) each decode as a
    # separate cp1252 character (U+00C3, U+00A9), so the result is
    # mojibake rather than the original text. The expected value is
    # written with escapes so it cannot be silently "repaired" by tools
    # that normalize mojibake in source files.
    (u'déf'.encode('utf-8'), 'cp1252', u'd\xc3\xa9f'),
])
def test_str_to_display__encoding(monkeypatch, data, encoding, expected):
    """
    Test str_to_display() with the locale preferred encoding patched to
    each parametrized encoding.

    :param data: the str or bytes value passed to str_to_display().
    :param encoding: the value locale.getpreferredencoding() is patched
        to return.
    :param expected: the text str_to_display() should return.
    """
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: encoding)
    actual = str_to_display(data)
    assert actual == expected, (
        # Show the encoding for easier troubleshooting.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )
82+
83+
84+
def test_str_to_display__decode_error(monkeypatch, caplog):
    """
    Test that bytes which cannot be decoded with the preferred encoding
    are returned with the offending bytes backslash-escaped, and that
    exactly one warning is logged.
    """
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
    # Encode with an incompatible encoding. The BOM and byte order are
    # spelled out explicitly: the plain 'utf-16' codec writes the host's
    # native byte order, which would make the expected value below wrong
    # on big-endian machines. This always yields b'\xff\xfea\x00b\x00'.
    data = u'\ufeffab'.encode('utf-16-le')
    actual = str_to_display(data)

    # The two BOM bytes are not valid utf-8, so they come back escaped.
    assert actual == u'\\xff\\xfea\x00b\x00', (
        # Show the encoding for easier troubleshooting.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )
    assert len(caplog.records) == 1
    record = caplog.records[0]
    assert record.levelname == 'WARNING'
    assert record.message == (
        'Bytes object does not appear to be encoded as utf-8'
    )
100+
101+
48102
def test_console_to_str(monkeypatch):
49103
some_bytes = b"a\xE9\xC3\xE9b"
50104
encodings = ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5',

tests/unit/test_utils.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"""
77
import codecs
88
import itertools
9+
import locale
910
import os
1011
import shutil
1112
import stat
@@ -767,10 +768,38 @@ def test_make_subprocess_output_error():
767768
assert actual == expected, 'actual: {}'.format(actual)
768769

769770

771+
def test_make_subprocess_output_error__non_ascii_command_arg(monkeypatch):
772+
"""
773+
Test a command argument with a non-ascii character.
774+
"""
775+
cmd_args = ['foo', 'déf']
776+
if sys.version_info[0] == 2:
777+
# In Python 2 the literal above is a str (bytes object). Sanity-check
778+
# that its non-ascii character is utf-8 encoded, as determined by the
779+
# source code encoding declared at the top of this file.
780+
assert cmd_args[1].decode('utf-8') == u'déf'
781+
782+
# Pin the preferred encoding to utf-8 so the result is also correct on Windows.
783+
monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
784+
actual = make_subprocess_output_error(
785+
cmd_args=cmd_args,
786+
cwd='/path/to/cwd',
787+
lines=[],
788+
exit_status=1,
789+
)
790+
expected = dedent(u"""\
791+
Command errored out with exit status 1:
792+
command: foo 'déf'
793+
cwd: /path/to/cwd
794+
Complete output (0 lines):
795+
----------------------------------------""")
796+
assert actual == expected, u'actual: {}'.format(actual)
797+
798+
770799
# This test is mainly important for checking unicode in Python 2.
771-
def test_make_subprocess_output_error__unicode():
800+
def test_make_subprocess_output_error__non_ascii_line():
772801
"""
773-
Test a line with non-ascii unicode characters.
802+
Test a line with a non-ascii character.
774803
"""
775804
lines = [u'curly-quote: \u2018\n']
776805
actual = make_subprocess_output_error(

0 commit comments

Comments
 (0)