pypa · cjerdonek · Jul 4, 2019 · Jul 2, 2019
diff --git a/src/pip/_internal/utils/compat.py b/src/pip/_internal/utils/compat.py
@@ -15,7 +15,7 @@
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 
 if MYPY_CHECK_RUNNING:
-    from typing import Tuple, Text
+    from typing import Optional, Text, Tuple, Union
 
 try:
     import _ssl  # noqa
@@ -83,18 +83,29 @@ def backslashreplace_decode_fn(err):
     backslashreplace_decode = "backslashreplace_decode"
 
 
-def console_to_str(data):
-    # type: (bytes) -> Text
-    """Return a string, safe for output, of subprocess output.
+def str_to_display(data, desc=None):
+    # type: (Union[bytes, Text], Optional[str]) -> Text
+    """
+    For display or logging purposes, convert a bytes object (or text) to
+    text (e.g. unicode in Python 2) safe for output.
 
-    We assume the data is in the locale preferred encoding.
-    If it won't decode properly, we warn the user but decode as
-    best we can.
+    :param desc: An optional phrase describing the input data, for use in
+        the log message if a warning is logged. Defaults to "Bytes object".
 
-    We also ensure that the output can be safely written to
-    standard output without encoding errors.
+    This function should never error out and so can take a best effort
+    approach. It is okay to be lossy if needed since the return value is
+    just for display.
+
+    We assume the data is in the locale preferred encoding. If it won't
+    decode properly, we warn the user but decode as best we can.
+
+    We also ensure that the output can be safely written to standard output
+    without encoding errors.
     """
+    if isinstance(data, text_type):
+        return data
 
+    # Otherwise, data is a bytes object (str in Python 2).
     # First, get the encoding we assume. This is the preferred
     # encoding for the locale, unless that is not found, or
     # it is ASCII, in which case assume UTF-8
@@ -107,10 +118,10 @@ def console_to_str(data):
     try:
         decoded_data = data.decode(encoding)
     except UnicodeDecodeError:
-        logger.warning(
-            "Subprocess output does not appear to be encoded as %s",
-            encoding,
-        )
+        # Pass desc as a logging arg so a '%' in it cannot break formatting.
+        desc = 'Bytes object' if desc is None else desc
+        msg_format = '%s does not appear to be encoded as %s'
+        logger.warning(msg_format, desc, encoding)
         decoded_data = data.decode(encoding, errors=backslashreplace_decode)
 
     # Make sure we can print the output, by encoding it to the output
@@ -138,6 +149,13 @@ def console_to_str(data):
     return decoded_data
 
 
+def console_to_str(data):
+    # type: (bytes) -> Text
+    """Return subprocess output as display-safe text, via str_to_display()
+    with the warning label 'Subprocess output'."""
+    return str_to_display(data, desc='Subprocess output')
+
+
 if sys.version_info >= (3,):
     def native_str(s, replace=False):
         # type: (str, bool) -> str

diff --git a/src/pip/_internal/utils/misc.py b/src/pip/_internal/utils/misc.py
@@ -34,7 +34,7 @@
     write_delete_marker_file,
 )
 from pip._internal.utils.compat import (
-    WINDOWS, console_to_str, expanduser, stdlib_pkgs,
+    WINDOWS, console_to_str, expanduser, stdlib_pkgs, str_to_display,
 )
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 
@@ -738,19 +738,25 @@ def make_subprocess_output_error(
     :param lines: A list of lines, each ending with a newline.
     """
     command = format_command_args(cmd_args)
+    # Convert `command` to text (unicode in Python 2) so we can use it as
+    # an argument in the unicode format string below. This avoids
+    # "UnicodeDecodeError: 'ascii' codec can't decode byte ..." in Python 2
+    # when the formatted command contains a non-ascii character.
+    command_display = str_to_display(command, desc='command bytes')
+
     # We know the joined output value ends in a newline.
     output = ''.join(lines)
     msg = (
-        # We need to mark this explicitly as a unicode string to avoid
-        # "UnicodeEncodeError: 'ascii' codec can't encode character ..."
-        # errors in Python 2 since e.g. `output` is a unicode string.
+        # Use a unicode string to avoid "UnicodeEncodeError: 'ascii'
+        # codec can't encode character ..." in Python 2 when a format
+        # argument (e.g. `output`) has a non-ascii character.
         u'Command errored out with exit status {exit_status}:\n'
-        ' command: {command}\n'
+        ' command: {command_display}\n'
         '     cwd: {cwd}\n'
         'Complete output ({line_count} lines):\n{output}{divider}'
     ).format(
         exit_status=exit_status,
-        command=command,
+        command_display=command_display,
         cwd=cwd,
         line_count=len(lines),
         output=output,

diff --git a/tests/unit/test_compat.py b/tests/unit/test_compat.py
@@ -1,11 +1,13 @@
+# -*- coding: utf-8 -*-
+
 import locale
 import os
 
 import pytest
 
 import pip._internal.utils.compat as pip_compat
 from pip._internal.utils.compat import (
-    console_to_str, expanduser, get_path_uid, native_str,
+    console_to_str, expanduser, get_path_uid, native_str, str_to_display,
 )
 
 
@@ -45,6 +47,58 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
         get_path_uid(fs)
 
 
+@pytest.mark.parametrize('data, expected', [
+    ('abc', u'abc'),  # Native str: bytes in Python 2, text in Python 3.
+    # Test text (unicode in Python 2) input.
+    (u'abc', u'abc'),
+    # Test text input with non-ascii characters.
+    (u'déf', u'déf'),
+])
+def test_str_to_display(data, expected):
+    actual = str_to_display(data)
+    assert actual == expected, (
+        # Report the preferred encoding on failure to ease troubleshooting.
+        'encoding: {!r}'.format(locale.getpreferredencoding())
+    )
+
+
+@pytest.mark.parametrize('data, encoding, expected', [
+    # Test native str (UTF-8 bytes in Python 2) with non-ascii characters.
+    ('déf', 'utf-8', u'déf'),
+    # Test bytes input with non-ascii characters:
+    (u'déf'.encode('utf-8'), 'utf-8', u'déf'),
+    # Test a Windows encoding.
+    (u'déf'.encode('cp1252'), 'cp1252', u'déf'),
+    # Test UTF-8 bytes misread as cp1252: yields mojibake, not an error.
+    (u'déf'.encode('utf-8'), 'cp1252', u'dÃ©f'),
+])
+def test_str_to_display__encoding(monkeypatch, data, encoding, expected):
+    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: encoding)
+    actual = str_to_display(data)
+    assert actual == expected, (
+        # Show the encoding for easier troubleshooting.
+        'encoding: {!r}'.format(locale.getpreferredencoding())
+    )
+
+
+def test_str_to_display__decode_error(monkeypatch, caplog):
+    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
+    # Use explicit little-endian UTF-16 so the bytes don't depend on the host.
+    data = b'\xff\xfe' + u'ab'.encode('utf-16-le')
+    actual = str_to_display(data)
+
+    assert actual == u'\\xff\\xfea\x00b\x00', (
+        # Show the encoding for easier troubleshooting.
+        'encoding: {!r}'.format(locale.getpreferredencoding())
+    )
+    assert len(caplog.records) == 1
+    record = caplog.records[0]
+    assert record.levelname == 'WARNING'
+    assert record.message == (
+        'Bytes object does not appear to be encoded as utf-8'
+    )
+
+
 def test_console_to_str(monkeypatch):
     some_bytes = b"a\xE9\xC3\xE9b"
     encodings = ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5',

diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
@@ -6,6 +6,7 @@
 """
 import codecs
 import itertools
+import locale
 import os
 import shutil
 import stat
@@ -767,10 +768,38 @@ def test_make_subprocess_output_error():
     assert actual == expected, 'actual: {}'.format(actual)
 
 
+def test_make_subprocess_output_error__non_ascii_command_arg(monkeypatch):
+    """
+    Test a command argument with a non-ascii character.
+    """
+    cmd_args = ['foo', 'déf']
+    if sys.version_info[0] == 2:
+        # Check in Python 2 that the str (bytes object) with the non-ascii
+        # character has the encoding we expect. (This comes from the source
+        # code encoding at the top of the file.)
+        assert cmd_args[1].decode('utf-8') == u'déf'
+
+    # Force utf-8 as the preferred encoding so this also passes on Windows.
+    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
+    actual = make_subprocess_output_error(
+        cmd_args=cmd_args,
+        cwd='/path/to/cwd',
+        lines=[],
+        exit_status=1,
+    )
+    expected = dedent(u"""\
+    Command errored out with exit status 1:
+     command: foo 'déf'
+         cwd: /path/to/cwd
+    Complete output (0 lines):
+    ----------------------------------------""")
+    assert actual == expected, u'actual: {}'.format(actual)
+
+
 # This test is mainly important for checking unicode in Python 2.
-def test_make_subprocess_output_error__unicode():
+def test_make_subprocess_output_error__non_ascii_line():
     """
-    Test a line with non-ascii unicode characters.
+    Test a line with a non-ascii character.
     """
     lines = [u'curly-quote: \u2018\n']
     actual = make_subprocess_output_error(