Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions src/pip/_internal/utils/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pip._internal.utils.typing import MYPY_CHECK_RUNNING

if MYPY_CHECK_RUNNING:
from typing import Tuple, Text
from typing import Optional, Text, Tuple, Union

try:
import _ssl # noqa
Expand Down Expand Up @@ -83,18 +83,29 @@ def backslashreplace_decode_fn(err):
backslashreplace_decode = "backslashreplace_decode"


def console_to_str(data):
# type: (bytes) -> Text
"""Return a string, safe for output, of subprocess output.
def str_to_display(data, desc=None):
# type: (Union[bytes, Text], Optional[str]) -> Text
"""
For display or logging purposes, convert a bytes object (or text) to
text (e.g. unicode in Python 2) safe for output.

We assume the data is in the locale preferred encoding.
If it won't decode properly, we warn the user but decode as
best we can.
:param desc: An optional phrase describing the input data, for use in
the log message if a warning is logged. Defaults to "Bytes object".

We also ensure that the output can be safely written to
standard output without encoding errors.
This function should never error out and so can take a best effort
approach. It is okay to be lossy if needed since the return value is
just for display.

We assume the data is in the locale preferred encoding. If it won't
decode properly, we warn the user but decode as best we can.

We also ensure that the output can be safely written to standard output
without encoding errors.
"""
if isinstance(data, text_type):
return data

# Otherwise, data is a bytes object (str in Python 2).
# First, get the encoding we assume. This is the preferred
# encoding for the locale, unless that is not found, or
# it is ASCII, in which case assume UTF-8
Expand All @@ -107,10 +118,10 @@ def console_to_str(data):
try:
decoded_data = data.decode(encoding)
except UnicodeDecodeError:
logger.warning(
"Subprocess output does not appear to be encoded as %s",
encoding,
)
if desc is None:
desc = 'Bytes object'
msg_format = '{} does not appear to be encoded as %s'.format(desc)
logger.warning(msg_format, encoding)
decoded_data = data.decode(encoding, errors=backslashreplace_decode)

# Make sure we can print the output, by encoding it to the output
Expand Down Expand Up @@ -138,6 +149,13 @@ def console_to_str(data):
return decoded_data


def console_to_str(data):
    # type: (bytes) -> Text
    """Return subprocess output as text that is safe for output.

    Delegates to str_to_display(), labelling the data as
    'Subprocess output' for use in any warning that gets logged.
    """
    description = 'Subprocess output'
    return str_to_display(data, desc=description)


if sys.version_info >= (3,):
def native_str(s, replace=False):
# type: (str, bool) -> str
Expand Down
18 changes: 12 additions & 6 deletions src/pip/_internal/utils/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
write_delete_marker_file,
)
from pip._internal.utils.compat import (
WINDOWS, console_to_str, expanduser, stdlib_pkgs,
WINDOWS, console_to_str, expanduser, stdlib_pkgs, str_to_display,
)
from pip._internal.utils.typing import MYPY_CHECK_RUNNING

Expand Down Expand Up @@ -738,19 +738,25 @@ def make_subprocess_output_error(
:param lines: A list of lines, each ending with a newline.
"""
command = format_command_args(cmd_args)
# Convert `command` to text (unicode in Python 2) so we can use it as
# an argument in the unicode format string below. This avoids
# "UnicodeDecodeError: 'ascii' codec can't decode byte ..." in Python 2
# when the formatted command contains a non-ascii character.
command_display = str_to_display(command, desc='command bytes')

# We know the joined output value ends in a newline.
output = ''.join(lines)
msg = (
# We need to mark this explicitly as a unicode string to avoid
# "UnicodeEncodeError: 'ascii' codec can't encode character ..."
# errors in Python 2 since e.g. `output` is a unicode string.
# Use a unicode string to avoid "UnicodeEncodeError: 'ascii'
# codec can't encode character ..." in Python 2 when a format
# argument (e.g. `output`) has a non-ascii character.
u'Command errored out with exit status {exit_status}:\n'
' command: {command}\n'
' command: {command_display}\n'
' cwd: {cwd}\n'
'Complete output ({line_count} lines):\n{output}{divider}'
).format(
exit_status=exit_status,
command=command,
command_display=command_display,
cwd=cwd,
line_count=len(lines),
output=output,
Expand Down
56 changes: 55 additions & 1 deletion tests/unit/test_compat.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-

import locale
import os

import pytest

import pip._internal.utils.compat as pip_compat
from pip._internal.utils.compat import (
console_to_str, expanduser, get_path_uid, native_str,
console_to_str, expanduser, get_path_uid, native_str, str_to_display,
)


Expand Down Expand Up @@ -45,6 +47,58 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
get_path_uid(fs)


@pytest.mark.parametrize('data, expected', [
    # Plain ascii str input.
    ('abc', u'abc'),
    # Text (unicode in Python 2) input.
    (u'abc', u'abc'),
    # Text input containing a non-ascii character.
    (u'déf', u'déf'),
])
def test_str_to_display(data, expected):
    """
    Check that str_to_display() returns the expected text for each input.
    """
    result = str_to_display(data)
    # Report the preferred encoding on failure for easier troubleshooting.
    failure_note = 'encoding: {!r}'.format(locale.getpreferredencoding())
    assert result == expected, failure_note


@pytest.mark.parametrize('data, encoding, expected', [
    # Test str input with non-ascii characters.
    ('déf', 'utf-8', u'déf'),
    # Test bytes input with non-ascii characters:
    (u'déf'.encode('utf-8'), 'utf-8', u'déf'),
    # Test a Windows encoding.
    (u'déf'.encode('cp1252'), 'cp1252', u'déf'),
    # Test a Windows encoding with incompatibly encoded text. The UTF-8
    # bytes b'd\xc3\xa9f' are all valid cp1252, so decoding succeeds but
    # yields two characters (U+00C3, U+00A9) in place of the e-acute.
    # The expected value is spelled with escapes so that it cannot be
    # silently "repaired" when this file is re-encoded.
    (u'déf'.encode('utf-8'), 'cp1252', u'd\xc3\xa9f'),
])
def test_str_to_display__encoding(monkeypatch, data, encoding, expected):
    """
    Check str_to_display() when the locale preferred encoding is patched.

    :param data: The str, bytes or text value to convert.
    :param encoding: The value locale.getpreferredencoding() is patched
        to return.
    :param expected: The text that str_to_display() should return.
    """
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: encoding)
    actual = str_to_display(data)
    assert actual == expected, (
        # Show the encoding for easier troubleshooting.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )


def test_str_to_display__decode_error(monkeypatch, caplog):
    """
    Check the returned text and the logged warning when decoding fails.
    """
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
    # These bytes are encoded with an encoding incompatible with utf-8.
    undecodable = u'ab'.encode('utf-16')
    result = str_to_display(undecodable)

    # Report the preferred encoding on failure for easier troubleshooting.
    failure_note = 'encoding: {!r}'.format(locale.getpreferredencoding())
    assert result == u'\\xff\\xfea\x00b\x00', failure_note

    # Exactly one warning should have been logged.
    records = caplog.records
    assert len(records) == 1
    only_record = records[0]
    assert only_record.levelname == 'WARNING'
    expected_message = 'Bytes object does not appear to be encoded as utf-8'
    assert only_record.message == expected_message


def test_console_to_str(monkeypatch):
some_bytes = b"a\xE9\xC3\xE9b"
encodings = ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5',
Expand Down
33 changes: 31 additions & 2 deletions tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
import codecs
import itertools
import locale
import os
import shutil
import stat
Expand Down Expand Up @@ -767,10 +768,38 @@ def test_make_subprocess_output_error():
assert actual == expected, 'actual: {}'.format(actual)


def test_make_subprocess_output_error__non_ascii_command_arg(monkeypatch):
    """
    Test a command argument that contains a non-ascii character.
    """
    non_ascii_arg = 'déf'
    cmd_args = ['foo', non_ascii_arg]
    if sys.version_info[0] == 2:
        # In Python 2 the argument is a str (bytes object). Confirm it has
        # the encoding we expect, which comes from the source code encoding
        # declared at the top of the file.
        assert non_ascii_arg.decode('utf-8') == u'déf'

    # Patch the preferred encoding so it will be correct on Windows.
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')

    # NOTE(review): any leading alignment spaces on the "command:" and
    # "cwd:" lines must match the message format exactly -- verify against
    # make_subprocess_output_error().
    expected = dedent(u"""\
    Command errored out with exit status 1:
    command: foo 'déf'
    cwd: /path/to/cwd
    Complete output (0 lines):
    ----------------------------------------""")
    actual = make_subprocess_output_error(
        cmd_args=cmd_args,
        cwd='/path/to/cwd',
        lines=[],
        exit_status=1,
    )
    assert actual == expected, u'actual: {}'.format(actual)


# This test is mainly important for checking unicode in Python 2.
def test_make_subprocess_output_error__unicode():
def test_make_subprocess_output_error__non_ascii_line():
"""
Test a line with non-ascii unicode characters.
Test a line with a non-ascii character.
"""
lines = [u'curly-quote: \u2018\n']
actual = make_subprocess_output_error(
Expand Down