fix(enc): apply surrogateescape when enc/decoding cmd-streams also

ankostis · ankostis · commit f801a3aed7ab · 2016-10-23T14:47:50.000+02:00
Try to fix gitpython-developers#543 unicode woes in "core" position (i.e. TMPDIR): + Apply surrogate-escapes (PEP 383) also when decoding. + Ensure all file-paths and cmd-streams are surrogate-escape en/decoded. + test_utils: check if lock works with unicode file names. + git.compat: FIX undefined `exc` to raise in `replace_surrogate_encode()`, and flake8 fixes. CI results: + Linux: + py2.7: FAIL 53 TCs + py3: fixed + Windows: all were OK
diff --git a/git/cmd.py b/git/cmd.py
@@ -21,11 +21,11 @@
 from git.compat import (
     string_types,
     defenc,
-    force_bytes,
     PY3,
     # just to satisfy flake8 on py3
     unicode,
     safe_decode,
+    safe_encode,
     is_posix,
     is_win,
 )
@@ -274,13 +274,13 @@ def wait(self, stderr=b''):  # TODO: Bad choice to mimic `proc.wait()` but with
             :raise GitCommandError: if the return status is not 0"""
             if stderr is None:
                 stderr = b''
-            stderr = force_bytes(stderr)
+            stderr = safe_encode(stderr)
 
             status = self.proc.wait()
 
             def read_all_from_possibly_closed_stream(stream):
                 try:
-                    return stderr + force_bytes(stream.read())
+                    return stderr + safe_encode(stream.read())
                 except ValueError:
                     return stderr or b''
 
diff --git a/git/compat.py b/git/compat.py
@@ -21,8 +21,7 @@
 from gitdb.utils.encoding import (
     string_types,    # @UnusedImport
     text_type,       # @UnusedImport
-    force_bytes,     # @UnusedImport
-    force_text       # @UnusedImport
+    force_text,      # @UnusedImport
 )
 
 
@@ -77,7 +76,7 @@ def safe_decode(s):
 def safe_encode(s):
     """Safely decodes a binary string to unicode"""
     if isinstance(s, unicode):
-        return s.encode(defenc)
+        return s.encode(defenc, 'surrogateescape')
     elif isinstance(s, bytes):
         return s
     elif s is not None:
@@ -123,8 +122,8 @@ def __str__(self):
     else:  # Python 2
         def __str__(self):
             return self.__unicode__().encode(defenc)
-            
-            
+
+
 """
 This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
 handler of Python 3.
@@ -139,12 +138,14 @@ def __str__(self):
 #     # -- Python 2/3 compatibility -------------------------------------
 #     FS_ERRORS = 'my_surrogateescape'
 
+
 def u(text):
     if PY3:
         return text
     else:
         return text.decode('unicode_escape')
 
+
 def b(data):
     if PY3:
         return data.encode('latin1')
@@ -155,9 +156,10 @@ def b(data):
     _unichr = chr
     bytes_chr = lambda code: bytes((code,))
 else:
-    _unichr = unichr
+    _unichr = unichr  # @UndefinedVariable
     bytes_chr = chr
 
+
 def surrogateescape_handler(exc):
     """
     Pure Python implementation of the PEP 383: the "surrogateescape" error
@@ -204,7 +206,7 @@ def replace_surrogate_encode(mystring):
         # The following magic comes from Py3.3's Python/codecs.c file:
         if not 0xD800 <= code <= 0xDCFF:
             # Not a surrogate. Fail with the original exception.
-            raise exc
+            raise
         # mybytes = [0xe0 | (code >> 12),
         #            0x80 | ((code >> 6) & 0x3f),
         #            0x80 | (code & 0x3f)]
@@ -256,9 +258,8 @@ def encodefilename(fn):
             elif 0xDC80 <= code <= 0xDCFF:
                 ch = bytes_chr(code - 0xDC00)
             else:
-                raise UnicodeEncodeError(FS_ENCODING,
-                    fn, index, index+1,
-                    'ordinal not in range(128)')
+                raise UnicodeEncodeError(FS_ENCODING, fn, index, index + 1,
+                                         'ordinal not in range(128)')
             encoded.append(ch)
         return bytes().join(encoded)
     elif FS_ENCODING == 'utf-8':
@@ -272,20 +273,22 @@ def encodefilename(fn):
                     ch = bytes_chr(code - 0xDC00)
                     encoded.append(ch)
                 else:
-                    raise UnicodeEncodeError(
-                        FS_ENCODING,
-                        fn, index, index+1, 'surrogates not allowed')
+                    raise UnicodeEncodeError(FS_ENCODING, fn, index, index + 1,
+                                             'surrogates not allowed')
             else:
                 ch_utf8 = ch.encode('utf-8')
                 encoded.append(ch_utf8)
         return bytes().join(encoded)
     else:
         return fn.encode(FS_ENCODING, FS_ERRORS)
 
+
 def decodefilename(fn):
     return fn.decode(FS_ENCODING, FS_ERRORS)
 
-FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
+FS_ENCODING = 'ascii'
+fn = b('[abc\xff]')
+encoded = u('[abc\udcff]')
 # FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
 # FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
 
diff --git a/git/config.py b/git/config.py
@@ -551,7 +551,7 @@ def get_value(self, section, option, default=None):
     def _value_to_string(self, value):
         if isinstance(value, (int, float, bool)):
             return str(value)
-        return force_text(value)
+        return force_text(value)  # No `safe_decode()`, let any unicode errors bubble-up.
 
     @needs_values
     @set_dirty_and_flush_changes
diff --git a/git/ext/gitdb b/git/ext/gitdb
@@ -1 +1 @@
-Subproject commit 38866bc7c4956170c681a62c4508f934ac826469
+Subproject commit 97035c64f429c229629c25becc54ae44dd95e49d
diff --git a/git/index/base.py b/git/index/base.py
@@ -15,10 +15,10 @@
     izip,
     xrange,
     string_types,
-    force_bytes,
     defenc,
     mviter,
-    is_win
+    is_win,
+    safe_encode
 )
 from git.exc import (
     GitCommandError,
@@ -597,7 +597,7 @@ def _store_path(self, filepath, fprogress):
         st = os.lstat(filepath)     # handles non-symlinks as well
         if S_ISLNK(st.st_mode):
             # in PY3, readlink is string, but we need bytes. In PY2, it's just OS encoded bytes, we assume UTF-8
-            open_stream = lambda: BytesIO(force_bytes(os.readlink(filepath), encoding=defenc))
+            open_stream = lambda: BytesIO(safe_encode(os.readlink(filepath)))
         else:
             open_stream = lambda: open(filepath, 'rb')
         with open_stream() as stream:
diff --git a/git/index/fun.py b/git/index/fun.py
@@ -17,8 +17,6 @@
 from git.compat import (
     PY3,
     defenc,
-    force_text,
-    force_bytes,
     is_posix,
     safe_encode,
     safe_decode,
@@ -91,8 +89,8 @@ def run_commit_hook(name, index):
         stdout = ''.join(stdout)
         stderr = ''.join(stderr)
         if cmd.returncode != 0:
-            stdout = force_text(stdout, defenc)
-            stderr = force_text(stderr, defenc)
+            stdout = safe_decode(stdout)
+            stderr = safe_decode(stderr)
             raise HookExecutionError(hp, cmd.returncode, stdout, stderr)
     # end handle return code
 
@@ -136,7 +134,7 @@ def write_cache(entries, stream, extension_data=None, ShaStreamCls=IndexFileSHA1
         write(entry[4])         # ctime
         write(entry[5])         # mtime
         path = entry[3]
-        path = force_bytes(path, encoding=defenc)
+        path = safe_encode(path)
         plen = len(path) & CE_NAMEMASK      # path length
         assert plen == len(path), "Path %s too long to fit into index" % entry[3]
         flags = plen | (entry[2] & CE_NAMEMASK_INV)     # clear possible previous values
diff --git a/git/remote.py b/git/remote.py
@@ -9,7 +9,11 @@
 import re
 
 from git.cmd import handle_process_output, Git
-from git.compat import (defenc, force_text, is_win)
+from git.compat import (
+    defenc,
+    is_win,
+    safe_decode,
+)
 from git.exc import GitCommandError
 from git.util import (
     LazyMixin,
@@ -640,7 +644,7 @@ def _get_fetch_info_from_stderr(self, proc, progress):
             log.warning("Error lines received while fetching: %s", stderr_text)
 
         for line in progress.other_lines:
-            line = force_text(line)
+            line = safe_decode(line)
             for cmd in cmds:
                 if len(line) > 1 and line[0] == ' ' and line[1] == cmd:
                     fetch_info_lines.append(line)
diff --git a/git/test/test_repo.py b/git/test/test_repo.py
@@ -35,6 +35,7 @@
     is_win,
     string_types,
     win_encode,
+    safe_encode,
 )
 from git.exc import (
     BadObject,
@@ -804,7 +805,7 @@ def test_git_file(self, rwrepo):
 
         # Test using an absolute gitdir path in the .git file.
         with open(git_file_path, 'wb') as fp:
-            fp.write(('gitdir: %s\n' % real_path_abs).encode('ascii'))
+            fp.write(safe_encode(('gitdir: %s\n' % real_path_abs)))
         git_file_repo = Repo(rwrepo.working_tree_dir)
         self.assertEqual(osp.abspath(git_file_repo.git_dir), real_path_abs)
 
diff --git a/git/test/test_util.py b/git/test/test_util.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # test_utils.py
 # Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
 #
@@ -153,6 +154,14 @@ def test_lock_file(self):
         lock_file._obtain_lock_or_raise()
         lock_file._release_lock()
 
+    def test_lock_file_unicode(self):
+        my_file = tempfile.mktemp(prefix=u'καλημερα_')
+        lock_file = LockFile(my_file)
+        assert not lock_file._has_lock()
+        lock_file._obtain_lock_or_raise()
+        assert lock_file._has_lock()
+        lock_file._release_lock()
+
     def test_blocking_lock_file(self):
         my_file = tempfile.mktemp()
         lock_file = BlockingLockFile(my_file)