1 files changed, 38 insertions, 49 deletions
diff --git a/git.py b/git.py
index 2240175..09ccd37 100644
--- a/git.py
+++ b/git.py
@@ -12,35 +12,13 @@ import subprocess
 from collections import defaultdict
 import email.utils
 import datetime
-import urllib
-from cgi import escape
+import urllib.request, urllib.parse, urllib.error
+from html import escape
 
 
 # Path to the git binary.
 GIT_BIN = "git"
 
-class EncodeWrapper:
-    """File-like wrapper that returns data utf8 encoded."""
-    def __init__(self, fd, encoding = 'utf8', errors = 'replace'):
-        self.fd = fd
-        self.encoding = encoding
-        self.errors = errors
-
-    def __iter__(self):
-        for line in self.fd:
-            yield line.decode(self.encoding, errors = self.errors)
-
-    def read(self):
-        """Returns the whole content."""
-        s = self.fd.read()
-        return s.decode(self.encoding, errors = self.errors)
-
-    def readline(self):
-        """Returns a single line."""
-        s = self.fd.readline()
-        return s.decode(self.encoding, errors = self.errors)
-
-
 def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False):
     """Invokes git with the given parameters.
 
@@ -66,13 +44,8 @@ def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False)
     if raw:
         return p.stdout
 
-    # We need to wrap stdout if we want to decode it as utf8, subprocess
-    # doesn't support us telling it the encoding.
-    if sys.version_info.major == 3:
-        return io.TextIOWrapper(p.stdout, encoding = 'utf8',
-                errors = 'replace')
-    else:
-        return EncodeWrapper(p.stdout)
+    return io.TextIOWrapper(p.stdout, encoding = 'utf8',
+            errors = 'backslashreplace')
 
 
 class GitCommand (object):
@@ -109,6 +82,8 @@ class GitCommand (object):
     def stdin(self, s):
         """Sets the contents we will send in stdin."""
         self._override = True
+        if isinstance(s, str):
+            s = s.encode("utf8")
         self._stdin_buf = s
         self._override = False
 
@@ -116,7 +91,7 @@ class GitCommand (object):
         """Runs the git command."""
         params = [self._cmd]
 
-        for k, v in self._kwargs.items():
+        for k, v in list(self._kwargs.items()):
             dash = '--' if len(k) > 1 else '-'
             if v is None:
                 params.append('%s%s' % (dash, k))
@@ -146,11 +121,16 @@ class smstr:
         .html    -> an HTML-embeddable representation.
     """
     def __init__(self, raw):
-        if not isinstance(raw, str):
-            raise TypeError("The raw string must be instance of 'str'")
+        if not isinstance(raw, (str, bytes)):
+            raise TypeError(
+                    "The raw string must be instance of 'str', not %s" %
+                    type(raw))
         self.raw = raw
-        self.unicode = raw.decode('utf8', errors = 'replace')
-        self.url = urllib.pathname2url(raw)
+        if isinstance(raw, bytes):
+            self.unicode = raw.decode('utf8', errors = 'backslashreplace')
+        else:
+            self.unicode = raw
+        self.url = urllib.request.pathname2url(raw)
         self.html = self._to_html()
 
     def __cmp__(self, other):
@@ -163,7 +143,7 @@ class smstr:
     @staticmethod
     def from_url(url):
         """Returns an smstr() instance from an url-encoded string."""
-        return smstr(urllib.url2pathname(url))
+        return smstr(urllib.request.url2pathname(url))
 
     def split(self, sep):
         """Like str.split()."""
@@ -176,10 +156,10 @@ class smstr:
 
     def _to_html(self):
         """Returns an html representation of the unicode string."""
-        html = u''
+        html = ''
         for c in escape(self.unicode):
             if c in '\t\r\n\r\f\a\b\v\0':
-                esc_c = c.encode('ascii').encode('string_escape')
+                esc_c = c.encode("unicode-escape").decode("utf8")
                 html += '<span class="ctrlchr">%s</span>' % esc_c
             else:
                 html += c
@@ -190,14 +170,23 @@ class smstr:
 def unquote(s):
     """Git can return quoted file names, unquote them. Always return a str."""
     if not (s[0] == '"' and s[-1] == '"'):
-        # Unquoted strings are always safe, no need to mess with them; just
-        # make sure we return str.
-        s = s.encode('ascii')
+        # Unquoted strings are always safe, no need to mess with them
         return s
 
-    # Get rid of the quotes, we never want them in the output, and convert to
-    # a raw string, un-escaping the backslashes.
-    s = s[1:-1].decode('string-escape')
+    # The string will be of the form `"<escaped>"`, where <escaped> is a
+    # backslash-escaped representation of the name of the file.
+    # Examples:  "with\ttwo\ttabs" , "\303\261aca-utf8", "\361aca-latin1"
+
+    # Get rid of the quotes, we never want them in the output.
+    s = s[1:-1]
+
+    # Un-escape the backslashes.
+    # latin1 is ok to use here because in Python it just maps the code points
+    # 0-255 to the bytes 0x-0xff, which is what we expect.
+    s = s.encode("latin1").decode("unicode-escape")
+
+    # Convert to utf8.
+    s = s.encode("latin1").decode("utf8", errors='backslashreplace')
 
     return s
 
@@ -337,13 +326,13 @@ class Repo:
         cmd.raw(True)
         cmd.batch = '%(objectsize)'
 
-        if isinstance(ref, unicode):
-            ref = ref.encode('utf8')
-        cmd.stdin('%s:%s' % (ref, path))
+        # Format: <ref>:<path>
+        # Construct it in binary since the path might not be utf8.
+        cmd.stdin(ref.encode("utf8") + b":" + path)
 
         out = cmd.run()
         head = out.readline()
-        if not head or head.strip().endswith('missing'):
+        if not head or head.strip().endswith(b'missing'):
             return None
 
         return Blob(out.read()[:int(head)])