[Python-3000] Plea for help: python/branches/py3k-struni/Lib/tarfile.py

Tue Aug 7 06:37:49 CEST 2007

This evening I had a couple hours to spare and happened to read Guido's
plea for help near the beginning of it. I picked up a failing testcase
that no one had claimed and did what I could: it's not finished, but it
fixes approximately 75% of the errors in test_tarfile. I concentrated
on fixing problems that the testcase turned up; a pure inspection of
the source would turn up lots of things I missed, I'm sure. I hope it's
useful; it probably needs minor attention from me on what the Right Thing
to do is in the case of encoding and decoding: ascii? I had to do a
.decode('latin-1') to pass the umlaut-in-a-filename test, but I'm not at
all sure that that's the true Right Thing. Anyway, here's a start; I'm
explicitly *not* claiming that I'll ever touch this source code again; I
don't want to block anyone else from working on it.  Enjoy.

  --pj


Index: tarfile.py
===================================================================

--- tarfile.py	(revision 56785)
+++ tarfile.py	(working copy)
@@ -72,33 +72,33 @@
 #---------------------------------------------------------
 # tar constants
 #---------------------------------------------------------
-NUL = "\0"                      # the null character
+NUL = b"\0"                     # the null character
 BLOCKSIZE = 512                 # length of processing blocks
 RECORDSIZE = BLOCKSIZE * 20     # length of records
-GNU_MAGIC = "ustar  \0"         # magic gnu tar string
-POSIX_MAGIC = "ustar\x0000"     # magic posix tar string
+GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
+POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string
 
 LENGTH_NAME = 100               # maximum length of a filename
 LENGTH_LINK = 100               # maximum length of a linkname
 LENGTH_PREFIX = 155             # maximum length of the prefix field
 
-REGTYPE = "0"                   # regular file
-AREGTYPE = "\0"                 # regular file
-LNKTYPE = "1"                   # link (inside tarfile)
-SYMTYPE = "2"                   # symbolic link
-CHRTYPE = "3"                   # character special device
-BLKTYPE = "4"                   # block special device
-DIRTYPE = "5"                   # directory
-FIFOTYPE = "6"                  # fifo special device
-CONTTYPE = "7"                  # contiguous file
+REGTYPE = b"0"                   # regular file
+AREGTYPE = b"\0"                 # regular file
+LNKTYPE = b"1"                   # link (inside tarfile)
+SYMTYPE = b"2"                   # symbolic link
+CHRTYPE = b"3"                   # character special device
+BLKTYPE = b"4"                   # block special device
+DIRTYPE = b"5"                   # directory
+FIFOTYPE = b"6"                  # fifo special device
+CONTTYPE = b"7"                  # contiguous file
 
-GNUTYPE_LONGNAME = "L"          # GNU tar longname
-GNUTYPE_LONGLINK = "K"          # GNU tar longlink
-GNUTYPE_SPARSE = "S"            # GNU tar sparse file
+GNUTYPE_LONGNAME = b"L"          # GNU tar longname
+GNUTYPE_LONGLINK = b"K"          # GNU tar longlink
+GNUTYPE_SPARSE = b"S"            # GNU tar sparse file
 
-XHDTYPE = "x"                   # POSIX.1-2001 extended header
-XGLTYPE = "g"                   # POSIX.1-2001 global header
-SOLARIS_XHDTYPE = "X"           # Solaris extended header
+XHDTYPE = b"x"                   # POSIX.1-2001 extended header
+XGLTYPE = b"g"                   # POSIX.1-2001 global header
+SOLARIS_XHDTYPE = b"X"           # Solaris extended header
 
 USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
 GNU_FORMAT = 1                  # GNU tar format
@@ -176,6 +176,9 @@
 def stn(s, length):
     """Convert a python string to a null-terminated string buffer.
     """
+    #return s[:length].encode('ascii') + (length - len(s)) * NUL
+    if type(s) != type(b''):
+        s = s.encode('ascii')
     return s[:length] + (length - len(s)) * NUL
 
 def nts(s):
@@ -184,8 +187,8 @@
     # Use the string up to the first null char.
     p = s.find("\0")
     if p == -1:
-        return s
-    return s[:p]
+        return s.decode('latin-1')
+    return s[:p].decode('latin-1')
 
 def nti(s):
     """Convert a number field to a python number.
@@ -214,7 +217,7 @@
     # encoding, the following digits-1 bytes are a big-endian
     # representation. This allows values up to (256**(digits-1))-1.
     if 0 <= n < 8 ** (digits - 1):
-        s = "%0*o" % (digits - 1, n) + NUL
+        s = ("%0*o" % (digits - 1, n)).encode('ascii') + NUL
     else:
         if format != GNU_FORMAT or n >= 256 ** (digits - 1):
             raise ValueError("overflow in number field")
@@ -412,7 +415,7 @@
         self.comptype = comptype
         self.fileobj  = fileobj
         self.bufsize  = bufsize
-        self.buf      = ""
+        self.buf      = b""
         self.pos      = 0
         self.closed   = False
 
@@ -434,7 +437,7 @@
             except ImportError:
                 raise CompressionError("bz2 module is not available")
             if mode == "r":
-                self.dbuf = ""
+                self.dbuf = b""
                 self.cmp = bz2.BZ2Decompressor()
             else:
                 self.cmp = bz2.BZ2Compressor()
@@ -451,10 +454,10 @@
                                             self.zlib.DEF_MEM_LEVEL,
                                             0)
         timestamp = struct.pack("<L", int(time.time()))
-        self.__write("\037\213\010\010%s\002\377" % timestamp)
+        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
         if self.name.endswith(".gz"):
             self.name = self.name[:-3]
-        self.__write(self.name + NUL)
+        self.__write(self.name.encode('ascii') + NUL)
 
     def write(self, s):
         """Write string s to the stream.
@@ -487,7 +490,7 @@
 
         if self.mode == "w" and self.buf:
             self.fileobj.write(self.buf)
-            self.buf = ""
+            self.buf = b""
             if self.comptype == "gz":
                 # The native zlib crc is an unsigned 32-bit integer, but
                 # the Python wrapper implicitly casts that to a signed C
@@ -507,12 +510,12 @@
         """Initialize for reading a gzip compressed fileobj.
         """
         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
-        self.dbuf = ""
+        self.dbuf = b""
 
         # taken from gzip.GzipFile with some alterations
-        if self.__read(2) != "\037\213":
+        if self.__read(2) != b"\037\213":
             raise ReadError("not a gzip file")
-        if self.__read(1) != "\010":
+        if self.__read(1) != b"\010":
             raise CompressionError("unsupported compression method")
 
         flag = ord(self.__read(1))
@@ -564,7 +567,7 @@
                 if not buf:
                     break
                 t.append(buf)
-            buf = "".join(t)
+            buf = b"".join(t)
         else:
             buf = self._read(size)
         self.pos += len(buf)
@@ -588,7 +591,7 @@
                 raise ReadError("invalid compressed data")
             t.append(buf)
             c += len(buf)
-        t = "".join(t)
+        t = b"".join(t)
         self.dbuf = t[size:]
         return t[:size]
 
@@ -604,7 +607,7 @@
                 break
             t.append(buf)
             c += len(buf)
-        t = "".join(t)
+        t = b"".join(t)
         self.buf = t[size:]
         return t[:size]
 # class _Stream
@@ -655,7 +658,7 @@
         if self.mode == "r":
             self.bz2obj = bz2.BZ2Decompressor()
             self.fileobj.seek(0)
-            self.buf = ""
+            self.buf = b""
         else:
             self.bz2obj = bz2.BZ2Compressor()
 
@@ -670,7 +673,7 @@
             except EOFError:
                 break
             x += len(data)
-        self.buf = "".join(b)
+        self.buf = b"".join(b)
 
         buf = self.buf[:size]
         self.buf = self.buf[size:]
@@ -753,7 +756,7 @@
                 break
             size -= len(buf)
             data.append(buf)
-        return "".join(data)
+        return b"".join(data)
 
     def readsparsesection(self, size):
         """Read a single section of a sparse file.
@@ -761,7 +764,7 @@
         section = self.sparse.find(self.position)
 
         if section is None:
-            return ""
+            return b""
 
         size = min(size, section.offset + section.size - self.position)
 
@@ -793,7 +796,7 @@
         self.size = tarinfo.size
 
         self.position = 0
-        self.buffer = ""
+        self.buffer = b""
 
     def read(self, size=None):
         """Read at most size bytes from the file. If size is not
@@ -802,11 +805,11 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")
 
-        buf = ""
+        buf = b""
         if self.buffer:
             if size is None:
                 buf = self.buffer
-                self.buffer = ""
+                self.buffer = b""
             else:
                 buf = self.buffer[:size]
                 self.buffer = self.buffer[size:]
@@ -827,16 +830,16 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")
 
-        if "\n" in self.buffer:
-            pos = self.buffer.find("\n") + 1
+        if b"\n" in self.buffer:
+            pos = self.buffer.find(b"\n") + 1
         else:
             buffers = [self.buffer]
             while True:
                 buf = self.fileobj.read(self.blocksize)
                 buffers.append(buf)
-                if not buf or "\n" in buf:
-                    self.buffer = "".join(buffers)
-                    pos = self.buffer.find("\n") + 1
+                if not buf or b"\n" in buf:
+                    self.buffer = b"".join(buffers)
+                    pos = self.buffer.find(b"\n") + 1
                     if pos == 0:
                         # no newline found.
                         pos = len(self.buffer)
@@ -848,7 +851,7 @@
         buf = self.buffer[:pos]
         self.buffer = self.buffer[pos:]
         self.position += len(buf)
-        return buf
+        return buf.decode()
 
     def readlines(self):
         """Return a list with all remaining lines.
@@ -886,7 +889,7 @@
         else:
             raise ValueError("Invalid argument")
 
-        self.buffer = ""
+        self.buffer = b""
         self.fileobj.seek(self.position)
 
     def close(self):
@@ -1015,7 +1018,7 @@
         """
         info["magic"] = GNU_MAGIC
 
-        buf = ""
+        buf = b""
         if len(info["linkname"]) > LENGTH_LINK:
             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)
 
@@ -1071,7 +1074,7 @@
         if pax_headers:
             buf = self._create_pax_generic_header(pax_headers)
         else:
-            buf = ""
+            buf = b""
 
         return buf + self._create_header(info, USTAR_FORMAT)
 
@@ -1108,7 +1111,7 @@
             itn(info.get("gid", 0), 8, format),
             itn(info.get("size", 0), 12, format),
             itn(info.get("mtime", 0), 12, format),
-            "        ", # checksum field
+            b"        ", # checksum field
             info.get("type", REGTYPE),
             stn(info.get("linkname", ""), 100),
             stn(info.get("magic", POSIX_MAGIC), 8),
@@ -1119,9 +1122,9 @@
             stn(info.get("prefix", ""), 155)
         ]
 
-        buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
+        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
         chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
-        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
+        buf = buf[:-364] + ("%06o\0" % chksum).encode('ascii') + buf[-357:]
         return buf
 
     @staticmethod
@@ -1139,10 +1142,10 @@
         """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
            for name.
         """
-        name += NUL
+        name = name.encode('ascii') + NUL
 
         info = {}
-        info["name"] = "././@LongLink"
+        info["name"] = b"././@LongLink"
         info["type"] = type
         info["size"] = len(name)
         info["magic"] = GNU_MAGIC
@@ -1324,7 +1327,7 @@
             lastpos = offset + numbytes
             pos += 24
 
-        isextended = ord(buf[482])
+        isextended = buf[482]
         origsize = nti(buf[483:495])
 
         # If the isextended flag is given,
@@ -1344,7 +1347,7 @@
                 realpos += numbytes
                 lastpos = offset + numbytes
                 pos += 24
-            isextended = ord(buf[504])
+            isextended = buf[504]
 
         if lastpos < origsize:
             sp.append(_hole(lastpos, origsize - lastpos))
Index: test/test_tarfile.py
===================================================================
--- test/test_tarfile.py	(revision 56784)
+++ test/test_tarfile.py	(working copy)
@@ -115,7 +115,7 @@
         fobj.seek(0, 2)
         self.assertEqual(tarinfo.size, fobj.tell(),
                      "seek() to file's end failed")
-        self.assert_(fobj.read() == "",
+        self.assert_(fobj.read() == b"",
                      "read() at file's end did not return empty string")
         fobj.seek(-tarinfo.size, 2)
         self.assertEqual(0, fobj.tell(),