[Python-3000] Plea for help: python/branches/py3k-struni/Lib/tarfile.py
Paul Jimenez
pj at place.org
Tue Aug 7 06:37:49 CEST 2007
This evening I had a couple of hours to spare and happened to read Guido's
plea for help near the beginning of it. I picked up a failing testcase
that no one had claimed and did what I could: it's not finished, but it
fixes approximately 75% of the errors in test_tarfile. I concentrated
on fixing problems that the testcase turned up; a pure inspection of
the source would turn up lots of things I missed, I'm sure. I hope it's
useful; it probably needs minor attention from me on what the Right Thing
to do is in the case of encoding and decoding: ascii? I had to do a
.decode('latin-1') to pass the umlaut-in-a-filename test, but I'm not at
all sure that that's the true Right Thing. Anyway, here's a start; I'm
explicitly *not* claiming that I'll ever touch this source code again; I
don't want to block anyone else from working on it. Enjoy.
--pj
Index: tarfile.py
===================================================================
--- tarfile.py (revision 56785)
+++ tarfile.py (working copy)
@@ -72,33 +72,33 @@
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
-NUL = "\0" # the null character
+NUL = b"\0" # the null character
BLOCKSIZE = 512 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20 # length of records
-GNU_MAGIC = "ustar \0" # magic gnu tar string
-POSIX_MAGIC = "ustar\x0000" # magic posix tar string
+GNU_MAGIC = b"ustar \0" # magic gnu tar string
+POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
LENGTH_NAME = 100 # maximum length of a filename
LENGTH_LINK = 100 # maximum length of a linkname
LENGTH_PREFIX = 155 # maximum length of the prefix field
-REGTYPE = "0" # regular file
-AREGTYPE = "\0" # regular file
-LNKTYPE = "1" # link (inside tarfile)
-SYMTYPE = "2" # symbolic link
-CHRTYPE = "3" # character special device
-BLKTYPE = "4" # block special device
-DIRTYPE = "5" # directory
-FIFOTYPE = "6" # fifo special device
-CONTTYPE = "7" # contiguous file
+REGTYPE = b"0" # regular file
+AREGTYPE = b"\0" # regular file
+LNKTYPE = b"1" # link (inside tarfile)
+SYMTYPE = b"2" # symbolic link
+CHRTYPE = b"3" # character special device
+BLKTYPE = b"4" # block special device
+DIRTYPE = b"5" # directory
+FIFOTYPE = b"6" # fifo special device
+CONTTYPE = b"7" # contiguous file
-GNUTYPE_LONGNAME = "L" # GNU tar longname
-GNUTYPE_LONGLINK = "K" # GNU tar longlink
-GNUTYPE_SPARSE = "S" # GNU tar sparse file
+GNUTYPE_LONGNAME = b"L" # GNU tar longname
+GNUTYPE_LONGLINK = b"K" # GNU tar longlink
+GNUTYPE_SPARSE = b"S" # GNU tar sparse file
-XHDTYPE = "x" # POSIX.1-2001 extended header
-XGLTYPE = "g" # POSIX.1-2001 global header
-SOLARIS_XHDTYPE = "X" # Solaris extended header
+XHDTYPE = b"x" # POSIX.1-2001 extended header
+XGLTYPE = b"g" # POSIX.1-2001 global header
+SOLARIS_XHDTYPE = b"X" # Solaris extended header
USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1 # GNU tar format
@@ -176,6 +176,9 @@
def stn(s, length):
"""Convert a python string to a null-terminated string buffer.
"""
+ #return s[:length].encode('ascii') + (length - len(s)) * NUL
+ if type(s) != type(b''):
+ s = s.encode('ascii')
return s[:length] + (length - len(s)) * NUL
def nts(s):
@@ -184,8 +187,8 @@
# Use the string up to the first null char.
p = s.find("\0")
if p == -1:
- return s
- return s[:p]
+ return s.decode('latin-1')
+ return s[:p].decode('latin-1')
def nti(s):
"""Convert a number field to a python number.
@@ -214,7 +217,7 @@
# encoding, the following digits-1 bytes are a big-endian
# representation. This allows values up to (256**(digits-1))-1.
if 0 <= n < 8 ** (digits - 1):
- s = "%0*o" % (digits - 1, n) + NUL
+ s = ("%0*o" % (digits - 1, n)).encode('ascii') + NUL
else:
if format != GNU_FORMAT or n >= 256 ** (digits - 1):
raise ValueError("overflow in number field")
@@ -412,7 +415,7 @@
self.comptype = comptype
self.fileobj = fileobj
self.bufsize = bufsize
- self.buf = ""
+ self.buf = b""
self.pos = 0
self.closed = False
@@ -434,7 +437,7 @@
except ImportError:
raise CompressionError("bz2 module is not available")
if mode == "r":
- self.dbuf = ""
+ self.dbuf = b""
self.cmp = bz2.BZ2Decompressor()
else:
self.cmp = bz2.BZ2Compressor()
@@ -451,10 +454,10 @@
self.zlib.DEF_MEM_LEVEL,
0)
timestamp = struct.pack("<L", int(time.time()))
- self.__write("\037\213\010\010%s\002\377" % timestamp)
+ self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
if self.name.endswith(".gz"):
self.name = self.name[:-3]
- self.__write(self.name + NUL)
+ self.__write(self.name.encode('ascii') + NUL)
def write(self, s):
"""Write string s to the stream.
@@ -487,7 +490,7 @@
if self.mode == "w" and self.buf:
self.fileobj.write(self.buf)
- self.buf = ""
+ self.buf = b""
if self.comptype == "gz":
# The native zlib crc is an unsigned 32-bit integer, but
# the Python wrapper implicitly casts that to a signed C
@@ -507,12 +510,12 @@
"""Initialize for reading a gzip compressed fileobj.
"""
self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
- self.dbuf = ""
+ self.dbuf = b""
# taken from gzip.GzipFile with some alterations
- if self.__read(2) != "\037\213":
+ if self.__read(2) != b"\037\213":
raise ReadError("not a gzip file")
- if self.__read(1) != "\010":
+ if self.__read(1) != b"\010":
raise CompressionError("unsupported compression method")
flag = ord(self.__read(1))
@@ -564,7 +567,7 @@
if not buf:
break
t.append(buf)
- buf = "".join(t)
+ buf = b"".join(t)
else:
buf = self._read(size)
self.pos += len(buf)
@@ -588,7 +591,7 @@
raise ReadError("invalid compressed data")
t.append(buf)
c += len(buf)
- t = "".join(t)
+ t = b"".join(t)
self.dbuf = t[size:]
return t[:size]
@@ -604,7 +607,7 @@
break
t.append(buf)
c += len(buf)
- t = "".join(t)
+ t = b"".join(t)
self.buf = t[size:]
return t[:size]
# class _Stream
@@ -655,7 +658,7 @@
if self.mode == "r":
self.bz2obj = bz2.BZ2Decompressor()
self.fileobj.seek(0)
- self.buf = ""
+ self.buf = b""
else:
self.bz2obj = bz2.BZ2Compressor()
@@ -670,7 +673,7 @@
except EOFError:
break
x += len(data)
- self.buf = "".join(b)
+ self.buf = b"".join(b)
buf = self.buf[:size]
self.buf = self.buf[size:]
@@ -753,7 +756,7 @@
break
size -= len(buf)
data.append(buf)
- return "".join(data)
+ return b"".join(data)
def readsparsesection(self, size):
"""Read a single section of a sparse file.
@@ -761,7 +764,7 @@
section = self.sparse.find(self.position)
if section is None:
- return ""
+ return b""
size = min(size, section.offset + section.size - self.position)
@@ -793,7 +796,7 @@
self.size = tarinfo.size
self.position = 0
- self.buffer = ""
+ self.buffer = b""
def read(self, size=None):
"""Read at most size bytes from the file. If size is not
@@ -802,11 +805,11 @@
if self.closed:
raise ValueError("I/O operation on closed file")
- buf = ""
+ buf = b""
if self.buffer:
if size is None:
buf = self.buffer
- self.buffer = ""
+ self.buffer = b""
else:
buf = self.buffer[:size]
self.buffer = self.buffer[size:]
@@ -827,16 +830,16 @@
if self.closed:
raise ValueError("I/O operation on closed file")
- if "\n" in self.buffer:
- pos = self.buffer.find("\n") + 1
+ if b"\n" in self.buffer:
+ pos = self.buffer.find(b"\n") + 1
else:
buffers = [self.buffer]
while True:
buf = self.fileobj.read(self.blocksize)
buffers.append(buf)
- if not buf or "\n" in buf:
- self.buffer = "".join(buffers)
- pos = self.buffer.find("\n") + 1
+ if not buf or b"\n" in buf:
+ self.buffer = b"".join(buffers)
+ pos = self.buffer.find(b"\n") + 1
if pos == 0:
# no newline found.
pos = len(self.buffer)
@@ -848,7 +851,7 @@
buf = self.buffer[:pos]
self.buffer = self.buffer[pos:]
self.position += len(buf)
- return buf
+ return buf.decode()
def readlines(self):
"""Return a list with all remaining lines.
@@ -886,7 +889,7 @@
else:
raise ValueError("Invalid argument")
- self.buffer = ""
+ self.buffer = b""
self.fileobj.seek(self.position)
def close(self):
@@ -1015,7 +1018,7 @@
"""
info["magic"] = GNU_MAGIC
- buf = ""
+ buf = b""
if len(info["linkname"]) > LENGTH_LINK:
buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)
@@ -1071,7 +1074,7 @@
if pax_headers:
buf = self._create_pax_generic_header(pax_headers)
else:
- buf = ""
+ buf = b""
return buf + self._create_header(info, USTAR_FORMAT)
@@ -1108,7 +1111,7 @@
itn(info.get("gid", 0), 8, format),
itn(info.get("size", 0), 12, format),
itn(info.get("mtime", 0), 12, format),
- " ", # checksum field
+ b" ", # checksum field
info.get("type", REGTYPE),
stn(info.get("linkname", ""), 100),
stn(info.get("magic", POSIX_MAGIC), 8),
@@ -1119,9 +1122,9 @@
stn(info.get("prefix", ""), 155)
]
- buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
+ buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
- buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
+ buf = buf[:-364] + ("%06o\0" % chksum).encode('ascii') + buf[-357:]
return buf
@staticmethod
@@ -1139,10 +1142,10 @@
"""Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
for name.
"""
- name += NUL
+ name = name.encode('ascii') + NUL
info = {}
- info["name"] = "././@LongLink"
+ info["name"] = b"././@LongLink"
info["type"] = type
info["size"] = len(name)
info["magic"] = GNU_MAGIC
@@ -1324,7 +1327,7 @@
lastpos = offset + numbytes
pos += 24
- isextended = ord(buf[482])
+ isextended = buf[482]
origsize = nti(buf[483:495])
# If the isextended flag is given,
@@ -1344,7 +1347,7 @@
realpos += numbytes
lastpos = offset + numbytes
pos += 24
- isextended = ord(buf[504])
+ isextended = buf[504]
if lastpos < origsize:
sp.append(_hole(lastpos, origsize - lastpos))
Index: test/test_tarfile.py
===================================================================
--- test/test_tarfile.py (revision 56784)
+++ test/test_tarfile.py (working copy)
@@ -115,7 +115,7 @@
fobj.seek(0, 2)
self.assertEqual(tarinfo.size, fobj.tell(),
"seek() to file's end failed")
- self.assert_(fobj.read() == "",
+ self.assert_(fobj.read() == b"",
"read() at file's end did not return empty string")
fobj.seek(-tarinfo.size, 2)
self.assertEqual(0, fobj.tell(),
More information about the Python-3000
mailing list