[pypy-commit] pypy utf8-unicode2: Refactor Utf8Builder API some; don't allow .append(<some int>)
waedt
noreply at buildbot.pypy.org
Fri Aug 8 09:22:42 CEST 2014
Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72719:b5e27ed82427
Date: 2014-08-08 02:12 -0500
http://bitbucket.org/pypy/pypy/changeset/b5e27ed82427/
Log: Refactor Utf8Builder API some; don't allow .append(<some int>)
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -9,10 +9,10 @@
def build_utf8str():
builder = Utf8Builder()
- builder.append('A') #0x41
- builder.append(0x10F) #0xC4 0x8F
- builder.append(0x20AC) #0xE2 0x82 0xAC
- builder.append(0x1F63D) #0xF0 0x9F 0x98 0xBD
+ builder.append_ascii('A') #0x41
+ builder.append_codepoint(0x10F) #0xC4 0x8F
+ builder.append_codepoint(0x20AC) #0xE2 0x82 0xAC
+ builder.append_codepoint(0x1F63D) #0xF0 0x9F 0x98 0xBD
return builder.build()
def test_builder():
@@ -88,7 +88,7 @@
def test_unicode_literal_comparison():
builder = Utf8Builder()
- builder.append(0x10F)
+ builder.append_codepoint(0x10F)
s = builder.build()
assert s == u'\u010F'
assert s[0] == u'\u010F'
diff --git a/pypy/interpreter/test/test_utf8_codecs.py b/pypy/interpreter/test/test_utf8_codecs.py
--- a/pypy/interpreter/test/test_utf8_codecs.py
+++ b/pypy/interpreter/test/test_utf8_codecs.py
@@ -58,7 +58,7 @@
assert t is s
assert start == startingpos
assert stop == endingpos
- return "42424242", None, stop
+ return Utf8Str("42424242"), None, stop
encoder = self.getencoder(encoding)
result = encoder(s, len(s), "foo!", errorhandler)
assert called[0]
@@ -85,8 +85,8 @@
assert stop == endingpos
if msg is not None:
assert errmsg == msg
- return "42424242", stop
- return "", endingpos
+ return Utf8Str("42424242"), stop
+ return Utf8Str(""), endingpos
decoder = self.getdecoder(encoding)
if addstuff:
s += "some rest in ascii"
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -21,7 +21,7 @@
# Like unichr, but returns a Utf8Str object
# TODO: Do this without the builder so its faster
b = Utf8Builder()
- b.append(value)
+ b.append_codepoint(value)
return b.build()
def utf8ord_bytes(bytes, start):
@@ -130,6 +130,13 @@
else:
self._len = len(data)
+ if not we_are_translated():
+ self.bytes.decode('utf8')
+
+ if self._is_ascii:
+ for i in self.bytes:
+ assert ord(i) < 128
+
def _calc_length(self):
pos = 0
length = 0
@@ -559,15 +566,15 @@
i += 1
c2 = intmask(array[i])
if c2 == 0:
- builder.append(c)
+ builder.append_codepoint(c)
break
elif not (0xDC00 <= c2 <= 0xDFFF):
- builder.append(c)
+ builder.append_codepoint(c)
c = c2
else:
c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
- builder.append(c)
+ builder.append_codepoint(c)
i += 1
return builder.build()
@@ -587,15 +594,15 @@
i += 1
c2 = intmask(array[i])
if c2 == 0:
- builder.append(c)
+ builder.append_codepoint(c)
break
elif not (0xDC00 <= c2 <= 0xDFFF):
- builder.append(c)
+ builder.append_codepoint(c)
c = c2
else:
c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
- builder.append(c)
+ builder.append_codepoint(c)
i += 1
return builder.build()
@@ -613,12 +620,12 @@
i += 1
c2 = intmask(array[i])
if not (0xDC00 <= c2 <= 0xDFFF):
- builder.append(c)
+ builder.append_codepoint(c)
c = c2
else:
c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
- builder.append(c)
+ builder.append_codepoint(c)
i += 1
return builder.build()
@@ -634,42 +641,54 @@
self._length = 0
+ def append_codepoint(self, c):
+ if c < 0x80:
+ self._builder.append(chr(c))
+ elif c < 0x800:
+ self._builder.append(chr(0xC0 | (c >> 6)))
+ self._builder.append(chr(0x80 | (c & 0x3F)))
+ self._is_ascii = False
+ elif c < 0x10000:
+ self._builder.append(chr(0xE0 | (c >> 12)))
+ self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
+ self._builder.append(chr(0x80 | (c & 0x3F)))
+ self._is_ascii = False
+ elif c <= 0x10FFFF:
+ self._builder.append(chr(0xF0 | (c >> 18)))
+ self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
+ self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
+ self._builder.append(chr(0x80 | (c & 0x3F)))
+ self._is_ascii = False
+ else:
+ raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
+ self._length += 1
+
+ def append_ascii(self, str):
+ if not we_are_translated():
+ # XXX For testing purposes, make sure this is actually ascii
+ for i in str:
+ assert ord(i) < 128
+
+ self._builder.append(str)
+ self._length += len(str)
+
+ def append_utf8(self, ustr):
+ self._builder.append(ustr.bytes)
+ if not ustr._is_ascii:
+ self._is_ascii = False
+ self._length += len(ustr)
+
+ def _append_bytes(self, bytes, is_ascii=False):
+ # XXX This breaks getlength()
+ self._builder.append(bytes)
+ self._is_ascii = self._is_ascii and is_ascii
+
@specialize.argtype(1)
def append(self, c):
if isinstance(c, Utf8Str):
- self._builder.append(c.bytes)
- if not c._is_ascii:
- self._is_ascii = False
- self._length += len(c)
-
- elif isinstance(c, int):
- if c < 0x80:
- self._builder.append(chr(c))
- elif c < 0x800:
- self._builder.append(chr(0xC0 | (c >> 6)))
- self._builder.append(chr(0x80 | (c & 0x3F)))
- self._is_ascii = False
- elif c < 0x10000:
- self._builder.append(chr(0xE0 | (c >> 12)))
- self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
- self._builder.append(chr(0x80 | (c & 0x3F)))
- self._is_ascii = False
- elif c <= 0x10FFFF:
- self._builder.append(chr(0xF0 | (c >> 18)))
- self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
- self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
- self._builder.append(chr(0x80 | (c & 0x3F)))
- self._is_ascii = False
- else:
- raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
- self._length += 1
+ self.append_utf8(c)
else:
- assert isinstance(c, str)
- self._builder.append(c)
-
- # XXX The assumption here is that the bytes being appended are
- # ASCII, ie 1:1 byte:char
- self._length += len(c)
+ self.append_ascii(c)
@specialize.argtype(1)
def append_slice(self, s, start, end):
@@ -685,18 +704,12 @@
type(s))
self._length += end - start
- @specialize.argtype(1)
def append_multiple_char(self, c, count):
# TODO: What do I do when I have an int? Is it fine to just loop over
# .append(c) then? Should (can) I force a resize first?
- if isinstance(c, int):
- self._builder.append_multiple_char(chr(c), count)
- return
-
- if isinstance(c, str):
- self._builder.append_multiple_char(c, count)
- else:
- self._builder.append_multiple_char(c.bytes, count)
+ if ord(c) > 127:
+ raise ValueError("an ascii char is required")
+ self._builder.append_multiple_char(c, count)
self._length += count
def getlength(self):
@@ -705,6 +718,7 @@
def build(self):
return Utf8Str(self._builder.build(), self._is_ascii)
+
class WCharContextManager(object):
def __init__(self, str):
self.str = str
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -34,7 +34,7 @@
# Non-escape characters are interpreted as Unicode ordinals
if ch != '\\':
- builder.append(ord(ch))
+ builder.append_codepoint(ord(ch))
pos += 1
continue
@@ -44,23 +44,23 @@
message = "\\ at end of string"
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, size)
- builder.append(res)
+ builder.append_utf8(res)
continue
ch = s[pos]
pos += 1
# \x escapes
if ch == '\n': pass
- elif ch == '\\': builder.append('\\')
- elif ch == '\'': builder.append('\'')
- elif ch == '\"': builder.append('\"')
- elif ch == 'b' : builder.append('\b')
- elif ch == 'f' : builder.append('\f')
- elif ch == 't' : builder.append('\t')
- elif ch == 'n' : builder.append('\n')
- elif ch == 'r' : builder.append('\r')
- elif ch == 'v' : builder.append('\v')
- elif ch == 'a' : builder.append('\a')
+ elif ch == '\\': builder.append_ascii('\\')
+ elif ch == '\'': builder.append_ascii('\'')
+ elif ch == '\"': builder.append_ascii('\"')
+ elif ch == 'b' : builder.append_ascii('\b')
+ elif ch == 'f' : builder.append_ascii('\f')
+ elif ch == 't' : builder.append_ascii('\t')
+ elif ch == 'n' : builder.append_ascii('\n')
+ elif ch == 'r' : builder.append_ascii('\r')
+ elif ch == 'v' : builder.append_ascii('\v')
+ elif ch == 'a' : builder.append_ascii('\a')
elif '0' <= ch <= '7':
x = ord(ch) - ord('0')
if pos < size:
@@ -73,7 +73,7 @@
if '0' <= ch <= '7':
pos += 1
x = (x<<3) + ord(ch) - ord('0')
- builder.append(x)
+ builder.append_codepoint(x)
# hex escapes
# \xXX
elif ch == 'x':
@@ -105,7 +105,7 @@
"(can't load unicodedata module)")
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, size)
- builder.append(res)
+ builder.append_utf8(res)
continue
if look < size and s[look] == '{':
@@ -120,21 +120,21 @@
if code < 0:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
- builder.append(res)
+ builder.append_utf8(res)
continue
pos = look + 1
- builder.append(code)
+ builder.append_codepoint(code)
else:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
- builder.append(res)
+ builder.append_utf8(res)
else:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
- builder.append(res)
+ builder.append_utf8(res)
else:
- builder.append('\\')
- builder.append(ord(ch))
+ builder.append_ascii('\\')
+ builder.append_codepoint(ord(ch))
return builder.build(), pos
@@ -149,7 +149,7 @@
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
- builder.append(res)
+ builder.append_utf8(res)
else:
try:
chr = r_uint(int(s[pos:pos+digits], 16))
@@ -159,18 +159,18 @@
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
- builder.append(res)
+ builder.append_utf8(res)
else:
# when we get here, chr is a 32-bit unicode character
if chr <= MAXUNICODE:
- builder.append(intmask(chr))
+ builder.append_codepoint(intmask(chr))
pos += digits
else:
message = "illegal Unicode character"
res, pos = errorhandler(errors, encoding,
message, s, pos-2, pos+digits)
- builder.append(res)
+ builder.append_utf8(res)
return pos
def make_unicode_escape_function(pass_printable=False, unicode_output=False,
@@ -288,7 +288,7 @@
# Non-escape characters are interpreted as Unicode ordinals
if ch != '\\':
- result.append(ord(ch))
+ result.append_codepoint(ord(ch))
pos += 1
continue
@@ -299,18 +299,18 @@
pos += 1
if pos == size or s[pos] != '\\':
break
- result.append('\\')
+ result.append_ascii('\\')
# we have a backslash at the end of the string, stop here
if pos >= size:
- result.append('\\')
+ result.append_ascii('\\')
break
if ((pos - bs) & 1 == 0 or
pos >= size or
(s[pos] != 'u' and s[pos] != 'U')):
- result.append('\\')
- result.append(ord(s[pos]))
+ result.append_ascii('\\')
+ result.append_codepoint(ord(s[pos]))
pos += 1
continue
@@ -350,7 +350,7 @@
pos = 0
result = Utf8Builder(size)
while pos < size:
- result.append(ord(s[pos]))
+ result.append_codepoint(ord(s[pos]))
pos += 1
return result.build(), pos
@@ -370,12 +370,12 @@
while pos < size:
c = s[pos]
if ord(c) < 128:
- result.append(c)
+ result.append_ascii(c)
pos += 1
else:
r, pos = errorhandler(errors, "ascii", "ordinal not in range(128)",
s, pos, pos + 1)
- result.append(r)
+ result.append_utf8(r)
return result.build(), pos
@@ -416,8 +416,9 @@
# py3k only
result.append(rs)
continue
+
for ch in ru:
- cd = utf8.ORD(ch, 0)
+ cd = utf8ord(ch, 0)
if cd < limit:
result.append(chr(cd))
else:
@@ -470,9 +471,10 @@
if (iter.pos != len(s) and oc <= 0xDBFF and
0xDC00 <= iter.peek_next() <= 0xDFFF):
oc2 = iter.next()
- result.append(((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
+ result.append_codepoint(
+ ((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
elif allow_surrogates:
- result.append(oc)
+ result.append_codepoint(oc)
else:
ru, rs, pos = errorhandler(errors, 'utf8',
'surrogates not allowed', s,
@@ -480,17 +482,17 @@
iter.move(pos - iter.pos)
if rs is not None:
# py3k only
- result.append(rs)
+ result.append_utf8(rs)
continue
for ch in ru:
if ord(ch) < 0x80:
- result.append(ch)
+ result.append_ascii(ch)
else:
errorhandler('strict', 'utf8',
'surrogates not allowed',
s, pos-1, pos)
else:
- result.append(oc)
+ result.append_codepoint(oc)
return result.build().bytes
@@ -516,7 +518,7 @@
# fast path for ASCII
# XXX maybe use a while loop here
if ordch1 < 0x80:
- result.append(ordch1)
+ result.append_codepoint(ordch1)
pos += 1
continue
@@ -532,7 +534,7 @@
r, pos = errorhandler(errors, 'utf8',
'unexpected end of data',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
break
ordch2 = ord(s[pos+1])
if n == 3:
@@ -544,14 +546,14 @@
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
continue
else:
# second byte valid, but third byte missing
r, pos = errorhandler(errors, 'utf8',
'unexpected end of data',
s, pos, pos+2)
- result.append(r)
+ result.append_utf8(r)
break
elif n == 4:
# 4-bytes seq with 1 or 2 continuation bytes
@@ -562,28 +564,28 @@
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
continue
elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2: # 0b10
# third byte invalid, take the first two and continue
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+2)
- result.append(r)
+ result.append_utf8(r)
continue
else:
# there's only 1 or 2 valid cb, but the others are missing
r, pos = errorhandler(errors, 'utf8',
'unexpected end of data',
s, pos, pos+charsleft+1)
- result.append(r)
+ result.append_utf8(r)
break
if n == 0:
r, pos = errorhandler(errors, 'utf8',
'invalid start byte',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
elif n == 1:
assert 0, "ascii should have gone through the fast path"
@@ -594,11 +596,11 @@
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
continue
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
- result.append(((ordch1 & 0x1F) << 6) + # 0b00011111
- (ordch2 & 0x3F)) # 0b00111111
+ result.append_codepoint(((ordch1 & 0x1F) << 6) + # 0b00011111
+ (ordch2 & 0x3F)) # 0b00111111
pos += 2
elif n == 3:
@@ -612,18 +614,18 @@
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
continue
elif ordch3>>6 != 0x2: # 0b10
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+2)
- result.append(r)
+ result.append_utf8(r)
continue
# 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
- result.append((((ordch1 & 0x0F) << 12) + # 0b00001111
- ((ordch2 & 0x3F) << 6) + # 0b00111111
- (ordch3 & 0x3F))) # 0b00111111
+ result.append_codepoint((((ordch1 & 0x0F) << 12) + # 0b00001111
+ ((ordch2 & 0x3F) << 6) + # 0b00111111
+ (ordch3 & 0x3F))) # 0b00111111
pos += 3
elif n == 4:
@@ -636,19 +638,19 @@
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
- result.append(r)
+ result.append_utf8(r)
continue
elif ordch3>>6 != 0x2: # 0b10
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+2)
- result.append(r)
+ result.append_utf8(r)
continue
elif ordch4>>6 != 0x2: # 0b10
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+3)
- result.append(r)
+ result.append_utf8(r)
continue
# 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
c = (((ordch1 & 0x07) << 18) + # 0b00000111
@@ -659,7 +661,7 @@
# TODO: Why doesn't this raise an error when c > MAXUNICODE? If I'm
# converting utf8 -> utf8 is this necessary
if c <= MAXUNICODE:
- result.append(c)
+ result.append_codepoint(c)
pos += 4
return pos
@@ -748,13 +750,13 @@
break
r, pos = errorhandler(errors, 'utf16', "truncated data",
s, pos, len(s))
- result.append(r)
+ result.append_utf8(r)
if len(s) - pos < 2:
break
ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
pos += 2
if ch < 0xD800 or ch > 0xDFFF:
- result.append(ch)
+ result.append_codepoint(ch)
continue
# UTF-16 code pair:
if len(s) - pos < 2:
@@ -763,26 +765,26 @@
break
errmsg = "unexpected end of data"
r, pos = errorhandler(errors, 'utf16', errmsg, s, pos, len(s))
- result.append(r)
+ result.append_utf8(r)
if len(s) - pos < 2:
break
elif 0xD800 <= ch <= 0xDBFF:
ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
pos += 2
if 0xDC00 <= ch2 <= 0xDFFF:
- result.append((((ch & 0x3FF)<<10) |
- (ch2 & 0x3FF)) + 0x10000)
+ result.append_codepoint((((ch & 0x3FF)<<10) |
+ (ch2 & 0x3FF)) + 0x10000)
continue
else:
r, pos = errorhandler(errors, 'utf16',
"illegal UTF-16 surrogate",
s, pos - 4, pos - 2)
- result.append(r)
+ result.append_utf8(r)
else:
r, pos = errorhandler(errors, 'utf16',
"illegal encoding",
s, pos - 2, pos)
- result.append(r)
+ result.append_utf8(r)
return result.build(), pos, bo
def create_surrogate_pair(val):
@@ -930,7 +932,7 @@
break
r, pos = errorhandler(errors, encodingname, "truncated data",
s, pos, len(s))
- result.append(r)
+ result.append_utf8(r)
if len(s) - pos < 4:
break
continue
@@ -940,10 +942,10 @@
r, pos = errorhandler(errors, encodingname,
"codepoint not in range(0x110000)",
s, pos, len(s))
- result.append(r)
+ result.append_utf8(r)
continue
- result.append(ch)
+ result.append_codepoint(ch)
pos += 4
return result.build(), pos, bo
@@ -1131,19 +1133,19 @@
if surrogate:
# expecting a second surrogate
if outCh >= 0xDC00 and outCh <= 0xDFFFF:
- result.append((((surrogate & 0x3FF)<<10) |
- (outCh & 0x3FF)) + 0x10000)
+ result.append_codepoint((((surrogate & 0x3FF)<<10) |
+ (outCh & 0x3FF)) + 0x10000)
surrogate = 0
continue
else:
- result.append(surrogate)
+ result.append_codepoint(surrogate)
surrogate = 0
# Not done with outCh: falls back to next line
if outCh >= 0xD800 and outCh <= 0xDBFF:
# first surrogate
surrogate = outCh
else:
- result.append(outCh)
+ result.append_codepoint(outCh)
else:
# now leaving a base-64 section
@@ -1151,7 +1153,7 @@
pos += 1
if surrogate:
- result.append(surrogate)
+ result.append_codepoint(surrogate)
surrogate = 0
if base64bits > 0: # left-over bits
@@ -1160,7 +1162,7 @@
msg = "partial character in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
- result.append(res)
+ result.append_utf8(res)
continue
else:
# Some bits remain; they should be zero
@@ -1168,7 +1170,7 @@
msg = "non-zero padding bits in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
- result.append(res)
+ result.append_utf8(res)
continue
if ch == '-':
@@ -1178,13 +1180,13 @@
base64buffer = 0
surrogate = 0
else:
- result.append(ch)
+ result.append_codepoint(oc)
elif ch == '+':
pos += 1 # consume '+'
if pos < size and s[pos] == '-': # '+-' encodes '+'
pos += 1
- result.append('+')
+ result.append_ascii('+')
else: # begin base64-encoded section
inShift = 1
shiftOutStartPos = pos - 1
@@ -1192,13 +1194,13 @@
base64buffer = 0
elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
- result.append(chr(oc))
+ result.append_codepoint(oc)
pos += 1
else:
pos += 1
msg = "unexpected special character"
res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
- result.append(res)
+ result.append_utf8(res)
# end of string
@@ -1209,7 +1211,7 @@
(base64bits > 0 and base64buffer != 0)):
msg = "unterminated shift sequence"
res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
- result.append(res)
+ result.append_utf8(res)
elif inShift:
pos = shiftOutStartPos # back off output
@@ -1298,9 +1300,9 @@
r, pos = errorhandler(errors, "charmap",
"character maps to <undefined>",
s, pos, pos + 1)
- result.append(r)
+ result.append_utf8(r)
continue
- result.append(c)
+ result.append_utf8(c)
pos += 1
return result.build(), pos
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -231,9 +231,9 @@
pos = start
while pos < end:
code = utf8ord(obj, pos)
- builder.append("&#")
- builder.append(str(code))
- builder.append(";")
+ builder.append_ascii("&#")
+ builder.append_ascii(str(code))
+ builder.append_ascii(";")
pos += 1
return space.newtuple([space.wrap(builder.build()), w_end])
else:
@@ -254,13 +254,13 @@
oc = utf8ord(obj, pos)
num = hex(oc)
if (oc >= 0x10000):
- builder.append("\\U")
+ builder.append_ascii("\\U")
zeros = 8
elif (oc >= 0x100):
- builder.append("\\u")
+ builder.append_ascii("\\u")
zeros = 4
else:
- builder.append("\\x")
+ builder.append_ascii("\\x")
zeros = 2
lnum = len(num)
nb = zeros + 2 - lnum # num starts with '0x'
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -141,9 +141,9 @@
i += 1
else:
seennl |= SEEN_CR
- builder.append('\n')
+ builder.append_ascii('\n')
continue
- builder.append(c)
+ builder.append_codepoint(c)
output = builder.build()
self.seennl |= seennl
@@ -614,7 +614,7 @@
# Keep reading chunks until we have n characters to return
while True:
data = self._get_decoded_chars(remaining)
- builder.append(data)
+ builder.append_utf8(data)
remaining -= len(data)
if remaining <= 0: # Done
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -418,8 +418,8 @@
result.append_multiple_char(' ', padding)
# pad with spaces on the left
if sign:
- # TODO: Why r[0]?
result.append(r[0]) # the sign
+ # prefix is only ever '' or '0x', i.e. always ascii
result.append(prefix) # the prefix
if padnumber == '0':
result.append_multiple_char('0', padding)
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -7,7 +7,7 @@
from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD, utf8chr
from pypy.interpreter.utf8_codecs import (
unicode_encode_latin_1, unicode_encode_ascii, str_decode_ascii)
-from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
+from rpython.rlib import rstring, rlocale, rfloat, jit
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rfloat import copysign, formatd
@@ -698,6 +698,7 @@
need_separator = False
done = False
previous = 0
+
while True:
group = ord(grouping[grouping_state])
if group > 0:
@@ -750,9 +751,7 @@
if spec.n_sign:
if self.is_unicode:
- # TODO: A better way to do this might be to check if
- # spec.sign < 127 ...
- sign = str_decode_ascii(chr(spec.sign), 1, 'strict')[0]
+ sign = str_decode_ascii(chr(spec.sign), 1, 'strict')[0]
else:
sign = chr(spec.sign)
out.append(sign)
More information about the pypy-commit
mailing list