[pypy-svn] r75711 - pypy/branch/interplevel-codecs/pypy/rlib
afa at codespeak.net
afa at codespeak.net
Thu Jul 1 10:52:11 CEST 2010
Author: afa
Date: Thu Jul 1 10:52:09 2010
New Revision: 75711
Modified:
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
- Uniformly handle the decoding of \U and \u
- Decoded \U sequences must be stored as an unsigned int, otherwise \UFFFFEEEE wraps and becomes \ueeee...
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Thu Jul 1 10:52:09 2010
@@ -3,6 +3,7 @@
from pypy.rpython.lltypesystem import lltype, rffi
from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
+from pypy.rlib.rarithmetic import r_uint
if rffi.sizeof(lltype.UniChar) == 4:
MAXUNICODE = 0x10ffff
@@ -780,7 +781,8 @@
hexdigits = "0123456789ABCDEFabcdef"
-def hexescape(builder, s, pos, digits, errorhandler, message, errors):
+def hexescape(builder, s, pos, digits,
+ encoding, errorhandler, message, errors):
import sys
chr = 0
if (pos+digits>len(s)):
@@ -790,12 +792,12 @@
builder.append(res)
else:
try:
- chr = int(s[pos:pos+digits], 16)
+ chr = r_uint(int(s[pos:pos+digits], 16))
except ValueError:
endinpos = pos
while s[endinpos] in hexdigits:
endinpos += 1
- res, pos = errorhandler(errors, "unicodeescape",
+ res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos+1)
builder.append(res)
else:
@@ -811,8 +813,8 @@
pos += digits
else:
message = "illegal Unicode character"
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-2, pos+1)
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, pos+digits)
builder.append(res)
return pos
@@ -878,21 +880,21 @@
digits = 2
message = "truncated \\xXX escape"
pos = hexescape(builder, s, pos, digits,
- errorhandler, message, errors)
+ "unicodeescape", errorhandler, message, errors)
# \uXXXX
elif ch == 'u':
digits = 4
message = "truncated \\uXXXX escape"
pos = hexescape(builder, s, pos, digits,
- errorhandler, message, errors)
+ "unicodeescape", errorhandler, message, errors)
# \UXXXXXXXX
elif ch == 'U':
digits = 8
message = "truncated \\UXXXXXXXX escape"
pos = hexescape(builder, s, pos, digits,
- errorhandler, message, errors)
+ "unicodeescape", errorhandler, message, errors)
# \N{name}
elif ch == 'N':
@@ -1052,31 +1054,14 @@
continue
if s[pos] == 'u':
- count = 4
+ digits = 4
+ message = "truncated \\uXXXX escape"
else:
- count = 8
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
pos += 1
-
- # \uXXXX with 4 hex digits, \Uxxxxxxxx with 8
- x = 0
- try:
- x = int(s[pos:pos+count], 16)
- except ValueError:
- res, pos = errorhandler(errors, "rawunicodeescape",
- "truncated \\uXXXX",
- s, pos, size)
- result.append(res)
- continue
-
- if (x > MAXUNICODE):
- res, pos = errorhandler(errors, "rawunicodeescape",
- "\\Uxxxxxxxx out of range",
- s, pos, size)
- result.append(res)
- continue
-
- result.append(unichr(x))
- pos += count
+ pos = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
return result.build(), pos
@@ -1147,10 +1132,10 @@
if pos > size - unicode_bytes:
break
continue
- t = 0
+ t = r_uint(0)
h = 0
for j in range(start, stop, step):
- t += ord(s[pos + j]) << (h*8)
+ t += r_uint(ord(s[pos + j])) << (h*8)
h += 1
if t > MAXUNICODE:
res, pos = errorhandler(errors, "unicode_internal",
More information about the Pypy-commit
mailing list