[pypy-svn] r75632 - in pypy/branch/interplevel-codecs/pypy: module/_codecs module/_codecs/test rlib
afa at codespeak.net
afa at codespeak.net
Mon Jun 28 13:46:45 CEST 2010
Author: afa
Date: Mon Jun 28 13:46:43 2010
New Revision: 75632
Modified:
pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Port some codecs from applevel to interplevel:
raw_unicode_escape, and the error handlers
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py Mon Jun 28 13:46:43 2010
@@ -9,8 +9,6 @@
'charmap_encode' : 'app_codecs.charmap_encode',
'escape_decode' : 'app_codecs.escape_decode',
'escape_encode' : 'app_codecs.escape_encode',
- 'raw_unicode_escape_decode' : 'app_codecs.raw_unicode_escape_decode',
- 'raw_unicode_escape_encode' : 'app_codecs.raw_unicode_escape_encode',
'unicode_escape_decode' : 'app_codecs.unicode_escape_decode',
'unicode_escape_encode' : 'app_codecs.unicode_escape_encode',
'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
@@ -44,6 +42,8 @@
'charbuffer_encode': 'interp_codecs.buffer_encode',
'readbuffer_encode': 'interp_codecs.buffer_encode',
'charmap_decode' : 'interp_codecs.charmap_decode',
+ 'raw_unicode_escape_decode' : 'interp_codecs.raw_unicode_escape_decode',
+ 'raw_unicode_escape_encode' : 'interp_codecs.raw_unicode_escape_encode',
}
def __init__(self, space, *args):
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py Mon Jun 28 13:46:43 2010
@@ -46,13 +46,6 @@
v = s[1:-1]
return v, len(v)
-def raw_unicode_escape_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeRawUnicodeEscape(data, len(data), errors)
- res = u''.join(res)
- return res, len(data)
-
def utf_7_decode( data, errors='strict'):
"""None
"""
@@ -205,13 +198,6 @@
res = ''.join(res)
return res, len(res)
-def raw_unicode_escape_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeRawUnicodeEscape(obj, len(obj))
- res = ''.join(res)
- return res, len(res)
-
# ----------------------------------------------------------------------
##import sys
@@ -690,30 +676,6 @@
p += ch
return p
-def PyUnicode_EncodeRawUnicodeEscape(s, size):
-
- if (size == 0):
- return ''
-
- p = []
- for ch in s:
-# /* Map 32-bit characters to '\Uxxxxxxxx' */
- if (ord(ch) >= 0x10000):
- p += '\\'
- p += 'U'
- p += '%08x' % (ord(ch))
- elif (ord(ch) >= 256) :
-# /* Map 16-bit characters to '\uxxxx' */
- p += '\\'
- p += 'u'
- p += '%04x' % (ord(ch))
-# /* Copy everything else as-is */
- else:
- p += chr(ord(ch))
-
- #p += '\0'
- return p
-
def charmapencode_output(c, mapping):
rep = mapping[c]
@@ -761,85 +723,6 @@
return res
-def PyUnicode_DecodeRawUnicodeEscape(s, size, errors):
- import sys
-
- if (size == 0):
- return u''
- pos = 0
- p = []
- while (pos < len(s)):
- ch = s[pos]
- #/* Non-escape characters are interpreted as Unicode ordinals */
- if (ch != '\\'):
- p += unichr(ord(ch))
- pos += 1
- continue
- startinpos = pos
-## /* \u-escapes are only interpreted iff the number of leading
-## backslashes is odd */
- bs = pos
- while pos < size:
- if (s[pos] != '\\'):
- break
- p += unichr(ord(s[pos]))
- pos += 1
-
- # we have a backlash at the end of the string, stop here
- if pos >= size:
- break
-
- if (((pos - bs) & 1) == 0 or
- pos >= size or
- (s[pos] != 'u' and s[pos] != 'U')) :
- p += unichr(ord(s[pos]))
- pos += 1
- continue
-
- p.pop(-1)
- if s[pos] == 'u':
- count = 4
- else:
- count = 8
- pos += 1
-
- #/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
- x = 0
- try:
- x = int(s[pos:pos+count], 16)
- except ValueError:
- res = unicode_call_errorhandler(
- errors, "rawunicodeescape", "truncated \\uXXXX",
- s, size, pos, pos+count)
- p += res[0]
- pos = res[1]
- else:
- #ifndef Py_UNICODE_WIDE
- if sys.maxunicode > 0xffff:
- if (x > sys.maxunicode):
- res = unicode_call_errorhandler(
- errors, "rawunicodeescape", "\\Uxxxxxxxx out of range",
- s, size, pos, pos+1)
- pos = res[1]
- p += res[0]
- else:
- p += unichr(x)
- pos += count
- else:
- if (x > 0x10000):
- res = unicode_call_errorhandler(
- errors, "rawunicodeescape", "\\Uxxxxxxxx out of range",
- s, size, pos, pos+1)
- pos = res[1]
- p += res[0]
-
- #endif
- else:
- p += unichr(x)
- pos += count
-
- return p
-
def charmap_build(somestring):
m = {}
num = 0
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py Mon Jun 28 13:46:43 2010
@@ -1,5 +1,5 @@
from pypy.interpreter.error import OperationError, operationerrfmt
-from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, applevel
+from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, interp2app
from pypy.interpreter.baseobjspace import W_Root
from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
from pypy.rlib.objectmodel import we_are_translated
@@ -114,78 +114,124 @@
"unknown encoding: %s", encoding)
lookup_codec.unwrap_spec = [ObjSpace, str]
-app_errors = applevel("""
-def check_exception(exc):
+# ____________________________________________________________
+# Register standard error handlers
+
+def check_exception(space, w_exc):
try:
- delta = exc.end - exc.start
- if delta < 0 or not isinstance(exc.object, (unicode, str)):
- raise TypeError("wrong exception")
- except AttributeError:
- raise TypeError("wrong exception")
-
-def strict_errors(exc):
- if isinstance(exc, Exception):
- raise exc
- else:
- raise TypeError("codec must pass exception instance")
-
-def ignore_errors(exc):
- check_exception(exc)
- if isinstance(exc, UnicodeEncodeError):
- return u'', exc.end
- elif isinstance(exc, (UnicodeDecodeError, UnicodeTranslateError)):
- return u'', exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%exc)
-
-Py_UNICODE_REPLACEMENT_CHARACTER = u"\ufffd"
-
-def replace_errors(exc):
- check_exception(exc)
- if isinstance(exc, UnicodeEncodeError):
- return u'?'*(exc.end-exc.start), exc.end
- elif isinstance(exc, (UnicodeTranslateError, UnicodeDecodeError)):
- return Py_UNICODE_REPLACEMENT_CHARACTER*(exc.end-exc.start), exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%exc)
-
-def xmlcharrefreplace_errors(exc):
- if isinstance(exc, UnicodeEncodeError):
- res = []
- for ch in exc.object[exc.start:exc.end]:
- res += '&#'
- res += str(ord(ch))
- res += ';'
- return u''.join(res), exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
-
-def backslashreplace_errors(exc):
- if isinstance(exc, UnicodeEncodeError):
- p = []
- for c in exc.object[exc.start:exc.end]:
- p += '\\\\'
- oc = ord(c)
- if (oc >= 0x00010000):
- p += 'U'
- p += "%.8x" % ord(c)
+ w_start = space.getattr(w_exc, space.wrap('start'))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ w_obj = space.getattr(w_exc, space.wrap('object'))
+ except OperationError, e:
+ if not e.match(space, space.w_AttributeError):
+ raise
+ raise OperationError(space.w_TypeError, space.wrap(
+ "wrong exception"))
+
+ delta = space.int_w(w_end) - space.int_w(w_start)
+ if delta < 0 or not (space.isinstance_w(w_obj, space.w_str) or
+ space.isinstance_w(w_obj, space.w_unicode)):
+ raise OperationError(space.w_TypeError, space.wrap(
+ "wrong exception"))
+
+def strict_errors(space, w_exc):
+ check_exception(space, w_exc)
+ if space.isinstance_w(w_exc, space.w_BaseException):
+ raise OperationError(space.type(w_exc), w_exc)
+ else:
+ raise OperationError(space.w_TypeError, space.wrap(
+ "codec must pass exception instance"))
+
+def ignore_errors(space, w_exc):
+ check_exception(space, w_exc)
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ return space.newtuple([space.wrap(''), w_end])
+ elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
+ space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+ return space.newtuple([space.wrap(u''), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
+
+def replace_errors(space, w_exc):
+ check_exception(space, w_exc)
+ w_start = space.getattr(w_exc, space.wrap('start'))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ size = space.int_w(w_end) - space.int_w(w_start)
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ text = '?' * size
+ return space.newtuple([space.wrap(text), w_end])
+ elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
+ space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+ text = u'\ufffd' * size
+ return space.newtuple([space.wrap(text), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
+
+def xmlcharrefreplace_errors(space, w_exc):
+ check_exception(space, w_exc)
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+ start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ end = space.int_w(w_end)
+ builder = UnicodeBuilder()
+ pos = start
+ while pos < end:
+ ch = obj[pos]
+ builder.append(u"&#")
+ builder.append(unicode(ord(ch)))
+ builder.append(u";")
+ pos += 1
+ return space.newtuple([space.wrap(builder.build()), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
+
+def backslashreplace_errors(space, w_exc):
+ check_exception(space, w_exc)
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+ start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ end = space.int_w(w_end)
+ builder = UnicodeBuilder()
+ pos = start
+ while pos < end:
+ oc = ord(obj[pos])
+ num = hex(oc)
+ if (oc >= 0x10000):
+ builder.append(u"\\U")
+ zeros = 8
elif (oc >= 0x100):
- p += 'u'
- p += "%.4x" % ord(c)
+ builder.append(u"\\u")
+ zeros = 4
else:
- p += 'x'
- p += "%.2x" % ord(c)
- return u''.join(p), exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
-""")
+ builder.append(u"\\x")
+ zeros = 2
+ nb = zeros + 2 - len(num) # num starts with '0x'
+ if nb > 0:
+ builder.append_multiple_char(u'0', nb)
+ builder.append_slice(unicode(num), 2, 8)
+ pos += 1
+ return space.newtuple([space.wrap(builder.build()), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
def register_builtin_error_handlers(space):
+ "NOT_RPYTHON"
state = space.fromcache(CodecState)
for error in ("strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"):
name = error + "_errors"
- state.codec_error_registry[error] = app_errors.wget(space, name)
+ state.codec_error_registry[error] = space.wrap(interp2app(globals()[name]))
def lookup_error(space, errors):
@@ -312,6 +358,7 @@
"utf_16_encode",
"utf_16_be_encode",
"utf_16_le_encode",
+ "raw_unicode_escape_encode",
]:
make_encoder_wrapper(encoders)
@@ -322,6 +369,7 @@
"utf_16_decode",
"utf_16_be_decode",
"utf_16_le_decode",
+ "raw_unicode_escape_decode",
]:
make_decoder_wrapper(decoders)
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py Mon Jun 28 13:46:43 2010
@@ -15,7 +15,6 @@
def test_bigU_codecs(self):
import sys
- oldmaxunicode = sys.maxunicode
if sys.maxunicode <= 0xffff:
return # this test cannot run on UCS2 builds
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
@@ -23,17 +22,14 @@
'raw_unicode_escape',
'unicode_escape', 'unicode_internal'):
assert unicode(u.encode(encoding),encoding) == u
- sys.maxunicode = oldmaxunicode
def test_ucs4(self):
import sys
- oldmaxunicode = sys.maxunicode
if sys.maxunicode <= 0xffff:
- sys.maxunicode = 0xffffffff
+ return # this test cannot run on UCS2 builds
x = u'\U00100000'
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
assert x == y
- sys.maxunicode = oldmaxunicode
def test_named_unicode(self):
assert unicode('\\N{SPACE}','unicode-escape') == u" "
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Mon Jun 28 13:46:43 2010
@@ -491,6 +491,104 @@
res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 128)
return res
+# ____________________________________________________________
+# Raw unicode escape
+
+def str_decode_raw_unicode_escape(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if (size == 0):
+ return u'', 0
+
+ result = UnicodeBuilder(size)
+ pos = 0
+ while pos < len(s):
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ if (ch != '\\'):
+ result.append(unichr(ord(ch)))
+ pos += 1
+ continue
+
+ startinpos = pos
+ # \u-escapes are only interpreted iff the number of leading
+ # backslashes is odd
+ bs = pos
+ while pos < size:
+ pos += 1
+ if (s[pos] != '\\'):
+ break
+ result.append(u'\\')
+
+ # we have a backslash at the end of the string, stop here
+ if pos >= size:
+ result.append(u'\\')
+ break
+
+ if (((pos - bs) & 1) == 0 or
+ pos >= size or
+ (s[pos] != 'u' and s[pos] != 'U')) :
+ result.append(u'\\')
+ result.append(unichr(ord(s[pos])))
+ pos += 1
+ continue
+
+ if s[pos] == 'u':
+ count = 4
+ else:
+ count = 8
+ pos += 1
+
+ # \uXXXX with 4 hex digits, \Uxxxxxxxx with 8
+ x = 0
+ try:
+ x = int(s[pos:pos+count], 16)
+ except ValueError:
+ res, pos = errorhandler(errors, "rawunicodeescape",
+ "truncated \\uXXXX",
+ s, pos, size)
+ result.append(res)
+ continue
+
+ if (x > MAXUNICODE):
+ res, pos = errorhandler(errors, "rawunicodeescape",
+ "\\Uxxxxxxxx out of range",
+ s, pos, size)
+ result.append(res)
+ continue
+
+ result.append(unichr(x))
+ pos += count
+
+ return result.build(), pos
+
+def unicode_encode_raw_unicode_escape(s, size, errors, errorhandler=None):
+ # errorhandler is not used: this function cannot cause Unicode errors
+ if (size == 0):
+ return ''
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ oc = ord(s[pos])
+ if oc < 0x100:
+ result.append(chr(oc))
+ else:
+ num = hex(oc)
+ if (oc >= 0x10000):
+ result.append("\\U")
+ zeros = 8
+ else:
+ result.append("\\u")
+ zeros = 4
+ nb = zeros + 2 - len(num) # num starts with '0x'
+ if nb > 0:
+ result.append_multiple_char('0', nb)
+ result.append_slice(num, 2, 8)
+ pos += 1
+
+ return result.build()
# ____________________________________________________________
# MBCS codecs for Windows
More information about the Pypy-commit
mailing list