[pypy-svn] r75657 - in pypy/branch/interplevel-codecs/pypy: module/_codecs rlib
afa at codespeak.net
afa at codespeak.net
Tue Jun 29 14:11:50 CEST 2010
Author: afa
Date: Tue Jun 29 14:11:48 2010
New Revision: 75657
Modified:
pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
rewrite the unicode_internal codec
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py Tue Jun 29 14:11:48 2010
@@ -8,8 +8,6 @@
'__name__' : 'app_codecs.__name__',
'escape_decode' : 'app_codecs.escape_decode',
'escape_encode' : 'app_codecs.escape_encode',
- 'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
- 'unicode_internal_encode' : 'app_codecs.unicode_internal_encode',
}
interpleveldefs = {
'encode': 'interp_codecs.encode',
@@ -44,6 +42,8 @@
'unicode_escape_encode' : 'interp_codecs.unicode_escape_encode',
'raw_unicode_escape_decode' : 'interp_codecs.raw_unicode_escape_decode',
'raw_unicode_escape_encode' : 'interp_codecs.raw_unicode_escape_encode',
+ 'unicode_internal_decode' : 'interp_codecs.unicode_internal_decode',
+ 'unicode_internal_encode' : 'interp_codecs.unicode_internal_encode',
}
def __init__(self, space, *args):
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py Tue Jun 29 14:11:48 2010
@@ -43,71 +43,6 @@
v = s[1:-1]
return v, len(v)
-def unicode_internal_encode( obj, errors='strict'):
- """None
- """
- import sys
- if sys.maxunicode == 65535:
- unicode_bytes = 2
- else:
- unicode_bytes = 4
- p = []
- for x in obj:
- i = ord(x)
- bytes = []
- for j in xrange(unicode_bytes):
- bytes += chr(i%256)
- i >>= 8
- if sys.byteorder == "big":
- bytes.reverse()
- p += bytes
- res = ''.join(p)
- return res, len(res)
-
-def unicode_internal_decode( unistr, errors='strict'):
- if type(unistr) == unicode:
- return unistr, len(unistr)
- else:
- import sys
- if sys.maxunicode == 65535:
- unicode_bytes = 2
- else:
- unicode_bytes = 4
- p = []
- i = 0
- if sys.byteorder == "big":
- start = unicode_bytes - 1
- stop = -1
- step = -1
- else:
- start = 0
- stop = unicode_bytes
- step = 1
- while i < len(unistr):
- if len(unistr) - i < unicode_bytes:
- msg = 'truncated input'
- next, i = unicode_call_errorhandler(errors, 'unicode_internal', msg,
- unistr, i, len(unistr))
- p += next
- continue
- t = 0
- h = 0
- for j in range(start, stop, step):
- t += ord(unistr[i+j])<<(h*8)
- h += 1
- i += unicode_bytes
- try:
- p += unichr(t)
- except ValueError:
- startpos = i - unicode_bytes
- endpos = i
- msg = "unichr(%s) not in range" % (t,)
- next, i = unicode_call_errorhandler(errors, 'unicode_internal', msg,
- unistr, startpos, endpos)
- p += next
- res = u''.join(p)
- return res, len(unistr)
-
# XXX needs error messages when the input is invalid
def escape_decode(data, errors='strict'):
"""None
@@ -158,25 +93,3 @@
res = ''.join(res)
return res, len(data)
-# ----------------------------------------------------------------------
-
-def unicode_call_errorhandler(errors, encoding,
- reason, input, startinpos, endinpos, decode=True):
-
- import _codecs
- errorHandler = _codecs.lookup_error(errors)
- if decode:
- exceptionObject = UnicodeDecodeError(encoding, input, startinpos, endinpos, reason)
- else:
- exceptionObject = UnicodeEncodeError(encoding, input, startinpos, endinpos, reason)
- res = errorHandler(exceptionObject)
- if isinstance(res, tuple) and len(res) == 2 and isinstance(res[0], unicode) and isinstance(res[1], int):
- newpos = res[1]
- if (newpos < 0):
- newpos = len(input) + newpos
- if newpos < 0 or newpos > len(input):
- raise IndexError( "position %d from error handler out of bounds" % newpos)
- return res[0], newpos
- else:
- raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
-
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py Tue Jun 29 14:11:48 2010
@@ -375,6 +375,7 @@
"utf_16_le_encode",
"unicode_escape_encode",
"raw_unicode_escape_encode",
+ "unicode_internal_encode",
]:
make_encoder_wrapper(encoders)
@@ -583,3 +584,25 @@
unicode_name_handler)
return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+# ____________________________________________________________
+# Unicode-internal
+
+@unwrap_spec(ObjSpace, W_Root, str)
+def unicode_internal_decode(space, w_string, errors="strict"):
+ # special case for this codec: unicodes are returned as is
+ if space.isinstance_w(w_string, space.w_unicode):
+ return space.newtuple([w_string, space.len(w_string)])
+
+ string = space.str_w(w_string)
+
+ if len(string) == 0:
+ return space.newtuple([space.wrap(u''), space.wrap(0)])
+
+ final = True
+ state = space.fromcache(CodecState)
+ result, consumed = runicode.str_decode_unicode_internal(
+ string, len(string), errors,
+ final, state.decode_error_handler)
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Tue Jun 29 14:11:48 2010
@@ -1117,6 +1117,90 @@
return result.build()
# ____________________________________________________________
+# unicode-internal
+
+def str_decode_unicode_internal(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if (size == 0):
+ return u'', 0
+
+ if MAXUNICODE < 65536:
+ unicode_bytes = 2
+ else:
+ unicode_bytes = 4
+ if BYTEORDER == "little":
+ start = 0
+ stop = unicode_bytes
+ step = 1
+ else:
+ start = unicode_bytes - 1
+ stop = -1
+ step = -1
+
+ result = UnicodeBuilder(size // unicode_bytes)
+ pos = 0
+ while pos < size:
+ if pos > size - unicode_bytes:
+ res, pos = errorhandler(errors, "unicode_internal",
+ "truncated input",
+ s, pos, size)
+ result.append(res)
+ if pos > size - unicode_bytes:
+ break
+ continue
+ t = 0
+ h = 0
+ for j in range(start, stop, step):
+ t += ord(s[pos + j]) << (h*8)
+ h += 1
+ if t > MAXUNICODE:
+ res, pos = errorhandler(errors, "unicode_internal",
+ "unichr(%d) not in range" % (t,),
+ s, pos, pos + unicode_bytes)
+ result.append(res)
+ continue
+ result.append(unichr(t))
+ pos += unicode_bytes
+ return result.build(), pos
+
+def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
+ if (size == 0):
+ return ''
+
+ if MAXUNICODE < 65536:
+ unicode_bytes = 2
+ else:
+ unicode_bytes = 4
+
+ result = StringBuilder(size * unicode_bytes)
+ pos = 0
+ while pos < size:
+ oc = ord(s[pos])
+ if MAXUNICODE < 65536:
+ if BYTEORDER == "little":
+ result.append(chr(oc & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ else:
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc & 0xFF))
+ else:
+ if BYTEORDER == "little":
+ result.append(chr(oc & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 24 & 0xFF))
+ else:
+ result.append(chr(oc >> 24 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc & 0xFF))
+ pos += 1
+
+ return result.build()
+
+# ____________________________________________________________
# MBCS codecs for Windows
if sys.platform == 'win32':
More information about the Pypy-commit
mailing list