[pypy-svn] r48688 - in pypy/dist/pypy: interpreter interpreter/test module/_codecs module/_codecs/test objspace/std objspace/std/test rlib rlib/test tool
cfbolz at codespeak.net
cfbolz at codespeak.net
Wed Nov 14 20:28:12 CET 2007
Author: cfbolz
Date: Wed Nov 14 20:28:11 2007
New Revision: 48688
Added:
pypy/dist/pypy/module/_codecs/interp_codecs.py
- copied unchanged from r48687, pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
pypy/dist/pypy/rlib/runicode.py
- copied unchanged from r48687, pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
pypy/dist/pypy/rlib/test/test_runicode.py
- copied unchanged from r48687, pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
pypy/dist/pypy/tool/unicodefuzzer.py
- copied unchanged from r48687, pypy/branch/more-unicode-improvements/pypy/tool/unicodefuzzer.py
Modified:
pypy/dist/pypy/interpreter/gateway.py
pypy/dist/pypy/interpreter/test/test_gateway.py
pypy/dist/pypy/module/_codecs/__init__.py
pypy/dist/pypy/module/_codecs/app_codecs.py
pypy/dist/pypy/module/_codecs/test/test_codecs.py
pypy/dist/pypy/objspace/std/ropeobject.py
pypy/dist/pypy/objspace/std/stringobject.py
pypy/dist/pypy/objspace/std/test/test_stringobject.py
pypy/dist/pypy/objspace/std/test/test_unicodeobject.py
pypy/dist/pypy/objspace/std/unicodeobject.py
pypy/dist/pypy/objspace/std/unicodetype.py
Log:
merge the more-unicode-improvements branch. changes:
------------------------------------------------------------------------
r48674 | cfbolz | 2007-11-14 11:05:27 +0100 (Wed, 14 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodetype.py
unfail some string tests
------------------------------------------------------------------------
r48666 | cfbolz | 2007-11-13 21:22:42 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/ropeobject.py
move these operations on ropes to interplevel
------------------------------------------------------------------------
r48665 | cfbolz | 2007-11-13 21:22:19 +0100 (Tue, 13 Nov 2007) | 3 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodeobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodetype.py
go through a bit less machinery when mixing strings and unicode strings in the
same operation.
------------------------------------------------------------------------
r48662 | cfbolz | 2007-11-13 19:30:18 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/test/test_codecs.py
oops, raise an app-level, not an interplevel error
------------------------------------------------------------------------
r48652 | cfbolz | 2007-11-13 16:32:38 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/test/test_unicodeobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodeobject.py
bug + fix
------------------------------------------------------------------------
r48650 | cfbolz | 2007-11-13 16:08:04 +0100 (Tue, 13 Nov 2007) | 3 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/test/test_unicodeobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodeobject.py
refactor repr__Unicode, which seems to be written with the assumption that
RPython strings didn't have overallocation. also fix a bug in the process.
------------------------------------------------------------------------
r48649 | cfbolz | 2007-11-13 15:34:32 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
fix bug in utf-16-decoder
------------------------------------------------------------------------
r48644 | cfbolz | 2007-11-13 12:38:00 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
more speaking var names. don't use += for list appending
------------------------------------------------------------------------
r48639 | cfbolz | 2007-11-13 11:13:12 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
be a bit more consistent in variable names
------------------------------------------------------------------------
r48636 | cfbolz | 2007-11-13 10:51:09 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
fix the test plus a bit of niceification: call ord(ch) only once.
------------------------------------------------------------------------
r48635 | cfbolz | 2007-11-13 10:50:37 +0100 (Tue, 13 Nov 2007) | 3 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
make the test more reliable: don't let CPython's unmarshaller renormalize the
string when reloading it from a pyc file.
------------------------------------------------------------------------
r48632 | cfbolz | 2007-11-13 10:05:41 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
test + fix
------------------------------------------------------------------------
r48627 | cfbolz | 2007-11-13 00:43:34 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
more typos
------------------------------------------------------------------------
r48626 | cfbolz | 2007-11-13 00:27:50 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
the annotator doesn't like reading from the sys module directly
------------------------------------------------------------------------
r48625 | cfbolz | 2007-11-13 00:19:03 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
typo
------------------------------------------------------------------------
r48624 | cfbolz | 2007-11-13 00:10:08 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
a sanity check
------------------------------------------------------------------------
r48623 | cfbolz | 2007-11-13 00:10:00 +0100 (Tue, 13 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
change latin1 to latin_1
------------------------------------------------------------------------
r48621 | cfbolz | 2007-11-12 23:45:55 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/interpreter/gateway.py
M /pypy/branch/more-unicode-improvements/pypy/interpreter/test/test_gateway.py
allow unicode in the unwrap_spec
------------------------------------------------------------------------
r48619 | cfbolz | 2007-11-12 23:12:00 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
kill kill kill some more code
------------------------------------------------------------------------
r48618 | cfbolz | 2007-11-12 23:02:18 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
this is no longer needed either
------------------------------------------------------------------------
r48617 | cfbolz | 2007-11-12 23:00:03 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/__init__.py
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
yet another decoder moved to use the runicode version
------------------------------------------------------------------------
r48616 | cfbolz | 2007-11-12 22:34:09 +0100 (Mon, 12 Nov 2007) | 3 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
kill kill kill kill kill kill kill kill the applevel versions (they are buggy
anyway)
------------------------------------------------------------------------
r48615 | cfbolz | 2007-11-12 22:23:27 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/__init__.py
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
plug the RPython decoders and encoders into the _codecs module.
------------------------------------------------------------------------
r48612 | cfbolz | 2007-11-12 21:37:24 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
change names to be more consistent with CPython
------------------------------------------------------------------------
r48611 | cfbolz | 2007-11-12 20:39:51 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/tool/unicodefuzzer.py
fix typo (thanks Alexander)
------------------------------------------------------------------------
r48610 | cfbolz | 2007-11-12 20:15:51 +0100 (Mon, 12 Nov 2007) | 3 lines
Changed paths:
A /pypy/branch/more-unicode-improvements/pypy/tool/unicodefuzzer.py
small tool to encode and decode random unicode/byte strings to check encoders
and decoders. Finds examples that aren't working in PyPy incredibly fast.
------------------------------------------------------------------------
r48609 | cfbolz | 2007-11-12 19:50:55 +0100 (Mon, 12 Nov 2007) | 3 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
grr. The error handling of PyPy's utf-16 decoder is broken. Fix the RPython
version at least.
------------------------------------------------------------------------
r48606 | cfbolz | 2007-11-12 18:49:47 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
tests for decoding errors, fix a small bug in utf8
------------------------------------------------------------------------
r48604 | cfbolz | 2007-11-12 18:18:42 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
add tests for encoding error handlers, fix problems found
------------------------------------------------------------------------
r48603 | cfbolz | 2007-11-12 18:00:33 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
port encoders to RPython
------------------------------------------------------------------------
r48601 | cfbolz | 2007-11-12 16:21:22 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
add utf16
------------------------------------------------------------------------
r48599 | cfbolz | 2007-11-12 15:36:22 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
ascii is simple too
------------------------------------------------------------------------
r48598 | cfbolz | 2007-11-12 15:29:42 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
M /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
decoding latin1 is simple
------------------------------------------------------------------------
r48597 | cfbolz | 2007-11-12 15:17:03 +0100 (Mon, 12 Nov 2007) | 3 lines
Changed paths:
A /pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
A /pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
start a new rlib library for unicode handling. so far it contains only an
rpython utf-8 decoder
------------------------------------------------------------------------
r48591 | cfbolz | 2007-11-12 03:25:43 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/__init__.py
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
A /pypy/branch/more-unicode-improvements/pypy/module/_codecs/interp_codecs.py
M /pypy/branch/more-unicode-improvements/pypy/module/_codecs/test/test_codecs.py
start moving some bits of the _codecs module to interplevel
------------------------------------------------------------------------
r48590 | cfbolz | 2007-11-12 02:01:08 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/stringobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodetype.py
tests are good: fix typos
------------------------------------------------------------------------
r48589 | cfbolz | 2007-11-12 01:40:05 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/stringobject.py
move string.translate to interplevel
------------------------------------------------------------------------
r48588 | cfbolz | 2007-11-12 00:16:11 +0100 (Mon, 12 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/stringobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/test/test_stringobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodeobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodetype.py
move string.encode and string.decode to interp-level. share code where possible.
------------------------------------------------------------------------
r48586 | cfbolz | 2007-11-11 23:51:23 +0100 (Sun, 11 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodeobject.py
fix error message
------------------------------------------------------------------------
r48585 | cfbolz | 2007-11-11 23:39:42 +0100 (Sun, 11 Nov 2007) | 2 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodetype.py
add an XXX
------------------------------------------------------------------------
r48584 | cfbolz | 2007-11-11 23:34:03 +0100 (Sun, 11 Nov 2007) | 3 lines
Changed paths:
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/test/test_unicodeobject.py
M /pypy/branch/more-unicode-improvements/pypy/objspace/std/unicodeobject.py
move unicode.encode to interplevel. add several encoding tests from CPython's
test suite.
------------------------------------------------------------------------
r48581 | cfbolz | 2007-11-11 22:45:52 +0100 (Sun, 11 Nov 2007) | 2 lines
Changed paths:
A /pypy/branch/more-unicode-improvements (from /pypy/dist:48580)
create a new branch for more unicode improvements
------------------------------------------------------------------------
Modified: pypy/dist/pypy/interpreter/gateway.py
==============================================================================
--- pypy/dist/pypy/interpreter/gateway.py (original)
+++ pypy/dist/pypy/interpreter/gateway.py Wed Nov 14 20:28:11 2007
@@ -164,7 +164,7 @@
app_sig.varargname = argname[2:]
def visit__object(self, typ, app_sig):
- if typ not in (int, str, float, r_longlong):
+ if typ not in (int, str, float, unicode, r_longlong):
assert False, "unsupported basic type in unwrap_spec"
self.checked_space_method(typ.__name__, app_sig)
@@ -210,8 +210,8 @@
self.run_args.append(self.scopenext())
def visit__object(self, typ):
- if typ not in (int, str, float, r_longlong):
- assert False, "unsupported basic type in uwnrap_spec"
+ if typ not in (int, str, float, unicode, r_longlong):
+ assert False, "unsupported basic type in unwrap_spec"
if typ is r_int is r_longlong:
name = 'r_longlong'
else:
@@ -327,7 +327,7 @@
raise FastFuncNotSupported
def visit__object(self, typ):
- if typ not in (int, str, float, r_longlong):
+ if typ not in (int, str, float, unicode, r_longlong):
assert False, "unsupported basic type in uwnrap_spec"
self.unwrap.append("space.%s_w(%s)" % (typ.__name__,
self.nextarg()))
Modified: pypy/dist/pypy/interpreter/test/test_gateway.py
==============================================================================
--- pypy/dist/pypy/interpreter/test/test_gateway.py (original)
+++ pypy/dist/pypy/interpreter/test/test_gateway.py Wed Nov 14 20:28:11 2007
@@ -287,6 +287,27 @@
raises(gateway.OperationError,space.call_function,w_app_g3_f,w(None))
raises(gateway.OperationError,space.call_function,w_app_g3_f,w("foo"))
+ def test_interp2app_unwrap_spec_unicode(self):
+ space = self.space
+ w = space.wrap
+ def g3_u(space, uni):
+ return space.wrap(len(uni))
+ app_g3_u = gateway.interp2app_temp(g3_u,
+ unwrap_spec=[gateway.ObjSpace,
+ unicode])
+ w_app_g3_u = space.wrap(app_g3_u)
+ assert self.space.eq_w(
+ space.call_function(w_app_g3_u, w(u"foo")),
+ w(3))
+ assert self.space.eq_w(
+ space.call_function(w_app_g3_u, w("baz")),
+ w(3))
+ raises(gateway.OperationError, space.call_function, w_app_g3_u,
+ w(None))
+ raises(gateway.OperationError, space.call_function, w_app_g3_u,
+ w(42))
+
+
def test_interp2app_unwrap_spec_func(self):
space = self.space
w = space.wrap
Modified: pypy/dist/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/dist/pypy/module/_codecs/__init__.py (original)
+++ pypy/dist/pypy/module/_codecs/__init__.py Wed Nov 14 20:28:11 2007
@@ -4,41 +4,51 @@
appleveldefs = {
'__doc__' : 'app_codecs.__doc__',
'__name__' : 'app_codecs.__name__',
- 'ascii_decode' : 'app_codecs.ascii_decode',
- 'ascii_encode' : 'app_codecs.ascii_encode',
'charbuffer_encode' : 'app_codecs.charbuffer_encode',
'charmap_decode' : 'app_codecs.charmap_decode',
'charmap_encode' : 'app_codecs.charmap_encode',
'escape_decode' : 'app_codecs.escape_decode',
'escape_encode' : 'app_codecs.escape_encode',
- 'latin_1_decode' : 'app_codecs.latin_1_decode',
- 'latin_1_encode' : 'app_codecs.latin_1_encode',
- 'lookup' : 'app_codecs.lookup',
- 'lookup_error' : 'app_codecs.lookup_error',
'mbcs_decode' : 'app_codecs.mbcs_decode',
'mbcs_encode' : 'app_codecs.mbcs_encode',
'raw_unicode_escape_decode' : 'app_codecs.raw_unicode_escape_decode',
'raw_unicode_escape_encode' : 'app_codecs.raw_unicode_escape_encode',
'readbuffer_encode' : 'app_codecs.readbuffer_encode',
- 'register' : 'app_codecs.register',
- 'register_error' : 'app_codecs.register_error',
'unicode_escape_decode' : 'app_codecs.unicode_escape_decode',
'unicode_escape_encode' : 'app_codecs.unicode_escape_encode',
'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
'unicode_internal_encode' : 'app_codecs.unicode_internal_encode',
- 'utf_16_be_decode' : 'app_codecs.utf_16_be_decode',
- 'utf_16_be_encode' : 'app_codecs.utf_16_be_encode',
- 'utf_16_decode' : 'app_codecs.utf_16_decode',
- 'utf_16_encode' : 'app_codecs.utf_16_encode',
- 'utf_16_ex_decode' : 'app_codecs.utf_16_ex_decode',
- 'utf_16_le_decode' : 'app_codecs.utf_16_le_decode',
- 'utf_16_le_encode' : 'app_codecs.utf_16_le_encode',
'utf_7_decode' : 'app_codecs.utf_7_decode',
'utf_7_encode' : 'app_codecs.utf_7_encode',
- 'utf_8_decode' : 'app_codecs.utf_8_decode',
- 'utf_8_encode' : 'app_codecs.utf_8_encode',
- 'encode': 'app_codecs.encode',
- 'decode': 'app_codecs.decode'
+ '_register_existing_errors': 'app_codecs._register_existing_errors',
}
interpleveldefs = {
+ 'encode': 'interp_codecs.encode',
+ 'decode': 'interp_codecs.decode',
+ 'lookup': 'interp_codecs.lookup_codec',
+ 'lookup_error': 'interp_codecs.lookup_error',
+ 'register': 'interp_codecs.register_codec',
+ 'register_error': 'interp_codecs.register_error',
+
+ # encoders and decoders
+ 'ascii_decode' : 'interp_codecs.ascii_decode',
+ 'ascii_encode' : 'interp_codecs.ascii_encode',
+ 'latin_1_decode' : 'interp_codecs.latin_1_decode',
+ 'latin_1_encode' : 'interp_codecs.latin_1_encode',
+ 'utf_8_decode' : 'interp_codecs.utf_8_decode',
+ 'utf_8_encode' : 'interp_codecs.utf_8_encode',
+ 'utf_16_be_decode' : 'interp_codecs.utf_16_be_decode',
+ 'utf_16_be_encode' : 'interp_codecs.utf_16_be_encode',
+ 'utf_16_decode' : 'interp_codecs.utf_16_decode',
+ 'utf_16_encode' : 'interp_codecs.utf_16_encode',
+ 'utf_16_le_decode' : 'interp_codecs.utf_16_le_decode',
+ 'utf_16_le_encode' : 'interp_codecs.utf_16_le_encode',
+ 'utf_16_ex_decode' : 'interp_codecs.utf_16_ex_decode',
}
+
+ def setup_after_space_initialization(self):
+ "NOT_RPYTHON"
+ self.space.appexec([], """():
+ import _codecs
+ _codecs._register_existing_errors()
+ """)
Modified: pypy/dist/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/dist/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/dist/pypy/module/_codecs/app_codecs.py Wed Nov 14 20:28:11 2007
@@ -41,107 +41,7 @@
#from unicodecodec import *
import sys
-#/* --- Registry ----------------------------------------------------------- */
-codec_search_path = []
-codec_search_cache = {}
-codec_error_registry = {}
-codec_need_encodings = [True]
-def codec_register( search_function ):
- """register(search_function)
-
- Register a codec search function. Search functions are expected to take
- one argument, the encoding name in all lower case letters, and return
- a tuple of functions (encoder, decoder, stream_reader, stream_writer).
- """
-
- if callable(search_function):
- codec_search_path.append(search_function)
-
-register = codec_register
-
-def codec_lookup(encoding):
- """lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
- Looks up a codec tuple in the Python codec registry and returns
- a tuple of functions.
- """
- if not isinstance(encoding, str):
- raise TypeError("Encoding must be a string")
- normalized_encoding = encoding.replace(" ", "-").lower()
- result = codec_search_cache.get(normalized_encoding, None)
- if not result:
- if codec_need_encodings:
- import encodings
- if len(codec_search_path) == 0:
- raise LookupError("no codec search functions registered: can't find encoding")
- del codec_need_encodings[:]
- for search in codec_search_path:
- result = search(normalized_encoding)
- if result:
- if not (type(result) == tuple and len(result) == 4):
- raise TypeError("codec search functions must return 4-tuples")
- else:
- codec_search_cache[normalized_encoding] = result
- return result
- if not result:
- raise LookupError("unknown encoding: %s" % encoding)
- return result
-
-
-lookup = codec_lookup
-
-def encode(v, encoding=None, errors='strict'):
- """encode(obj, [encoding[,errors]]) -> object
-
- Encodes obj using the codec registered for encoding. encoding defaults
- to the default encoding. errors may be given to set a different error
- handling scheme. Default is 'strict' meaning that encoding errors raise
- a ValueError. Other possible values are 'ignore', 'replace' and
- 'xmlcharrefreplace' as well as any other name registered with
- codecs.register_error that can handle ValueErrors.
- """
- if encoding == None:
- encoding = sys.getdefaultencoding()
- if isinstance(encoding, str):
- encoder = lookup(encoding)[0]
- if encoder and isinstance(errors, str):
- res = encoder(v, errors)
- return res[0]
- else:
- raise TypeError("Errors must be a string")
- else:
- raise TypeError("Encoding must be a string")
-
-def decode(obj, encoding=None, errors='strict'):
- """decode(obj, [encoding[,errors]]) -> object
-
- Decodes obj using the codec registered for encoding. encoding defaults
- to the default encoding. errors may be given to set a different error
- handling scheme. Default is 'strict' meaning that encoding errors raise
- a ValueError. Other possible values are 'ignore' and 'replace'
- as well as any other name registerd with codecs.register_error that is
- able to handle ValueErrors.
- """
- if encoding == None:
- encoding = sys.getdefaultencoding()
- if isinstance(encoding, str):
- decoder = lookup(encoding)[1]
- if decoder and isinstance(errors, str):
- res = decoder(obj, errors)
- if not isinstance(res, tuple) or len(res) != 2:
- raise TypeError("encoder must return a tuple (object, integer)")
- return res[0]
- else:
- raise TypeError("Errors must be a string")
- else:
- raise TypeError("Encoding must be a string")
-
-def latin_1_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeLatin1(obj, len(obj), errors)
- res = ''.join(res)
- return res, len(res)
# XXX MBCS codec might involve ctypes ?
def mbcs_decode():
"""None
@@ -161,16 +61,6 @@
v = s[1:-1]
return v, len(v)
-def utf_8_decode( data, errors='strict', final=False):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
- res = u''.join(res)
- return res, consumed
-
def raw_unicode_escape_decode( data, errors='strict'):
"""None
"""
@@ -192,23 +82,6 @@
res = ''.join(res)
return res, len(res)
-def latin_1_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeLatin1(data, len(data), errors)
- res = u''.join(res)
- return res, len(res)
-
-def utf_16_decode( data, errors='strict', final=False):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'native', final)
- res = ''.join(res)
- return res, consumed
-
def unicode_escape_decode( data, errors='strict'):
"""None
"""
@@ -217,13 +90,6 @@
return res, len(res)
-def ascii_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeASCII(data, len(data), errors)
- res = u''.join(res)
- return res, len(res)
-
def charmap_encode(obj, errors='strict', mapping='latin-1'):
"""None
"""
@@ -284,22 +150,6 @@
res = u''.join(p)
return res, len(res)
-def utf_16_ex_decode( data, errors='strict', byteorder=0, final=0):
- """None
- """
- if byteorder == 0:
- bm = 'native'
- elif byteorder == -1:
- bm = 'little'
- else:
- bm = 'big'
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, bm, final)
- res = ''.join(res)
- return res, consumed, byteorder
-
# XXX needs error messages when the input is invalid
def escape_decode(data, errors='strict'):
"""None
@@ -383,20 +233,6 @@
## len(obj))
-def ascii_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeASCII(obj, len(obj), errors)
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native')
- res = ''.join(res)
- return res, len(res)
-
def raw_unicode_escape_encode( obj, errors='strict'):
"""None
"""
@@ -404,47 +240,6 @@
res = ''.join(res)
return res, len(res)
-def utf_8_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF8(obj, len(obj), errors)
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_le_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little')
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_be_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big')
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_le_decode( data, errors='strict', byteorder=0, final = 0):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'little', final)
- res = u''.join(res)
- return res, consumed
-
-def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'big', final)
- res = u''.join(res)
- return res, consumed
-
def strict_errors(exc):
if isinstance(exc, Exception):
raise exc
@@ -500,6 +295,14 @@
raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
+def _register_existing_errors():
+ import _codecs
+ _codecs.register_error("strict", strict_errors)
+ _codecs.register_error("ignore", ignore_errors)
+ _codecs.register_error("replace", replace_errors)
+ _codecs.register_error("xmlcharrefreplace", xmlcharrefreplace_errors)
+ _codecs.register_error("backslashreplace", backslashreplace_errors)
+
# ----------------------------------------------------------------------
##import sys
@@ -528,41 +331,7 @@
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
]
-unicode_latin1 = [None]*256
-
-def lookup_error(errors):
- """lookup_error(errors) -> handler
-
- Return the error handler for the specified error handling name
- or raise a LookupError, if no handler exists under this name.
- """
-
- try:
- err_handler = codec_error_registry[errors]
- except KeyError:
- raise LookupError("unknown error handler name %s"%errors)
- return err_handler
-
-def register_error(errors, handler):
- """register_error(errors, handler)
-
- Register the specified error handler under the name
- errors. handler must be a callable object, that
- will be called with an exception instance containing
- information about the location of the encoding/decoding
- error and must return a (replacement, new position) tuple.
- """
- if callable(handler):
- codec_error_registry[errors] = handler
- else:
- raise TypeError("handler must be callable")
-
-register_error("strict", strict_errors)
-register_error("ignore", ignore_errors)
-register_error("replace", replace_errors)
-register_error("xmlcharrefreplace", xmlcharrefreplace_errors)
-register_error("backslashreplace", backslashreplace_errors)
def SPECIAL(c, encodeO, encodeWS):
c = ord(c)
@@ -831,197 +600,6 @@
p += p[1]
return p
-def PyUnicode_DecodeASCII(s, size, errors):
-
-# /* ASCII is equivalent to the first 128 ordinals in Unicode. */
- if (size == 1 and ord(s) < 128) :
- return [unichr(ord(s))]
- if (size == 0):
- return [u''] #unicode('')
- p = []
- pos = 0
- while pos < len(s):
- c = s[pos]
- if ord(c) < 128:
- p += unichr(ord(c))
- pos += 1
- else:
-
- res = unicode_call_errorhandler(
- errors, "ascii", "ordinal not in range(128)",
- s, pos, pos+1)
- p += [unichr(ord(x)) for x in res[0]]
- pos = res[1]
- return p
-
-def PyUnicode_EncodeASCII(p, size, errors):
-
- return unicode_encode_ucs1(p, size, errors, 128)
-
-def PyUnicode_AsASCIIString(unistr):
-
- if not type(unistr) == unicode:
- raise TypeError
- return PyUnicode_EncodeASCII(unicode(unistr),
- len(unicode),
- None)
-
-def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True):
-
- bo = 0 #/* assume native ordering by default */
- consumed = 0
- errmsg = ""
-
- if sys.byteorder == 'little':
- ihi = 1
- ilo = 0
- else:
- ihi = 0
- ilo = 1
-
-
- #/* Unpack UTF-16 encoded data */
-
-## /* Check for BOM marks (U+FEFF) in the input and adjust current
-## byte order setting accordingly. In native mode, the leading BOM
-## mark is skipped, in all other modes, it is copied to the output
-## stream as-is (giving a ZWNBSP character). */
- q = 0
- p = []
- if byteorder == 'native':
- if (size >= 2):
- bom = (ord(s[ihi]) << 8) | ord(s[ilo])
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- if sys.byteorder == 'little':
- if (bom == 0xFEFF):
- q += 2
- bo = -1
- elif bom == 0xFFFE:
- q += 2
- bo = 1
- else:
- if bom == 0xFEFF:
- q += 2
- bo = 1
- elif bom == 0xFFFE:
- q += 2
- bo = -1
- elif byteorder == 'little':
- bo = -1
- else:
- bo = 1
-
- if (size == 0):
- return [u''], 0, bo
-
- if (bo == -1):
- #/* force LE */
- ihi = 1
- ilo = 0
-
- elif (bo == 1):
- #/* force BE */
- ihi = 0
- ilo = 1
-
- while (q < len(s)):
-
- #/* remaining bytes at the end? (size should be even) */
- if (len(s)-q<2):
- if not final:
- break
- errmsg = "truncated data"
- startinpos = q
- endinpos = len(s)
- unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True)
-# /* The remaining input chars are ignored if the callback
-## chooses to skip the input */
-
- ch = (ord(s[q+ihi]) << 8) | ord(s[q+ilo])
- q += 2
-
- if (ch < 0xD800 or ch > 0xDFFF):
- p += unichr(ch)
- continue
-
- #/* UTF-16 code pair: */
- if (q >= len(s)):
- errmsg = "unexpected end of data"
- startinpos = q-2
- endinpos = len(s)
- unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True)
-
- if (0xD800 <= ch and ch <= 0xDBFF):
- ch2 = (ord(s[q+ihi]) << 8) | ord(s[q+ilo])
- q += 2
- if (0xDC00 <= ch2 and ch2 <= 0xDFFF):
- #ifndef Py_UNICODE_WIDE
- if sys.maxunicode < 65536:
- p += unichr(ch)
- p += unichr(ch2)
- else:
- p += unichr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)
- #endif
- continue
-
- else:
- errmsg = "illegal UTF-16 surrogate"
- startinpos = q-4
- endinpos = startinpos+2
- unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True)
-
- errmsg = "illegal encoding"
- startinpos = q-2
- endinpos = startinpos+2
- unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True)
-
- return p, q, bo
-
-# moved out of local scope, especially because it didn't
-# have any nested variables.
-
-def STORECHAR(CH, byteorder):
- hi = chr(((CH) >> 8) & 0xff)
- lo = chr((CH) & 0xff)
- if byteorder == 'little':
- return [lo, hi]
- else:
- return [hi, lo]
-
-def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
-
-# /* Offsets from p for storing byte pairs in the right order. */
-
-
- p = []
- bom = sys.byteorder
- if (byteorder == 'native'):
-
- bom = sys.byteorder
- p += STORECHAR(0xFEFF, bom)
-
- if (size == 0):
- return ""
-
- if (byteorder == 'little' ):
- bom = 'little'
- elif (byteorder == 'big'):
- bom = 'big'
-
-
- for c in s:
- ch = ord(c)
- ch2 = 0
- if (ch >= 0x10000) :
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
- ch = 0xD800 | ((ch-0x10000) >> 10)
-
- p += STORECHAR(ch, bom)
- if (ch2):
- p += STORECHAR(ch2, bom)
-
- return p
-
def PyUnicode_DecodeMBCS(s, size, errors):
pass
@@ -1032,7 +610,8 @@
def unicode_call_errorhandler(errors, encoding,
reason, input, startinpos, endinpos, decode=True):
- errorHandler = lookup_error(errors)
+ import _codecs
+ errorHandler = _codecs.lookup_error(errors)
if decode:
exceptionObject = UnicodeDecodeError(encoding, input, startinpos, endinpos, reason)
else:
@@ -1048,288 +627,7 @@
else:
raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
-def PyUnicode_DecodeUTF8(s, size, errors):
- return PyUnicode_DecodeUTF8Stateful(s, size, errors, False)
-
-## /* Map UTF-8 encoded prefix byte to sequence length. zero means
-## illegal prefix. see RFC 2279 for details */
-utf8_code_length = [
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
-]
-
-def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
-
- consumed = 0
- if (size == 0):
- if not final:
- consumed = 0
- return u'', consumed
- p = []
- pos = 0
- while pos < size:
- ch = s[pos]
- if ord(ch) < 0x80:
- p += ch
- pos += 1
- continue
-
- n = utf8_code_length[ord(ch)]
- startinpos = pos
- if (startinpos + n > size):
- if not final:
- break
- else:
- errmsg = "unexpected end of data"
- endinpos = size
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- if n == 0:
- errmsg = "unexpected code byte"
- endinpos = startinpos+1
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- elif n == 1:
- errmsg = "internal error"
- endinpos = startinpos+1
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- elif n == 2:
- if ((ord(s[pos+1]) & 0xc0) != 0x80):
- errmsg = "invalid data"
- endinpos = startinpos+2
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- c = ((ord(s[pos]) & 0x1f) << 6) + (ord(s[pos+1]) & 0x3f)
- if c < 0x80:
- errmsg = "illegal encoding"
- endinpos = startinpos+2
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- p += unichr(c)
- pos += n
- #break
- elif n == 3:
- if ((ord(s[pos+1]) & 0xc0) != 0x80 or
- (ord(s[pos+2]) & 0xc0) != 0x80):
- errmsg = "invalid data"
- endinpos = startinpos+3
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- c = ((ord(s[pos]) & 0x0f) << 12) + \
- ((ord(s[pos+1]) & 0x3f) << 6) +\
- (ord(s[pos+2]) & 0x3f)
-
-## /* Note: UTF-8 encodings of surrogates are considered
-## legal UTF-8 sequences;
-##
-## XXX For wide builds (UCS-4) we should probably try
-## to recombine the surrogates into a single code
-## unit.
-## */
- if c < 0x0800:
- errmsg = "illegal encoding"
- endinpos = startinpos+3
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- p += unichr(c)
- pos += n
- elif n == 4:
-## case 4:
- if ((ord(s[pos+1]) & 0xc0) != 0x80 or
- (ord(s[pos+2]) & 0xc0) != 0x80 or
- (ord(s[pos+3]) & 0xc0) != 0x80):
-
- errmsg = "invalid data"
- startinpos = pos
- endinpos = startinpos+4
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- c = ((ord(s[pos+0]) & 0x7) << 18) + ((ord(s[pos+1]) & 0x3f) << 12) +\
- ((ord(s[pos+2]) & 0x3f) << 6) + (ord(s[pos+3]) & 0x3f)
- #/* validate and convert to UTF-16 */
- if ((c < 0x10000) or (c > 0x10ffff)):
- #/* minimum value allowed for 4 byte encoding */
- #/* maximum value allowed for UTF-16 */
-
- errmsg = "illegal encoding"
- startinpos = pos
- endinpos = startinpos+4
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
-#ifdef Py_UNICODE_WIDE
- if c < sys.maxunicode:
- p += unichr(c)
- pos += n
- else:
-## /* compute and append the two surrogates: */
-## /* translate from 10000..10FFFF to 0..FFFF */
- c -= 0x10000
- #/* high surrogate = top 10 bits added to D800 */
- p += unichr(0xD800 + (c >> 10))
- #/* low surrogate = bottom 10 bits added to DC00 */
- p += unichr(0xDC00 + (c & 0x03FF))
- pos += n
- else:
-## default:
-## /* Other sizes are only needed for UCS-4 */
- errmsg = "unsupported Unicode code range"
- startinpos = pos
- endinpos = startinpos+n
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
-
- #continue
-
- if not final:
- consumed = pos
- return p, pos # consumed
-def PyUnicode_EncodeUTF8(s, size, errors):
-
- #assert(s != None)
- assert(size >= 0)
- p = []
- i = 0
- while i < size:
- ch = s[i]
- i += 1
- if (ord(ch) < 0x80):
-## /* Encode ASCII */
- p += chr(ord(ch))
- elif (ord(ch) < 0x0800) :
-## /* Encode Latin-1 */
- p += chr((0xc0 | (ord(ch) >> 6)))
- p += chr((0x80 | (ord(ch) & 0x3f)))
- else:
-## /* Encode UCS2 Unicode ordinals */
- if (ord(ch) < 0x10000):
-## /* Special case: check for high surrogate */
- if (0xD800 <= ord(ch) and ord(ch) <= 0xDBFF and i != size) :
- ch2 = s[i]
-## /* Check for low surrogate and combine the two to
-## form a UCS4 value */
- if (0xDC00 <= ord(ch2) and ord(ch2) <= 0xDFFF) :
- ch3 = ((ord(ch) - 0xD800) << 10 | (ord(ch2) - 0xDC00)) + 0x10000
- i += 1
- p.extend(encodeUCS4(ch3))
- continue
-## /* Fall through: handles isolated high surrogates */
- p += (chr((0xe0 | (ord(ch) >> 12))))
- p += (chr((0x80 | ((ord(ch) >> 6) & 0x3f))))
- p += (chr((0x80 | (ord(ch) & 0x3f))))
- continue
- else:
- p.extend(encodeUCS4(ord(ch)))
- return p
-
-def encodeUCS4(ch):
-## /* Encode UCS4 Unicode ordinals */
- p = []
- p += (chr((0xf0 | (ch >> 18))))
- p += (chr((0x80 | ((ch >> 12) & 0x3f))))
- p += (chr((0x80 | ((ch >> 6) & 0x3f))))
- p += (chr((0x80 | (ch & 0x3f))))
- return p
-
-#/* --- Latin-1 Codec ------------------------------------------------------ */
-
-def PyUnicode_DecodeLatin1(s, size, errors):
- #/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
-## if (size == 1):
-## return [PyUnicode_FromUnicode(s, 1)]
- pos = 0
- p = []
- while (pos < size):
- p += unichr(ord(s[pos]))
- pos += 1
- return p
-
-def unicode_encode_ucs1(p, size, errors, limit):
-
- if limit == 256:
- reason = "ordinal not in range(256)"
- encoding = "latin-1"
- else:
- reason = "ordinal not in range(128)"
- encoding = "ascii"
-
- if (size == 0):
- return ['']
- res = []
- pos = 0
- while pos < len(p):
- #for ch in p:
- ch = p[pos]
-
- if ord(ch) < limit:
- res += chr(ord(ch))
- pos += 1
- else:
- #/* startpos for collecting unencodable chars */
- collstart = pos
- collend = pos+1
- while collend < len(p) and ord(p[collend]) >= limit:
- collend += 1
- x = unicode_call_errorhandler(errors, encoding, reason, p, collstart, collend, False)
- res += str(x[0])
- pos = x[1]
-
- return res
-
-def PyUnicode_EncodeLatin1(p, size, errors):
- res = unicode_encode_ucs1(p, size, errors, 256)
- return res
hexdigits = [hex(i)[-1] for i in range(16)]+[hex(i)[-1].upper() for i in range(10, 16)]
@@ -1523,7 +821,8 @@
# /* Default to Latin-1 */
if mapping == 'latin-1':
- return PyUnicode_EncodeLatin1(p, size, errors)
+ import _codecs
+ return _codecs.latin_1_encode(p, size, errors)
if (size == 0):
return ''
inpos = 0
@@ -1548,7 +847,8 @@
## /* Default to Latin-1 */
if (mapping == None):
- return PyUnicode_DecodeLatin1(s, size, errors)
+ import _codecs
+ return _codecs.latin_1_decode(s, size, errors)
if (size == 0):
return u''
Modified: pypy/dist/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/dist/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/dist/pypy/module/_codecs/test/test_codecs.py Wed Nov 14 20:28:11 2007
@@ -6,6 +6,10 @@
space = gettestobjspace(usemodules=('unicodedata',))
cls.space = space
+ def test_register_noncallable(self):
+ import _codecs
+ raises(TypeError, _codecs.register, 1)
+
def test_bigU_codecs(self):
import sys
oldmaxunicode = sys.maxunicode
@@ -257,6 +261,20 @@
assert '\\253'.decode('string_escape') == chr(0253)
assert '\\312'.decode('string_escape') == chr(0312)
+
def test_decode_utf8_different_case(self):
constant = u"a"
assert constant.encode("utf-8") == constant.encode("UTF-8")
+
+ def test_codec_wrong_result(self):
+ import _codecs
+ def search_function(encoding):
+ def f(input, errors="strict"):
+ return 42
+ print encoding
+ if encoding == 'test.mytestenc':
+ return (f, f, None, None)
+ return None
+ _codecs.register(search_function)
+ raises(TypeError, "hello".decode, "test.mytestenc")
+ raises(TypeError, u"hello".encode, "test.mytestenc")
Modified: pypy/dist/pypy/objspace/std/ropeobject.py
==============================================================================
--- pypy/dist/pypy/objspace/std/ropeobject.py (original)
+++ pypy/dist/pypy/objspace/std/ropeobject.py Wed Nov 14 20:28:11 2007
@@ -1030,44 +1030,49 @@
return W_RopeObject(rope.rope_from_charlist(buf[:i+1]))
-
-app = gateway.applevel(r'''
- def str_translate__Rope_ANY_ANY(s, table, deletechars=''):
- """charfilter - unicode handling is not implemented
-
- Return a copy of the string where all characters occurring
- in the optional argument deletechars are removed, and the
- remaining characters have been mapped through the given translation table,
- which must be a string of length 256"""
-
- if len(table) != 256:
- raise ValueError("translation table must be 256 characters long")
-
- L = [ table[ord(s[i])] for i in range(len(s)) if s[i] not in deletechars ]
- return ''.join(L)
-
- def str_decode__Rope_ANY_ANY(str, encoding=None, errors=None):
- import codecs
- if encoding is None and errors is None:
- return unicode(str)
- elif errors is None:
- return codecs.getdecoder(encoding)(str)[0]
- else:
- return codecs.getdecoder(encoding)(str, errors)[0]
-
- def str_encode__Rope_ANY_ANY(str, encoding=None, errors=None):
- import codecs
- if encoding is None and errors is None:
- return unicode(str)
- elif errors is None:
- return codecs.getencoder(encoding)(str)[0]
- else:
- return codecs.getencoder(encoding)(str, errors)[0]
-''', filename=__file__)
-
-str_translate__Rope_ANY_ANY = app.interphook('str_translate__Rope_ANY_ANY')
-str_decode__Rope_ANY_ANY = app.interphook('str_decode__Rope_ANY_ANY')
-str_encode__Rope_ANY_ANY = app.interphook('str_encode__Rope_ANY_ANY')
+def str_translate__Rope_ANY_ANY(space, w_string, w_table, w_deletechars=''):
+ """charfilter - unicode handling is not implemented
+
+ Return a copy of the string where all characters occurring
+ in the optional argument deletechars are removed, and the
+ remaining characters have been mapped through the given translation table,
+ which must be a string of length 256"""
+
+ # XXX CPython accepts buffers, too, not sure what we should do
+ table = space.str_w(w_table)
+ if len(table) != 256:
+ raise OperationError(
+ space.w_ValueError,
+ space.wrap("translation table must be 256 characters long"))
+
+ node = w_string._node
+ chars = []
+ iter = rope.CharIterator(node)
+ while 1:
+ try:
+            c = iter.next()
+            w_char = W_RopeObject.PREBUILT[ord(c)]
+            if not space.is_true(space.contains(w_deletechars, w_char)):
+                chars.append(table[ord(c)])
+ except StopIteration:
+ break
+ return W_RopeObject(rope.rope_from_charlist(chars))
+
+def str_decode__Rope_ANY_ANY(space, w_string, w_encoding=None, w_errors=None):
+ from pypy.objspace.std.unicodetype import _get_encoding_and_errors, \
+ unicode_from_string, decode_object
+ encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
+ if encoding is None and errors is None:
+ return unicode_from_string(space, w_string)
+ return decode_object(space, w_string, encoding, errors)
+
+def str_encode__Rope_ANY_ANY(space, w_string, w_encoding=None, w_errors=None):
+ from pypy.objspace.std.unicodetype import _get_encoding_and_errors, \
+ encode_object
+ encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
+ return encode_object(space, w_string, encoding, errors)
+
# methods of the iterator
Modified: pypy/dist/pypy/objspace/std/stringobject.py
==============================================================================
--- pypy/dist/pypy/objspace/std/stringobject.py (original)
+++ pypy/dist/pypy/objspace/std/stringobject.py Wed Nov 14 20:28:11 2007
@@ -51,7 +51,6 @@
# XXX should this use the default encoding?
return _decode_ascii(space, w_self._value)
-
def _is_generic(space, w_self, fun):
v = w_self._value
if len(v) == 0:
@@ -890,44 +889,43 @@
return space.wrap("".join(buf[:i+1])) # buffer was overallocated, so slice
-app = gateway.applevel(r'''
- def str_translate__String_ANY_ANY(s, table, deletechars=''):
- """charfilter - unicode handling is not implemented
-
- Return a copy of the string where all characters occurring
- in the optional argument deletechars are removed, and the
- remaining characters have been mapped through the given translation table,
- which must be a string of length 256"""
-
- if len(table) != 256:
- raise ValueError("translation table must be 256 characters long")
-
- L = [ table[ord(s[i])] for i in range(len(s)) if s[i] not in deletechars ]
- return ''.join(L)
-
- def str_decode__String_ANY_ANY(str, encoding=None, errors=None):
- import codecs
- if encoding is None and errors is None:
- return unicode(str)
- elif errors is None:
- return codecs.getdecoder(encoding)(str)[0]
- else:
- return codecs.getdecoder(encoding)(str, errors)[0]
-
- def str_encode__String_ANY_ANY(str, encoding=None, errors=None):
- import codecs
- if encoding is None and errors is None:
- return unicode(str)
- elif errors is None:
- return codecs.getencoder(encoding)(str)[0]
- else:
- return codecs.getencoder(encoding)(str, errors)[0]
-''', filename=__file__)
-
+def str_translate__String_ANY_ANY(space, w_string, w_table, w_deletechars=''):
+ """charfilter - unicode handling is not implemented
+
+ Return a copy of the string where all characters occurring
+ in the optional argument deletechars are removed, and the
+ remaining characters have been mapped through the given translation table,
+ which must be a string of length 256"""
+
+ # XXX CPython accepts buffers, too, not sure what we should do
+ table = space.str_w(w_table)
+ if len(table) != 256:
+ raise OperationError(
+ space.w_ValueError,
+ space.wrap("translation table must be 256 characters long"))
-str_translate__String_ANY_ANY = app.interphook('str_translate__String_ANY_ANY')
-str_decode__String_ANY_ANY = app.interphook('str_decode__String_ANY_ANY')
-str_encode__String_ANY_ANY = app.interphook('str_encode__String_ANY_ANY')
+ string = w_string._value
+ chars = []
+ for char in string:
+ w_char = W_StringObject.PREBUILT[ord(char)]
+ if not space.is_true(space.contains(w_deletechars, w_char)):
+ chars.append(table[ord(char)])
+ return W_StringObject(''.join(chars))
+
+def str_decode__String_ANY_ANY(space, w_string, w_encoding=None, w_errors=None):
+ from pypy.objspace.std.unicodetype import _get_encoding_and_errors, \
+ unicode_from_string, decode_object
+ encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
+ if encoding is None and errors is None:
+ return unicode_from_string(space, w_string)
+ return decode_object(space, w_string, encoding, errors)
+
+def str_encode__String_ANY_ANY(space, w_string, w_encoding=None, w_errors=None):
+ #import pdb; pdb.set_trace()
+ from pypy.objspace.std.unicodetype import _get_encoding_and_errors, \
+ encode_object
+ encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
+ return encode_object(space, w_string, encoding, errors)
# CPython's logic for deciding if ""%values is
# an error (1 value, 0 %-formatters) or not
Modified: pypy/dist/pypy/objspace/std/test/test_stringobject.py
==============================================================================
--- pypy/dist/pypy/objspace/std/test/test_stringobject.py (original)
+++ pypy/dist/pypy/objspace/std/test/test_stringobject.py Wed Nov 14 20:28:11 2007
@@ -645,6 +645,10 @@
def test_decode(self):
assert 'hello'.decode('rot-13') == 'uryyb'
assert 'hello'.decode('string-escape') == 'hello'
+
+ def test_encode(self):
+ assert 'hello'.encode() == 'hello'
+ assert type('hello'.encode()) is str
def test_hash(self):
# check that we have the same hash as CPython for at least 31 bits
Modified: pypy/dist/pypy/objspace/std/test/test_unicodeobject.py
==============================================================================
--- pypy/dist/pypy/objspace/std/test/test_unicodeobject.py (original)
+++ pypy/dist/pypy/objspace/std/test/test_unicodeobject.py Wed Nov 14 20:28:11 2007
@@ -1,3 +1,4 @@
+import py
import sys
@@ -377,3 +378,88 @@
def test_missing_cases(self):
# some random cases, which are discovered to not be tested during annotation
assert u'xxx'[1:1] == u''
+
+ # these tests test lots of encodings, so they really belong to the _codecs
+ # module. however, they test useful unicode methods too
+ # they are stolen from CPython's unit tests
+
+ def test_codecs_utf7(self):
+ utfTests = [
+ (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
+ (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
+ (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
+ (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
+ (u'+', '+-'),
+ (u'+-', '+--'),
+ (u'+?', '+-?'),
+ (u'\?', '+AFw?'),
+ (u'+?', '+-?'),
+ (ur'\\?', '+AFwAXA?'),
+ (ur'\\\?', '+AFwAXABc?'),
+ (ur'++--', '+-+---')
+ ]
+
+ for (x, y) in utfTests:
+ assert x.encode('utf-7') == y
+
+ # surrogates not supported
+ raises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
+
+ assert unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd'
+
+ def test_codecs_utf8(self):
+ assert u''.encode('utf-8') == ''
+ assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
+ assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
+ assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96'
+ assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80'
+ assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80'
+ assert (u'\ud800\udc02'*1000).encode('utf-8') == '\xf0\x90\x80\x82'*1000
+ assert (
+ u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
+ u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
+ u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
+ u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
+ u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
+ u' Nunstuck git und'.encode('utf-8') ==
+ '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
+ '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
+ '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
+ '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
+ '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
+ '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
+ '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
+ '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
+ '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
+ '\xe3\x80\x8cWenn ist das Nunstuck git und'
+ )
+
+ # UTF-8 specific decoding tests
+ assert unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456'
+ assert unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002'
+ assert unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac'
+
+ def test_codecs_errors(self):
+ # Error handling (encoding)
+ raises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
+ raises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
+ assert u'Andr\202 x'.encode('ascii','ignore') == "Andr x"
+ assert u'Andr\202 x'.encode('ascii','replace') == "Andr? x"
+
+ # Error handling (decoding)
+ raises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
+ raises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
+ assert unicode('Andr\202 x','ascii','ignore') == u"Andr x"
+ assert unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x'
+
+ # Error handling (unknown character names)
+ assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
+
+ # Error handling (truncated escape sequence)
+ raises(UnicodeError, "\\".decode, "unicode-escape")
+
+ def test_repr_bug(self):
+ assert (repr(u'\U00090418\u027d\U000582b9\u54c3\U000fcb6e') ==
+ "u'\\U00090418\\u027d\\U000582b9\\u54c3\\U000fcb6e'")
+ assert (repr(u'\n') ==
+ "u'\\n'")
Modified: pypy/dist/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/dist/pypy/objspace/std/unicodeobject.py (original)
+++ pypy/dist/pypy/objspace/std/unicodeobject.py Wed Nov 14 20:28:11 2007
@@ -26,6 +26,12 @@
def unwrap(w_self, space):
# for testing
return w_self._value
+
+ def create_if_subclassed(w_self):
+ if type(w_self) is W_UnicodeObject:
+ return w_self
+ return W_UnicodeObject(w_self._value)
+
W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
registerimplementation(W_UnicodeObject)
@@ -59,7 +65,8 @@
# string-to-unicode delegation
def delegate_String2Unicode(space, w_str):
- w_uni = space.call_function(space.w_unicode, w_str)
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ w_uni = unicode_from_string(space, w_str)
assert isinstance(w_uni, W_UnicodeObject) # help the annotator!
return w_uni
@@ -92,17 +99,20 @@
return W_UnicodeObject(w_left._value + w_right._value)
def add__String_Unicode(space, w_left, w_right):
- return space.add(space.call_function(space.w_unicode, w_left) , w_right)
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.add(unicode_from_string(space, w_left) , w_right)
add__Rope_Unicode = add__String_Unicode
def add__Unicode_String(space, w_left, w_right):
- return space.add(w_left, space.call_function(space.w_unicode, w_right))
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.add(w_left, unicode_from_string(space, w_right))
add__Unicode_Rope = add__Unicode_String
def contains__String_Unicode(space, w_container, w_item):
- return space.contains(space.call_function(space.w_unicode, w_container), w_item )
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.contains(unicode_from_string(space, w_container), w_item )
contains__Rope_Unicode = contains__String_Unicode
@@ -311,8 +321,9 @@
def unicode_strip__Unicode_Unicode(space, w_self, w_chars):
return _strip(space, w_self, w_chars, 1, 1)
def unicode_strip__Unicode_String(space, w_self, w_chars):
+ from pypy.objspace.std.unicodetype import unicode_from_string
return space.call_method(w_self, 'strip',
- space.call_function(space.w_unicode, w_chars))
+ unicode_from_string(space, w_chars))
unicode_strip__Unicode_Rope = unicode_strip__Unicode_String
def unicode_lstrip__Unicode_None(space, w_self, w_chars):
@@ -320,8 +331,9 @@
def unicode_lstrip__Unicode_Unicode(space, w_self, w_chars):
return _strip(space, w_self, w_chars, 1, 0)
def unicode_lstrip__Unicode_String(space, w_self, w_chars):
+ from pypy.objspace.std.unicodetype import unicode_from_string
return space.call_method(w_self, 'lstrip',
- space.call_function(space.w_unicode, w_chars))
+ unicode_from_string(space, w_chars))
unicode_lstrip__Unicode_Rope = unicode_lstrip__Unicode_String
@@ -330,8 +342,9 @@
def unicode_rstrip__Unicode_Unicode(space, w_self, w_chars):
return _strip(space, w_self, w_chars, 0, 1)
def unicode_rstrip__Unicode_String(space, w_self, w_chars):
+ from pypy.objspace.std.unicodetype import unicode_from_string
return space.call_method(w_self, 'rstrip',
- space.call_function(space.w_unicode, w_chars))
+ unicode_from_string(space, w_chars))
unicode_rstrip__Unicode_Rope = unicode_rstrip__Unicode_String
@@ -481,7 +494,7 @@
fillchar = _to_unichar_w(space, w_fillchar)
padding = width - len(self)
if padding < 0:
- return space.call_function(space.w_unicode, w_self)
+ return w_self.create_if_subclassed()
leftpad = padding // 2 + (padding & width & 1)
result = [fillchar] * width
for i in range(len(self)):
@@ -494,7 +507,7 @@
fillchar = _to_unichar_w(space, w_fillchar)
padding = width - len(self)
if padding < 0:
- return space.call_function(space.w_unicode, w_self)
+ return w_self.create_if_subclassed()
result = [fillchar] * width
for i in range(len(self)):
result[i] = self[i]
@@ -506,7 +519,7 @@
fillchar = _to_unichar_w(space, w_fillchar)
padding = width - len(self)
if padding < 0:
- return space.call_function(space.w_unicode, w_self)
+ return w_self.create_if_subclassed()
result = [fillchar] * width
for i in range(len(self)):
result[padding + i] = self[i]
@@ -519,7 +532,7 @@
return W_UnicodeObject(u'0' * width)
padding = width - len(self)
if padding <= 0:
- return space.call_function(space.w_unicode, w_self)
+ return w_self.create_if_subclassed()
result = [u'0'] * width
for i in range(len(self)):
result[padding + i] = self[i]
@@ -735,28 +748,17 @@
return W_UnicodeObject(w_new._value.join(parts))
-app = gateway.applevel(r'''
-import sys
-
-def unicode_encode__Unicode_ANY_ANY(unistr, encoding=None, errors=None):
- import codecs, sys
+def unicode_encode__Unicode_ANY_ANY(space, w_unistr,
+ w_encoding=None,
+ w_errors=None):
+
+ from pypy.objspace.std.unicodetype import getdefaultencoding, \
+ _get_encoding_and_errors, encode_object
+ encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
if encoding is None:
- encoding = sys.getdefaultencoding()
-
- encoder = codecs.getencoder(encoding)
- if errors is None:
- retval, lenght = encoder(unistr)
- else:
- retval, length = encoder(unistr, errors)
- if not isinstance(retval,str):
- raise TypeError("encoder did not return a string object (type=%s)" %
- type(retval).__name__)
- return retval
-''')
-
-
-
-unicode_encode__Unicode_ANY_ANY = app.interphook('unicode_encode__Unicode_ANY_ANY')
+ encoding = getdefaultencoding(space)
+ w_retval = encode_object(space, w_unistr, encoding, errors)
+ return w_retval
def unicode_partition__Unicode_Unicode(space, w_unistr, w_unisub):
unistr = w_unistr._value
@@ -859,36 +861,23 @@
quote = '"'
else:
quote = '\''
- result = ['\0'] * (3 + size*6)
- result[0] = 'u'
- result[1] = quote
- i = 2
+ result = ['u', quote]
j = 0
while j<len(chars):
ch = chars[j]
-## if ch == u"'":
-## quote ='''"'''
-## result[1] = quote
-## result[i] = '\''
-## #result[i + 1] = "'"
-## i += 1
-## continue
code = ord(ch)
if code >= 0x10000:
# Resize if needed
- if i + 12 > len(result):
- result.extend(['\0'] * 100)
- result[i] = '\\'
- result[i + 1] = "U"
- result[i + 2] = hexdigits[(code >> 28) & 0xf]
- result[i + 3] = hexdigits[(code >> 24) & 0xf]
- result[i + 4] = hexdigits[(code >> 20) & 0xf]
- result[i + 5] = hexdigits[(code >> 16) & 0xf]
- result[i + 6] = hexdigits[(code >> 12) & 0xf]
- result[i + 7] = hexdigits[(code >> 8) & 0xf]
- result[i + 8] = hexdigits[(code >> 4) & 0xf]
- result[i + 9] = hexdigits[(code >> 0) & 0xf]
- i += 10
+ result.extend(['\\', "U",
+ hexdigits[(code >> 28) & 0xf],
+ hexdigits[(code >> 24) & 0xf],
+ hexdigits[(code >> 20) & 0xf],
+ hexdigits[(code >> 16) & 0xf],
+ hexdigits[(code >> 12) & 0xf],
+ hexdigits[(code >> 8) & 0xf],
+ hexdigits[(code >> 4) & 0xf],
+ hexdigits[(code >> 0) & 0xf],
+ ])
j += 1
continue
if code >= 0xD800 and code < 0xDC00:
@@ -897,70 +886,59 @@
code2 = ord(ch2)
if code2 >= 0xDC00 and code2 <= 0xDFFF:
code = (((code & 0x03FF) << 10) | (code2 & 0x03FF)) + 0x00010000
- if i + 12 > len(result):
- result.extend(['\0'] * 100)
- result[i] = '\\'
- result[i + 1] = "U"
- result[i + 2] = hexdigits[(code >> 28) & 0xf]
- result[i + 3] = hexdigits[(code >> 24) & 0xf]
- result[i + 4] = hexdigits[(code >> 20) & 0xf]
- result[i + 5] = hexdigits[(code >> 16) & 0xf]
- result[i + 6] = hexdigits[(code >> 12) & 0xf]
- result[i + 7] = hexdigits[(code >> 8) & 0xf]
- result[i + 8] = hexdigits[(code >> 4) & 0xf]
- result[i + 9] = hexdigits[(code >> 0) & 0xf]
- i += 10
+ result.extend(["U",
+ hexdigits[(code >> 28) & 0xf],
+ hexdigits[(code >> 24) & 0xf],
+ hexdigits[(code >> 20) & 0xf],
+ hexdigits[(code >> 16) & 0xf],
+ hexdigits[(code >> 12) & 0xf],
+ hexdigits[(code >> 8) & 0xf],
+ hexdigits[(code >> 4) & 0xf],
+ hexdigits[(code >> 0) & 0xf],
+ ])
j += 2
continue
if code >= 0x100:
- result[i] = '\\'
- result[i + 1] = "u"
- result[i + 2] = hexdigits[(code >> 12) & 0xf]
- result[i + 3] = hexdigits[(code >> 8) & 0xf]
- result[i + 4] = hexdigits[(code >> 4) & 0xf]
- result[i + 5] = hexdigits[(code >> 0) & 0xf]
- i += 6
+ result.extend(['\\', "u",
+ hexdigits[(code >> 12) & 0xf],
+ hexdigits[(code >> 8) & 0xf],
+ hexdigits[(code >> 4) & 0xf],
+ hexdigits[(code >> 0) & 0xf],
+ ])
j += 1
continue
if code == ord('\\') or code == ord(quote):
- result[i] = '\\'
- result[i + 1] = chr(code)
- i += 2
+ result.append('\\')
+ result.append(chr(code))
j += 1
continue
if code == ord('\t'):
- result[i] = '\\'
- result[i + 1] = "t"
- i += 2
+ result.append('\\')
+ result.append('t')
j += 1
continue
if code == ord('\r'):
- result[i] = '\\'
- result[i + 1] = "r"
- i += 2
+ result.append('\\')
+ result.append('r')
j += 1
continue
if code == ord('\n'):
- result[i] = '\\'
- result[i + 1] = "n"
- i += 2
+ result.append('\\')
+ result.append('n')
j += 1
continue
if code < ord(' ') or code >= 0x7f:
- result[i] = '\\'
- result[i + 1] = "x"
- result[i + 2] = hexdigits[(code >> 4) & 0xf]
- result[i + 3] = hexdigits[(code >> 0) & 0xf]
- i += 4
+ result.extend(['\\', "x",
+ hexdigits[(code >> 4) & 0xf],
+ hexdigits[(code >> 0) & 0xf],
+ ])
j += 1
continue
- result[i] = chr(code)
- i += 1
+ result.append(chr(code))
j += 1
- result[i] = quote
- i += 1
- return space.wrap(''.join(result[:i]))
+ result.append(quote)
+ return space.wrap(''.join(result))
def mod__Unicode_ANY(space, w_format, w_values):
@@ -983,47 +961,58 @@
from pypy.objspace.std.stringobject import W_StringObject
from pypy.objspace.std.ropeobject import W_RopeObject
def str_strip__String_Unicode(space, w_self, w_chars):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'strip', w_chars)
str_strip__Rope_Unicode = str_strip__String_Unicode
def str_lstrip__String_Unicode(space, w_self, w_chars):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'lstrip', w_chars)
str_lstrip__Rope_Unicode = str_lstrip__String_Unicode
def str_rstrip__String_Unicode(space, w_self, w_chars):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'rstrip', w_chars)
str_rstrip__Rope_Unicode = str_rstrip__String_Unicode
def str_count__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'count', w_substr, w_start, w_end)
str_count__Rope_Unicode_ANY_ANY = str_count__String_Unicode_ANY_ANY
def str_find__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'find', w_substr, w_start, w_end)
str_find__Rope_Unicode_ANY_ANY = str_find__String_Unicode_ANY_ANY
def str_rfind__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'rfind', w_substr, w_start, w_end)
str_rfind__Rope_Unicode_ANY_ANY = str_rfind__String_Unicode_ANY_ANY
def str_index__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'index', w_substr, w_start, w_end)
str_index__Rope_Unicode_ANY_ANY = str_index__String_Unicode_ANY_ANY
def str_rindex__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'rindex', w_substr, w_start, w_end)
str_rindex__Rope_Unicode_ANY_ANY = str_rindex__String_Unicode_ANY_ANY
def str_replace__String_Unicode_Unicode_ANY(space, w_self, w_old, w_new, w_maxsplit):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'replace', w_old, w_new, w_maxsplit)
str_replace__Rope_Unicode_Unicode_ANY = str_replace__String_Unicode_Unicode_ANY
def str_split__String_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'split', w_delim, w_maxsplit)
str_split__Rope_Unicode_ANY = str_split__String_Unicode_ANY
def str_rsplit__String_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
- return space.call_method(space.call_function(space.w_unicode, w_self),
+ from pypy.objspace.std.unicodetype import unicode_from_string
+ return space.call_method(unicode_from_string(space, w_self),
'rsplit', w_delim, w_maxsplit)
str_rsplit__Rope_Unicode_ANY = str_rsplit__String_Unicode_ANY
register_all(vars(), stringtype)
Modified: pypy/dist/pypy/objspace/std/unicodetype.py
==============================================================================
--- pypy/dist/pypy/objspace/std/unicodetype.py (original)
+++ pypy/dist/pypy/objspace/std/unicodetype.py Wed Nov 14 20:28:11 2007
@@ -145,7 +145,36 @@
def getdefaultencoding(space):
return space.sys.defaultencoding
-def unicode_from_encoded_object(space, w_obj, encoding, errors):
+def _get_encoding_and_errors(space, w_encoding, w_errors):
+ if space.is_w(w_encoding, space.w_None):
+ encoding = None
+ else:
+ encoding = space.str_w(w_encoding)
+ if space.is_w(w_errors, space.w_None):
+ errors = None
+ else:
+ errors = space.str_w(w_errors)
+ return encoding, errors
+
+def encode_object(space, w_object, encoding, errors):
+ w_codecs = space.getbuiltinmodule("_codecs")
+ w_encode = space.getattr(w_codecs, space.wrap("encode"))
+ if encoding is None:
+ encoding = getdefaultencoding(space)
+ if errors is None:
+ w_retval = space.call_function(w_encode, w_object, space.wrap(encoding))
+ else:
+ w_retval = space.call_function(w_encode, w_object, space.wrap(encoding),
+ space.wrap(errors))
+ if not space.is_true(space.isinstance(w_retval, space.w_str)):
+ raise OperationError(
+ space.w_TypeError,
+ space.wrap(
+ "encoder did not return an string object (type=%s)" %
+ space.type(w_retval).getname(space, '?')))
+ return w_retval
+
+def decode_object(space, w_obj, encoding, errors):
w_codecs = space.getbuiltinmodule("_codecs")
if encoding is None:
encoding = getdefaultencoding(space)
@@ -155,6 +184,11 @@
else:
w_retval = space.call_function(w_decode, w_obj, space.wrap(encoding),
space.wrap(errors))
+ return w_retval
+
+
+def unicode_from_encoded_object(space, w_obj, encoding, errors):
+ w_retval = decode_object(space, w_obj, encoding, errors)
if not space.is_true(space.isinstance(w_retval, space.w_unicode)):
raise OperationError(
space.w_TypeError,
@@ -163,7 +197,6 @@
space.type(w_retval).getname(space, '?')))
return w_retval
-
def unicode_from_object(space, w_obj):
if space.is_true(space.isinstance(w_obj, space.w_str)):
w_res = w_obj
@@ -188,7 +221,7 @@
from pypy.objspace.std.unicodeobject import W_UnicodeObject
encoding = getdefaultencoding(space)
if encoding != 'ascii':
- return unicode_from_object(space, w_str)
+ return unicode_from_encoded_object(space, w_str, encoding, "strict")
s = space.str_w(w_str)
try:
return W_UnicodeObject(s.decode("ascii"))
@@ -197,17 +230,6 @@
return unicode_from_object(space, w_str)
-def _get_encoding_and_errors(space, w_encoding, w_errors):
- if space.is_w(w_encoding, space.w_None):
- encoding = None
- else:
- encoding = space.str_w(w_encoding)
- if space.is_w(w_errors, space.w_None):
- errors = None
- else:
- errors = space.str_w(w_errors)
- return encoding, errors
-
def descr__new__(space, w_unicodetype, w_obj='', w_encoding=None, w_errors=None):
# NB. the default value of w_obj is really a *wrapped* empty string:
# there is gateway magic at work
@@ -231,7 +253,7 @@
w_value = unicode_from_object(space, w_obj)
else:
w_value = unicode_from_encoded_object(space, w_obj, encoding, errors)
- # help the annotator! also the ._value depends on W_UnicodeObject layout
+ # XXX this is not true when there are different unicode implementations
assert isinstance(w_value, W_UnicodeObject)
w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
W_UnicodeObject.__init__(w_newobj, w_value._value)
More information about the Pypy-commit
mailing list