[pypy-commit] pypy stdlib-2.7.3: Add PyUnicode_EncodeDecimal.
amauryfa
noreply at buildbot.pypy.org
Tue Jun 12 21:21:54 CEST 2012
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-2.7.3
Changeset: r55626:9d34d072a9e1
Date: 2012-06-12 21:16 +0200
http://bitbucket.org/pypy/pypy/changeset/9d34d072a9e1/
Log: Add PyUnicode_EncodeDecimal. Also change a bit how error handlers
are used: don't always .encode() the replacement string again, let
the encoding function do the conversion it wants.
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -53,14 +53,8 @@
raise operationerrfmt(
space.w_IndexError,
"position %d from error handler out of bounds", newpos)
- if decode:
- replace = space.unicode_w(w_replace)
- return replace, newpos
- else:
- from pypy.objspace.std.unicodetype import encode_object
- w_str = encode_object(space, w_replace, encoding, None)
- replace = space.str_w(w_str)
- return replace, newpos
+ replace = space.unicode_w(w_replace)
+ return replace, newpos
return unicode_call_errorhandler
def get_unicodedata_handler(self, space):
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -279,8 +279,10 @@
replace = "?"
else:
assert errorcb
- replace, end = errorcb(errors, namecb, reason,
- unicodedata, start, end)
+ ret, end = errorcb(errors, namecb, reason,
+ unicodedata, start, end)
+ codec = pypy_cjk_enc_getcodec(encodebuf)
+ replace = encode(codec, ret, "strict", errorcb, namecb)
inbuf = rffi.get_nonmovingbuffer(replace)
try:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -196,6 +196,31 @@
assert space.unwrap(w_s) == u'späm'.encode('utf-8')
rffi.free_wcharp(u)
+    def test_encode_decimal(self, space, api):
+        """Check PyUnicode_EncodeDecimal on valid input, on unencodable
+        input without an error mode, and with errors="replace".
+
+        Every buffer is allocated with length+1 bytes, as the API requires
+        room for the trailing 0 byte.
+        """
+        # Fully encodable input: copied through unchanged.
+        with rffi.scoped_unicode2wcharp(u' (12, 35 ABC)') as u:
+            with rffi.scoped_alloc_buffer(14) as buf:
+                res = api.PyUnicode_EncodeDecimal(u, 13, buf.raw, None)
+                s = buf.str(13)
+        assert res == 0
+        assert s == ' (12, 35 ABC)'
+
+        # Unencodable characters and no error mode: the call reports -1.
+        with rffi.scoped_unicode2wcharp(u' (12, \u1234\u1235)') as u:
+            with rffi.scoped_alloc_buffer(10) as buf:
+                res = api.PyUnicode_EncodeDecimal(u, 9, buf.raw, None)
+        assert res == -1
+        api.PyErr_Clear()
+
+        # errors="replace": each unencodable character becomes '?'.
+        with rffi.scoped_unicode2wcharp(u' (12, \u1234\u1235)') as u:
+            with rffi.scoped_alloc_buffer(10) as buf:
+                with rffi.scoped_str2charp("replace") as errors:
+                    res = api.PyUnicode_EncodeDecimal(u, 9, buf.raw,
+                                                      errors)
+                s = buf.str(9)
+        assert res == 0
+        # Compare the encoded bytes, not the integer return code.
+        assert s == " (12, ??)"
+        api.PyErr_Clear()
+
+
def test_IS(self, space, api):
for char in [0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x85, 0xa0, 0x1680, 0x2000, 0x2001, 0x2002,
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -12,6 +12,7 @@
make_typedescr, get_typedescr)
from pypy.module.cpyext.stringobject import PyString_Check
from pypy.module.sys.interp_encoding import setdefaultencoding
+from pypy.module._codecs.interp_codecs import CodecState
from pypy.objspace.std import unicodeobject, unicodetype, stringtype
from pypy.rlib import runicode
from pypy.tool.sourcetools import func_renamer
@@ -610,6 +611,34 @@
return space.wrap(result)
+@cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, rffi.CCHARP],
+             rffi.INT_real, error=-1)
+def PyUnicode_EncodeDecimal(space, s, length, output, llerrors):
+    """Takes a Unicode string holding a decimal value and writes it
+    into an output buffer using standard ASCII digit codes.
+
+    The output buffer has to provide at least length+1 bytes of
+    storage area.  The output string is 0-terminated.
+
+    The encoder converts whitespace to ' ', decimal characters to
+    their corresponding ASCII digit and all other Latin-1 characters
+    except NUL as-is.  Characters outside this range (Unicode ordinals
+    1-255) are treated as errors.  This includes embedded NUL characters.
+
+    'llerrors' is the name of the error mode as a C string, or NULL;
+    it is passed on to runicode.unicode_encode_decimal together with
+    the CodecState encode error handler.
+
+    Returns 0 on success; -1 is the error value declared to cpython_api.
+    """
+    u = rffi.wcharpsize2unicode(s, length)
+    if llerrors:
+        errors = rffi.charp2str(llerrors)
+    else:
+        errors = None
+    state = space.fromcache(CodecState)
+    result = runicode.unicode_encode_decimal(u, length, errors,
+                                             state.encode_error_handler)
+    # The error handler may substitute a replacement of a different size,
+    # so len(result) is not necessarily 'length'.  Never read past the end
+    # of 'result', and never write more than the length+1 bytes the caller
+    # guarantees for 'output'.
+    count = min(len(result), length)
+    for i in range(count):
+        output[i] = result[i]
+    # Terminate the C string, as promised above.
+    output[count] = '\0'
+    return 0
+
@cpython_api([PyObject, PyObject], rffi.INT_real, error=-2)
def PyUnicode_Compare(space, w_left, w_right):
"""Compare two strings and return -1, 0, 1 for less than, equal, and greater
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -4,6 +4,7 @@
from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
from pypy.rlib.rarithmetic import r_uint, intmask
+from pypy.module.unicodedata import unicodedb
if rffi.sizeof(lltype.UniChar) == 4:
MAXUNICODE = 0x10ffff
@@ -958,7 +959,12 @@
collend += 1
r, pos = errorhandler(errors, encoding, reason, p,
collstart, collend)
- result.append(r)
+ for ch in r:
+ if ord(ch) < limit:
+ result.append(chr(ord(ch)))
+ else:
+ errorhandler("strict", encoding, reason, p,
+ collstart, collend)
return result.build()
@@ -1027,7 +1033,7 @@
"character maps to <undefined>",
s, pos, pos + 1)
for ch2 in res:
- c2 = mapping.get(unichr(ord(ch2)), '')
+ c2 = mapping.get(ch2, '')
if len(c2) == 0:
errorhandler(
"strict", "charmap",
@@ -1544,3 +1550,71 @@
rffi.keep_buffer_alive_until_here(raw_buf, gc_buf)
finally:
rffi.free_nonmoving_unicodebuffer(p, dataptr)
+
+# ____________________________________________________________
+# Decimal Encoder
+def _encode_decimal_ordinal(ch):
+    """Return the byte that code point 'ch' encodes to, or '' if none.
+
+    Whitespace maps to ' ', a character with a decimal value in unicodedb
+    maps to the matching ASCII digit, and any other ordinal in 1-255 maps
+    to the byte with the same value.
+    """
+    if unicodedb.isspace(ch):
+        return ' '
+    try:
+        decimal = unicodedb.decimal(ch)
+    except KeyError:
+        # not a decimal
+        pass
+    else:
+        return chr(48 + decimal)
+    if 0 < ch < 256:
+        return chr(ch)
+    return ''
+
+
+def unicode_encode_decimal(s, size, errors, errorhandler=None):
+    """Converts whitespace to ' ', decimal characters to their
+    corresponding ASCII digit and all other Latin-1 characters except
+    NUL as-is.  Characters outside this range (Unicode ordinals 1-255)
+    are treated as errors.  This includes embedded NUL characters.
+
+    's' is the unicode input and 'size' the number of characters to
+    encode.  'errors' is handed to 'errorhandler', which defaults to
+    raise_unicode_exception_encode.  The handler is called as
+    errorhandler(errors, 'decimal', msg, s, start, end) and must return
+    the replacement text together with the position to resume at.
+
+    Returns the encoded byte string.
+    """
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_encode
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        encoded = _encode_decimal_ordinal(ord(s[pos]))
+        if encoded:
+            result.append(encoded)
+            pos += 1
+            continue
+        # All other characters are considered unencodable: collect the
+        # whole run of them so that the handler is called once per run.
+        collstart = pos
+        collend = collstart + 1
+        while (collend < size and
+               not _encode_decimal_ordinal(ord(s[collend]))):
+            collend += 1
+        msg = "invalid decimal Unicode string"
+        r, pos = errorhandler(errors, 'decimal',
+                              msg, s, collstart, collend)
+        # The replacement is unicode text and must itself be encodable;
+        # otherwise the handler is invoked again in 'strict' mode.
+        for char in r:
+            encoded = _encode_decimal_ordinal(ord(char))
+            if encoded:
+                result.append(encoded)
+            else:
+                errorhandler('strict', 'decimal',
+                             msg, s, collstart, collend)
+    return result.build()
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -628,8 +628,13 @@
assert decoder(seq, len(seq), None, final=True,
errorhandler=self.ignore_handler) == (res, len(seq))
+class TestEncoding(UnicodeTests):
+    def replace_handler(self, errors, codec, message, input, start, end):
+        """Encode error handler used by the tests in this class.
+
+        For any mode other than 'strict' the offending range is replaced
+        by a single u'?' and encoding resumes at 'end'.  In 'strict' mode
+        runicode.raise_unicode_exception_encode is called first.
+        """
+        if errors != 'strict':
+            return u'?', end
+        runicode.raise_unicode_exception_encode(errors, codec, message,
+                                                input, start, end)
+        return u'?', end
-class TestEncoding(UnicodeTests):
def test_all_ascii(self):
for i in range(128):
if sys.version >= "2.7":
@@ -701,6 +706,12 @@
self.checkencode(u"\N{GREEK CAPITAL LETTER PHI}", "mbcs") # a F
self.checkencode(u"\N{GREEK CAPITAL LETTER PSI}", "mbcs") # a ?
+    def test_encode_decimal(self):
+        """Exercise the 'decimal' encoder obtained from getencoder()."""
+        encode = self.getencoder('decimal')
+
+        # Digits, punctuation and spaces come back unchanged.
+        plain = encode(u' 12, 34 ', 8, None)
+        assert plain == ' 12, 34 '
+
+        # No error mode and no handler: U+1234 is expected to raise.
+        raises(UnicodeEncodeError, encode, u' 12, \u1234 ', 7, None)
+
+        # With the class's replace_handler, U+1234 becomes '?'.
+        replaced = encode(u'u\u1234', 2, 'replace', self.replace_handler)
+        assert replaced == 'u?'
+
class TestTranslation(object):
def setup_class(cls):
if runicode.MAXUNICODE != sys.maxunicode:
More information about the pypy-commit
mailing list