[pypy-commit] pypy stdlib-2.7.3: Add PyUnicode_EncodeDecimal.

Tue Jun 12 21:21:54 CEST 2012

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-2.7.3
Changeset: r55626:9d34d072a9e1
Date: 2012-06-12 21:16 +0200
http://bitbucket.org/pypy/pypy/changeset/9d34d072a9e1/

Log:	Add PyUnicode_EncodeDecimal. Also change a bit how error handlers
	are used: don't always .encode() the replacement string again, let
	the encoding function do the conversion it wants.

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -53,14 +53,8 @@
                 raise operationerrfmt(
                     space.w_IndexError,
                     "position %d from error handler out of bounds", newpos)
-            if decode:
-                replace = space.unicode_w(w_replace)
-                return replace, newpos
-            else:
-                from pypy.objspace.std.unicodetype import encode_object
-                w_str = encode_object(space, w_replace, encoding, None)
-                replace = space.str_w(w_str)
-                return replace, newpos
+            replace = space.unicode_w(w_replace)
+            return replace, newpos
         return unicode_call_errorhandler
 
     def get_unicodedata_handler(self, space):
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -279,8 +279,10 @@
             replace = "?"
     else:
         assert errorcb
-        replace, end = errorcb(errors, namecb, reason,
-                               unicodedata, start, end)
+        ret, end = errorcb(errors, namecb, reason,
+                           unicodedata, start, end)
+        codec = pypy_cjk_enc_getcodec(encodebuf)
+        replace = encode(codec, ret, "strict", errorcb, namecb)
     inbuf = rffi.get_nonmovingbuffer(replace)
     try:
         r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -196,6 +196,31 @@
         assert space.unwrap(w_s) == u'sp&#65533;m'.encode('utf-8')
         rffi.free_wcharp(u)
 
+    def test_encode_decimal(self, space, api):
+        """PyUnicode_EncodeDecimal: plain, failing and "replace" cases."""
+        with rffi.scoped_unicode2wcharp(u' (12, 35 ABC)') as u:
+            with rffi.scoped_alloc_buffer(14) as buf:  # length + 1 for NUL
+                res = api.PyUnicode_EncodeDecimal(u, 13, buf.raw, None)
+                s = buf.str(13)
+        assert res == 0
+        assert s == ' (12, 35 ABC)'
+
+        with rffi.scoped_unicode2wcharp(u' (12, \u1234\u1235)') as u:
+            with rffi.scoped_alloc_buffer(10) as buf:
+                res = api.PyUnicode_EncodeDecimal(u, 9, buf.raw, None)
+        assert res == -1  # errors=None: unencodable characters must fail
+        api.PyErr_Clear()
+
+        with rffi.scoped_unicode2wcharp(u' (12, \u1234\u1235)') as u:
+            with rffi.scoped_alloc_buffer(10) as buf:
+                with rffi.scoped_str2charp("replace") as errors:
+                    res = api.PyUnicode_EncodeDecimal(u, 9, buf.raw, errors)
+                s = buf.str(9)
+        assert res == 0
+        assert s == " (12, ??)"  # the buffer, not the 0/-1 status code
+        api.PyErr_Clear()
+
+
     def test_IS(self, space, api):
         for char in [0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x1c, 0x1d, 0x1e, 0x1f,
                      0x20, 0x85, 0xa0, 0x1680, 0x2000, 0x2001, 0x2002,
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -12,6 +12,7 @@
     make_typedescr, get_typedescr)
 from pypy.module.cpyext.stringobject import PyString_Check
 from pypy.module.sys.interp_encoding import setdefaultencoding
+from pypy.module._codecs.interp_codecs import CodecState
 from pypy.objspace.std import unicodeobject, unicodetype, stringtype
 from pypy.rlib import runicode
 from pypy.tool.sourcetools import func_renamer
@@ -610,6 +611,34 @@
 
     return space.wrap(result)
 
+@cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, rffi.CCHARP],
+             rffi.INT_real, error=-1)
+def PyUnicode_EncodeDecimal(space, s, length, output, llerrors):
+    """Takes a Unicode string holding a decimal value and writes it
+    into an output buffer using standard ASCII digit codes.
+
+    The output buffer has to provide at least length+1 bytes of
+    storage area. The output string is 0-terminated.
+
+    The encoder converts whitespace to ' ', decimal characters to
+    their corresponding ASCII digit and all other Latin-1 characters
+    except \0 as-is. Characters outside this range (Unicode ordinals
+    1-255) are treated as errors. This includes embedded NULL bytes.
+
+    Returns 0 on success, -1 on failure.
+    """
+    u = rffi.wcharpsize2unicode(s, length)
+    errors = rffi.charp2str(llerrors) if llerrors else None
+    state = space.fromcache(CodecState)
+    result = runicode.unicode_encode_decimal(u, length, errors,
+                                             state.encode_error_handler)
+    # Error handlers may change the length (e.g. "ignore" shortens it), so
+    # copy len(result) bytes rather than `length`, then NUL-terminate.
+    for i in range(len(result)):
+        output[i] = result[i]
+    output[len(result)] = '\0'
+    return 0
+
 @cpython_api([PyObject, PyObject], rffi.INT_real, error=-2)
 def PyUnicode_Compare(space, w_left, w_right):
     """Compare two strings and return -1, 0, 1 for less than, equal, and greater
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -4,6 +4,7 @@
 from pypy.rlib.objectmodel import we_are_translated, specialize
 from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
 from pypy.rlib.rarithmetic import r_uint, intmask
+from pypy.module.unicodedata import unicodedb
 
 if rffi.sizeof(lltype.UniChar) == 4:
     MAXUNICODE = 0x10ffff
@@ -958,7 +959,12 @@
                 collend += 1
             r, pos = errorhandler(errors, encoding, reason, p,
                                   collstart, collend)
-            result.append(r)
+            for ch in r:
+                if ord(ch) < limit:
+                    result.append(chr(ord(ch)))
+                else:
+                    errorhandler("strict", encoding, reason, p,
+                                 collstart, collend)
 
     return result.build()
 
@@ -1027,7 +1033,7 @@
                                     "character maps to <undefined>",
                                     s, pos, pos + 1)
             for ch2 in res:
-                c2 = mapping.get(unichr(ord(ch2)), '')
+                c2 = mapping.get(ch2, '')
                 if len(c2) == 0:
                     errorhandler(
                         "strict", "charmap",
@@ -1544,3 +1550,71 @@
                 rffi.keep_buffer_alive_until_here(raw_buf, gc_buf)
         finally:
             rffi.free_nonmoving_unicodebuffer(p, dataptr)
+
+# ____________________________________________________________
+# Decimal Encoder
+def unicode_encode_decimal(s, size, errors, errorhandler=None):
+    """Encode the first `size` characters of `s` to a decimal byte string.
+    Whitespace becomes ' ', decimal characters become their ASCII digit and
+    other ordinals 1-255 are copied as-is.  Anything else (NUL included) is
+    given to `errorhandler`; its replacement is encoded by the same rules.
+    """
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_encode
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        if unicodedb.isspace(ch):
+            result.append(' ')
+            pos += 1
+            continue
+        try:
+            decimal = unicodedb.decimal(ch)
+        except KeyError:
+            pass  # not a decimal character, try the Latin-1 range next
+        else:
+            result.append(chr(48 + decimal))  # 48 == ord('0')
+            pos += 1
+            continue
+        if 0 < ch < 256:
+            result.append(chr(ch))
+            pos += 1
+            continue
+        # All other characters are considered unencodable
+        collstart = pos
+        collend = collstart + 1
+        while collend < size:  # extend the run over following bad chars
+            ch = ord(s[collend])
+            try:
+                if (0 < ch < 256 or
+                    unicodedb.isspace(ch) or
+                    unicodedb.decimal(ch) >= 0):
+                    break  # first encodable character ends the run
+            except KeyError:
+                # not a decimal
+                pass
+            collend += 1
+        msg = "invalid decimal Unicode string"
+        r, pos = errorhandler(errors, 'decimal',  # pos: where to resume
+                              msg, s, collstart, collend)
+        for char in r:  # the replacement must itself be encodable
+            ch = ord(char)
+            if unicodedb.isspace(ch):
+                result.append(' ')
+                continue
+            try:
+                decimal = unicodedb.decimal(ch)
+            except KeyError:
+                pass
+            else:
+                result.append(chr(48 + decimal))
+                continue
+            if 0 < ch < 256:
+                result.append(chr(ch))
+                continue
+            errorhandler('strict', 'decimal',  # 'strict' is expected to raise
+                         msg, s, collstart, collend)
+    return result.build()
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -628,8 +628,13 @@
             assert decoder(seq, len(seq), None, final=True,
                            errorhandler=self.ignore_handler) == (res, len(seq))
 
+class TestEncoding(UnicodeTests):
+    def replace_handler(self, errors, codec, message, input, start, end):
+        if errors=='strict':  # 'strict' is delegated to runicode's raiser
+            runicode.raise_unicode_exception_encode(errors, codec, message,
+                                                    input, start, end)
+        return u'?', end  # a single '?' for the run; resume at `end`
 
-class TestEncoding(UnicodeTests):
     def test_all_ascii(self):
         for i in range(128):
             if sys.version >= "2.7":
@@ -701,6 +706,12 @@
         self.checkencode(u"\N{GREEK CAPITAL LETTER PHI}", "mbcs") # a F
         self.checkencode(u"\N{GREEK CAPITAL LETTER PSI}", "mbcs") # a ?
 
+    def test_encode_decimal(self):
+        encoder = self.getencoder('decimal')  # encoder under test
+        assert encoder(u' 12, 34 ', 8, None) == ' 12, 34 '  # passes through
+        raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
+        assert encoder(u'u\u1234', 2, 'replace', self.replace_handler) == 'u?'
+
 class TestTranslation(object):
     def setup_class(cls):
         if runicode.MAXUNICODE != sys.maxunicode: