[pypy-commit] pypy default: add force_ignore/force_replace=True (py2 behavior) toggles to the mbcs functions, for py3k

pjenvey noreply at buildbot.pypy.org
Thu Apr 25 03:04:54 CEST 2013


Author: Philip Jenvey <pjenvey at underboss.org>
Branch: 
Changeset: r63597:c70e9a0f576f
Date: 2013-04-24 17:58 -0700
http://bitbucket.org/pypy/pypy/changeset/c70e9a0f576f/

Log:	Add force_ignore=True / force_replace=True (py2 behavior) toggles to
	the mbcs functions, for py3k, which will specify =False.

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1566,6 +1566,7 @@
     from rpython.rtyper.lltypesystem import lltype, rffi
     from rpython.rlib import rwin32
     CP_ACP = 0
+    BOOLP = lltype.Ptr(lltype.Array(rwin32.BOOL, hints={'nolength': True}))
 
     MultiByteToWideChar = rffi.llexternal('MultiByteToWideChar',
                                           [rffi.UINT, rwin32.DWORD,
@@ -1578,7 +1579,7 @@
                                           [rffi.UINT, rwin32.DWORD,
                                            rffi.CWCHARP, rffi.INT,
                                            rwin32.LPCSTR, rffi.INT,
-                                           rwin32.LPCSTR, rffi.VOIDP],
+                                           rwin32.LPCSTR, BOOLP],
                                           rffi.INT,
                                           calling_conv='win')
 
@@ -1586,12 +1587,31 @@
         # XXX don't know how to test this
         return False
 
-    def str_decode_mbcs(s, size, errors, final=False, errorhandler=None):
+    def _decode_mbcs_error(s, errorhandler):
+        if rwin32.GetLastError() == rwin32.ERROR_NO_UNICODE_TRANSLATION:
+            msg = ("No mapping for the Unicode character exists in the target "
+                   "multi-byte code page.")
+            errorhandler('strict', 'mbcs', msg, s, 0, 0)
+        else:
+            raise rwin32.lastWindowsError()
+
+    def str_decode_mbcs(s, size, errors, final=False, errorhandler=None,
+                        force_ignore=True):
+        if errorhandler is None:
+            errorhandler = default_unicode_error_decode
+
+        if not force_ignore and errors not in ('strict', 'ignore'):
+            msg = "mbcs encoding does not support errors='%s'" % errors
+            errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
         if size == 0:
             return u"", 0
 
-        if errorhandler is None:
-            errorhandler = default_unicode_error_decode
+        if force_ignore or errors == 'ignore':
+            flags = 0
+        else:
+            # strict
+            flags = rwin32.MB_ERR_INVALID_CHARS
 
         # Skip trailing lead-byte unless 'final' is set
         if not final and is_dbcs_lead_byte(s[size-1]):
@@ -1599,37 +1619,68 @@
 
         with rffi.scoped_nonmovingbuffer(s) as dataptr:
             # first get the size of the result
-            usize = MultiByteToWideChar(CP_ACP, 0,
+            usize = MultiByteToWideChar(CP_ACP, flags,
                                         dataptr, size,
                                         lltype.nullptr(rffi.CWCHARP.TO), 0)
             if usize == 0:
-                raise rwin32.lastWindowsError()
+                _decode_mbcs_error(s, errorhandler)
 
             with rffi.scoped_alloc_unicodebuffer(usize) as buf:
                 # do the conversion
-                if MultiByteToWideChar(CP_ACP, 0,
+                if MultiByteToWideChar(CP_ACP, flags,
                                        dataptr, size, buf.raw, usize) == 0:
-                    raise rwin32.lastWindowsError()
+                    _decode_mbcs_error(s, errorhandler)
                 return buf.str(usize), size
 
-    def unicode_encode_mbcs(s, size, errors, errorhandler=None):
+    def unicode_encode_mbcs(s, size, errors, errorhandler=None,
+                            force_replace=True):
+        if errorhandler is None:
+            errorhandler = default_unicode_error_encode
+
+        if not force_replace and errors not in ('strict', 'replace'):
+            msg = "mbcs encoding does not support errors='%s'" % errors
+            errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
         if size == 0:
             return ''
-        with rffi.scoped_nonmoving_unicodebuffer(s) as dataptr:
-            # first get the size of the result
-            mbcssize = WideCharToMultiByte(CP_ACP, 0,
-                                           dataptr, size, None, 0,
-                                           None, None)
-            if mbcssize == 0:
-                raise rwin32.lastWindowsError()
 
-            with rffi.scoped_alloc_buffer(mbcssize) as buf:
-                # do the conversion
-                if WideCharToMultiByte(CP_ACP, 0,
-                                       dataptr, size, buf.raw, mbcssize,
-                                       None, None) == 0:
+        if force_replace or errors == 'replace':
+            flags = 0
+            used_default_p = lltype.nullptr(BOOLP.TO)
+        else:
+            # strict
+            flags = rwin32.WC_NO_BEST_FIT_CHARS
+            used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
+            used_default_p[0] = rffi.cast(rwin32.BOOL, False)
+
+        try:
+            with rffi.scoped_nonmoving_unicodebuffer(s) as dataptr:
+                # first get the size of the result
+                mbcssize = WideCharToMultiByte(CP_ACP, flags,
+                                               dataptr, size, None, 0,
+                                               None, used_default_p)
+                if mbcssize == 0:
                     raise rwin32.lastWindowsError()
-                return buf.str(mbcssize)
+                # If we used a default char, then we failed!
+                if (used_default_p and
+                    rffi.cast(lltype.Bool, used_default_p[0])):
+                    errorhandler('strict', 'mbcs', "invalid character",
+                                 s, 0, 0)
+
+                with rffi.scoped_alloc_buffer(mbcssize) as buf:
+                    # do the conversion
+                    if WideCharToMultiByte(CP_ACP, flags,
+                                           dataptr, size, buf.raw, mbcssize,
+                                           None, used_default_p) == 0:
+                        raise rwin32.lastWindowsError()
+                    if (used_default_p and
+                        rffi.cast(lltype.Bool, used_default_p[0])):
+                        errorhandler('strict', 'mbcs', "invalid character",
+                                     s, 0, 0)
+                    return buf.str(mbcssize)
+        finally:
+            if used_default_p:
+                lltype.free(used_default_p, flavor='raw')
 
 # ____________________________________________________________
 # Decimal Encoder
diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py
--- a/rpython/rlib/rwin32.py
+++ b/rpython/rlib/rwin32.py
@@ -91,6 +91,8 @@
                        PROCESS_VM_OPERATION PROCESS_VM_READ
                        PROCESS_VM_WRITE
                        CTRL_C_EVENT CTRL_BREAK_EVENT
+                       MB_ERR_INVALID_CHARS ERROR_NO_UNICODE_TRANSLATION
+                       WC_NO_BEST_FIT_CHARS
                     """
         from rpython.translator.platform import host_factory
         static_platform = host_factory()
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -728,6 +728,30 @@
         self.checkencode(u"\N{GREEK CAPITAL LETTER PHI}", "mbcs") # a F
         self.checkencode(u"\N{GREEK CAPITAL LETTER PSI}", "mbcs") # a ?
 
+    def test_mbcs_decode_force_ignore(self):
+        if sys.platform != 'win32':
+            py.test.skip("mbcs encoding is win32-specific")
+
+        # XXX: requires a locale w/ a restrictive encoding to test
+        from rpython.rlib.rlocale import getdefaultlocale
+        if getdefaultlocale()[1] != 'cp932':
+            py.test.skip("requires cp932 locale")
+
+        s = '\xff\xf4\x8f\xbf\xbf'
+        encoder = self.getdecoder('mbcs')
+        assert encoder(s, len(s), 'strict') == (u'\U0010ffff', 5)
+        py.test.raises(UnicodeEncodeError, encoder, s, len(s), 'strict',
+                       force_ignore=False)
+
+    def test_mbcs_encode_force_replace(self):
+        if sys.platform != 'win32':
+            py.test.skip("mbcs encoding is win32-specific")
+        u = u'@test_2224_tmp-?L??\udc80'
+        encoder = self.getencoder('mbcs')
+        assert encoder(u, len(u), 'strict') == '@test_2224_tmp-?L???'
+        py.test.raises(UnicodeEncodeError, encoder, u, len(u), 'strict',
+                       force_replace=False)
+
     def test_encode_decimal(self):
         encoder = self.getencoder('decimal')
         assert encoder(u' 12, 34 ', 8, None) == ' 12, 34 '


More information about the pypy-commit mailing list