[pypy-commit] pypy default: add force_ignore/replace=True (py2 behavior) toggles to the mbcs functions. for py3k, which will specify =False
pjenvey
noreply at buildbot.pypy.org
Thu Apr 25 03:04:54 CEST 2013
Author: Philip Jenvey <pjenvey at underboss.org>
Branch:
Changeset: r63597:c70e9a0f576f
Date: 2013-04-24 17:58 -0700
http://bitbucket.org/pypy/pypy/changeset/c70e9a0f576f/
Log: add force_ignore / force_replace toggles (default True, i.e. the py2
behavior) to the mbcs functions. These are for py3k, which will specify =False
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1566,6 +1566,7 @@
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rlib import rwin32
CP_ACP = 0
+ BOOLP = lltype.Ptr(lltype.Array(rwin32.BOOL, hints={'nolength': True}))
MultiByteToWideChar = rffi.llexternal('MultiByteToWideChar',
[rffi.UINT, rwin32.DWORD,
@@ -1578,7 +1579,7 @@
[rffi.UINT, rwin32.DWORD,
rffi.CWCHARP, rffi.INT,
rwin32.LPCSTR, rffi.INT,
- rwin32.LPCSTR, rffi.VOIDP],
+ rwin32.LPCSTR, BOOLP],
rffi.INT,
calling_conv='win')
@@ -1586,12 +1587,31 @@
# XXX don't know how to test this
return False
- def str_decode_mbcs(s, size, errors, final=False, errorhandler=None):
+ def _decode_mbcs_error(s, errorhandler):
+ if rwin32.GetLastError() == rwin32.ERROR_NO_UNICODE_TRANSLATION:
+ msg = ("No mapping for the Unicode character exists in the target "
+ "multi-byte code page.")
+ errorhandler('strict', 'mbcs', msg, s, 0, 0)
+ else:
+ raise rwin32.lastWindowsError()
+
+ def str_decode_mbcs(s, size, errors, final=False, errorhandler=None,
+ force_ignore=True):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+
+ if not force_ignore and errors not in ('strict', 'ignore'):
+ msg = "mbcs encoding does not support errors='%s'" % errors
+ errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
if size == 0:
return u"", 0
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
+ if force_ignore or errors == 'ignore':
+ flags = 0
+ else:
+ # strict
+ flags = rwin32.MB_ERR_INVALID_CHARS
# Skip trailing lead-byte unless 'final' is set
if not final and is_dbcs_lead_byte(s[size-1]):
@@ -1599,37 +1619,68 @@
with rffi.scoped_nonmovingbuffer(s) as dataptr:
# first get the size of the result
- usize = MultiByteToWideChar(CP_ACP, 0,
+ usize = MultiByteToWideChar(CP_ACP, flags,
dataptr, size,
lltype.nullptr(rffi.CWCHARP.TO), 0)
if usize == 0:
- raise rwin32.lastWindowsError()
+ _decode_mbcs_error(s, errorhandler)
with rffi.scoped_alloc_unicodebuffer(usize) as buf:
# do the conversion
- if MultiByteToWideChar(CP_ACP, 0,
+ if MultiByteToWideChar(CP_ACP, flags,
dataptr, size, buf.raw, usize) == 0:
- raise rwin32.lastWindowsError()
+ _decode_mbcs_error(s, errorhandler)
return buf.str(usize), size
- def unicode_encode_mbcs(s, size, errors, errorhandler=None):
+ def unicode_encode_mbcs(s, size, errors, errorhandler=None,
+ force_replace=True):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
+
+ if not force_replace and errors not in ('strict', 'replace'):
+ msg = "mbcs encoding does not support errors='%s'" % errors
+ errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
if size == 0:
return ''
- with rffi.scoped_nonmoving_unicodebuffer(s) as dataptr:
- # first get the size of the result
- mbcssize = WideCharToMultiByte(CP_ACP, 0,
- dataptr, size, None, 0,
- None, None)
- if mbcssize == 0:
- raise rwin32.lastWindowsError()
- with rffi.scoped_alloc_buffer(mbcssize) as buf:
- # do the conversion
- if WideCharToMultiByte(CP_ACP, 0,
- dataptr, size, buf.raw, mbcssize,
- None, None) == 0:
+ if force_replace or errors == 'replace':
+ flags = 0
+ used_default_p = lltype.nullptr(BOOLP.TO)
+ else:
+ # strict
+ flags = rwin32.WC_NO_BEST_FIT_CHARS
+ used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
+ used_default_p[0] = rffi.cast(rwin32.BOOL, False)
+
+ try:
+ with rffi.scoped_nonmoving_unicodebuffer(s) as dataptr:
+ # first get the size of the result
+ mbcssize = WideCharToMultiByte(CP_ACP, flags,
+ dataptr, size, None, 0,
+ None, used_default_p)
+ if mbcssize == 0:
raise rwin32.lastWindowsError()
- return buf.str(mbcssize)
+ # If we used a default char, then we failed!
+ if (used_default_p and
+ rffi.cast(lltype.Bool, used_default_p[0])):
+ errorhandler('strict', 'mbcs', "invalid character",
+ s, 0, 0)
+
+ with rffi.scoped_alloc_buffer(mbcssize) as buf:
+ # do the conversion
+ if WideCharToMultiByte(CP_ACP, flags,
+ dataptr, size, buf.raw, mbcssize,
+ None, used_default_p) == 0:
+ raise rwin32.lastWindowsError()
+ if (used_default_p and
+ rffi.cast(lltype.Bool, used_default_p[0])):
+ errorhandler('strict', 'mbcs', "invalid character",
+ s, 0, 0)
+ return buf.str(mbcssize)
+ finally:
+ if used_default_p:
+ lltype.free(used_default_p, flavor='raw')
# ____________________________________________________________
# Decimal Encoder
diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py
--- a/rpython/rlib/rwin32.py
+++ b/rpython/rlib/rwin32.py
@@ -91,6 +91,8 @@
PROCESS_VM_OPERATION PROCESS_VM_READ
PROCESS_VM_WRITE
CTRL_C_EVENT CTRL_BREAK_EVENT
+ MB_ERR_INVALID_CHARS ERROR_NO_UNICODE_TRANSLATION
+ WC_NO_BEST_FIT_CHARS
"""
from rpython.translator.platform import host_factory
static_platform = host_factory()
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -728,6 +728,30 @@
self.checkencode(u"\N{GREEK CAPITAL LETTER PHI}", "mbcs") # a F
self.checkencode(u"\N{GREEK CAPITAL LETTER PSI}", "mbcs") # a ?
+ def test_mbcs_decode_force_ignore(self):
+ if sys.platform != 'win32':
+ py.test.skip("mbcs encoding is win32-specific")
+
+ # XXX: requires a locale w/ a restrictive encoding to test
+ from rpython.rlib.rlocale import getdefaultlocale
+ if getdefaultlocale()[1] != 'cp932':
+ py.test.skip("requires cp932 locale")
+
+ s = '\xff\xf4\x8f\xbf\xbf'
+ encoder = self.getdecoder('mbcs')
+ assert encoder(s, len(s), 'strict') == (u'\U0010ffff', 5)
+ py.test.raises(UnicodeEncodeError, encoder, s, len(s), 'strict',
+ force_ignore=False)
+
+ def test_mbcs_encode_force_replace(self):
+ if sys.platform != 'win32':
+ py.test.skip("mbcs encoding is win32-specific")
+ u = u'@test_2224_tmp-?L??\udc80'
+ encoder = self.getencoder('mbcs')
+ assert encoder(u, len(u), 'strict') == '@test_2224_tmp-?L???'
+ py.test.raises(UnicodeEncodeError, encoder, u, len(u), 'strict',
+ force_replace=False)
+
def test_encode_decimal(self):
encoder = self.getencoder('decimal')
assert encoder(u' 12, 34 ', 8, None) == ' 12, 34 '
More information about the pypy-commit
mailing list