[pypy-commit] pypy fastjson: handle surrogate pairs

antocuni noreply at buildbot.pypy.org
Tue Jun 25 19:01:45 CEST 2013


Author: Antonio Cuni <anto.cuni at gmail.com>
Branch: fastjson
Changeset: r64979:72939f271dd6
Date: 2013-06-25 18:37 +0200
http://bitbucket.org/pypy/pypy/changeset/72939f271dd6/

Log:	handle surrogate pairs

diff --git a/pypy/module/_fastjson/interp_decoder.py b/pypy/module/_fastjson/interp_decoder.py
--- a/pypy/module/_fastjson/interp_decoder.py
+++ b/pypy/module/_fastjson/interp_decoder.py
@@ -311,16 +311,29 @@
         i += 4
         hexdigits = self.getslice(start, i)
         try:
-            uchr = unichr(int(hexdigits, 16))
+            val = int(hexdigits, 16)
+            if val & 0xfc00 == 0xd800:
+                # surrogate pair
+                val = self.decode_surrogate_pair(i, val)
+                i += 6
         except ValueError:
             self._raise("Invalid \uXXXX escape (char %d)", i-1)
             return # help the annotator to know that we'll never go beyond
                    # this point
         #
+        uchr = unichr(val)
         utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
         builder.append(utf8_ch)
         return i
 
+    def decode_surrogate_pair(self, i, highsurr):
+        if self.ll_chars[i] != '\\' or self.ll_chars[i+1] != 'u':
+            self._raise("Unpaired high surrogate at char %d", i)
+        i += 2
+        hexdigits = self.getslice(i, i+4)
+        lowsurr = int(hexdigits, 16) # the possible ValueError is caught by the caller
+        return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
+
 
 def loads(space, w_s):
     if space.isinstance_w(w_s, space.w_unicode):
diff --git a/pypy/module/_fastjson/test/test__fastjson.py b/pypy/module/_fastjson/test/test__fastjson.py
--- a/pypy/module/_fastjson/test/test__fastjson.py
+++ b/pypy/module/_fastjson/test/test__fastjson.py
@@ -149,3 +149,8 @@
         raises(ValueError, "_fastjson.loads('[1: 2]')")
         raises(ValueError, "_fastjson.loads('[1, 2')")
 
+    def test_big_unicode_decode(self):
+        import _fastjson
+        expected = u'z\U0001d120x'
+        res = _fastjson.loads('"z\\ud834\\udd20x"')
+        assert res == expected


More information about the pypy-commit mailing list