[Python-checkins] r51813 - in python/branches/release24-maint: Lib/test/test_codecencodings_cn.py Lib/test/test_multibytecodec.py Misc/NEWS Modules/cjkcodecs/_codecs_cn.c Modules/cjkcodecs/_codecs_iso2022.c Modules/cjkcodecs/cjkcodecs.h

hyeshik.chang python-checkins at python.org
Thu Sep 7 15:06:12 CEST 2006


Author: hyeshik.chang
Date: Thu Sep  7 15:06:10 2006
New Revision: 51813

Modified:
   python/branches/release24-maint/Lib/test/test_codecencodings_cn.py
   python/branches/release24-maint/Lib/test/test_multibytecodec.py
   python/branches/release24-maint/Misc/NEWS
   python/branches/release24-maint/Modules/cjkcodecs/_codecs_cn.c
   python/branches/release24-maint/Modules/cjkcodecs/_codecs_iso2022.c
   python/branches/release24-maint/Modules/cjkcodecs/cjkcodecs.h
Log:
Backport from trunk r51737:

Fixed a few bugs on cjkcodecs:
- gbk and gb18030 codec now handle U+30FB KATAKANA MIDDLE DOT correctly.
- iso2022_jp_2 codec now encodes into G0 for KS X 1001, GB2312
 codepoints to conform the standard.
- iso2022_jp_3 and iso2022_jp_2004 codec can encode JIS X 0213:2
 codepoints now.


Modified: python/branches/release24-maint/Lib/test/test_codecencodings_cn.py
==============================================================================
--- python/branches/release24-maint/Lib/test/test_codecencodings_cn.py	(original)
+++ python/branches/release24-maint/Lib/test/test_codecencodings_cn.py	Thu Sep  7 15:06:10 2006
@@ -33,6 +33,7 @@
         ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
         ("abc\x80\x80\xc1\xc4", "ignore",  u"abc\u804a"),
         ("\x83\x34\x83\x31", "strict", None),
+        (u"\u30fb", "strict", None),
     )
 
 class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
@@ -46,6 +47,7 @@
         ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
         ("abc\x80\x80\xc1\xc4", "ignore",  u"abc\u804a"),
         ("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"),
+        (u"\u30fb", "strict", "\x819\xa79"),
     )
     has_iso10646 = True
 

Modified: python/branches/release24-maint/Lib/test/test_multibytecodec.py
==============================================================================
--- python/branches/release24-maint/Lib/test/test_multibytecodec.py	(original)
+++ python/branches/release24-maint/Lib/test/test_multibytecodec.py	Thu Sep  7 15:06:10 2006
@@ -81,6 +81,12 @@
         uni = u':hu4:unit\xe9 de famille'
         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
 
+    def test_iso2022_jp_g0(self):
+        self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
+        for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
+            e = u'\u3406'.encode(encoding)
+            self.failIf(filter(lambda x: x >= '\x80', e))
+
 def test_main():
     suite = unittest.TestSuite()
     suite.addTest(unittest.makeSuite(Test_StreamWriter))

Modified: python/branches/release24-maint/Misc/NEWS
==============================================================================
--- python/branches/release24-maint/Misc/NEWS	(original)
+++ python/branches/release24-maint/Misc/NEWS	Thu Sep  7 15:06:10 2006
@@ -39,6 +39,12 @@
 
 - Patch #1488312, Fix memory alignment problem on SPARC in unicode
 
+- Fixed a few bugs on cjkcodecs:
+  - gbk and gb18030 codec now handle U+30FB KATAKANA MIDDLE DOT correctly.
+  - iso2022_jp_2 codec now encodes into G0 for KS X 1001, GB2312
+    codepoints to conform the standard.
+  - iso2022_jp_3 and iso2022_jp_2004 codec can encode JIS X 2013:2
+    codepoints now.
 
 Extension Modules
 -----------------

Modified: python/branches/release24-maint/Modules/cjkcodecs/_codecs_cn.c
==============================================================================
--- python/branches/release24-maint/Modules/cjkcodecs/_codecs_cn.c	(original)
+++ python/branches/release24-maint/Modules/cjkcodecs/_codecs_cn.c	Thu Sep  7 15:06:10 2006
@@ -16,14 +16,26 @@
 #undef hz
 #endif
 
-#define GBK_PREDECODE(dc1, dc2, assi) \
+/* GBK and GB2312 map differently in few codepoints that are listed below:
+ *
+ *		gb2312				gbk
+ * A1A4		U+30FB KATAKANA MIDDLE DOT	U+00B7 MIDDLE DOT
+ * A1AA		U+2015 HORIZONTAL BAR		U+2014 EM DASH
+ * A844		undefined			U+2015 HORIZONTAL BAR
+ */
+
+#define GBK_DECODE(dc1, dc2, assi) \
 	if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
 	else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
-	else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7;
-#define GBK_PREENCODE(code, assi) \
+	else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
+	else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
+	else TRYMAP_DEC(gbkext, assi, dc1, dc2);
+
+#define GBK_ENCODE(code, assi) \
 	if ((code) == 0x2014) (assi) = 0xa1aa; \
 	else if ((code) == 0x2015) (assi) = 0xa844; \
-	else if ((code) == 0x00b7) (assi) = 0xa1a4;
+	else if ((code) == 0x00b7) (assi) = 0xa1a4; \
+	else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
 
 /*
  * GB2312 codec
@@ -100,8 +112,7 @@
 
 		REQUIRE_OUTBUF(2)
 
-		GBK_PREENCODE(c, code)
-		else TRYMAP_ENC(gbcommon, code, c);
+		GBK_ENCODE(c, code)
 		else return 1;
 
 		OUT1((code >> 8) | 0x80)
@@ -130,9 +141,7 @@
 
 		REQUIRE_INBUF(2)
 
-		GBK_PREDECODE(c, IN2, **outbuf)
-		else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80);
-		else TRYMAP_DEC(gbkext, **outbuf, c, IN2);
+		GBK_DECODE(c, IN2, **outbuf)
 		else return 2;
 
 		NEXT(2, 1)
@@ -188,9 +197,7 @@
 
 		REQUIRE_OUTBUF(2)
 
-		GBK_PREENCODE(c, code)
-		else TRYMAP_ENC(gbcommon, code, c);
-		else TRYMAP_ENC(gb18030ext, code, c);
+		GBK_ENCODE(c, code)
 		else {
 			const struct _gb18030_to_unibmp_ranges *utrrange;
 
@@ -288,9 +295,7 @@
 			return 4;
 		}
 
-		GBK_PREDECODE(c, c2, **outbuf)
-		else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, c2 ^ 0x80);
-		else TRYMAP_DEC(gbkext, **outbuf, c, c2);
+		GBK_DECODE(c, c2, **outbuf)
 		else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
 		else return 2;
 

Modified: python/branches/release24-maint/Modules/cjkcodecs/_codecs_iso2022.c
==============================================================================
--- python/branches/release24-maint/Modules/cjkcodecs/_codecs_iso2022.c	(original)
+++ python/branches/release24-maint/Modules/cjkcodecs/_codecs_iso2022.c	Thu Sep  7 15:06:10 2006
@@ -855,7 +855,7 @@
 	if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
 		return coded;
 	else if (coded & 0x8000)
-		return coded;
+		return coded & 0x7fff;
 	else
 		return MAP_UNMAPPABLE;
 }
@@ -902,7 +902,7 @@
 	if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
 		return coded;
 	else if (coded & 0x8000)
-		return coded;
+		return coded & 0x7fff;
 	else
 		return MAP_UNMAPPABLE;
 }
@@ -993,7 +993,10 @@
 
 /*-*- registry tables -*-*/
 
-#define REGISTRY_KSX1001	{ CHARSET_KSX1001, 1, 2,		\
+#define REGISTRY_KSX1001_G0	{ CHARSET_KSX1001, 0, 2,		\
+				  ksx1001_init,				\
+				  ksx1001_decoder, ksx1001_encoder }
+#define REGISTRY_KSX1001_G1	{ CHARSET_KSX1001, 1, 2,		\
 				  ksx1001_init,				\
 				  ksx1001_decoder, ksx1001_encoder }
 #define REGISTRY_JISX0201_R	{ CHARSET_JISX0201_R, 0, 1,		\
@@ -1035,7 +1038,7 @@
 				  jisx0213_init,			\
 				  jisx0213_2004_2_decoder,		\
 				  jisx0213_2004_2_encoder }
-#define REGISTRY_GB2312		{ CHARSET_GB2312, 1, 2,			\
+#define REGISTRY_GB2312		{ CHARSET_GB2312, 0, 2,			\
 				  gb2312_init,				\
 				  gb2312_decoder, gb2312_encoder }
 #define REGISTRY_CNS11643_1	{ CHARSET_CNS11643_1, 1, 2,		\
@@ -1055,7 +1058,7 @@
 	};
 
 static const struct iso2022_designation iso2022_kr_designations[] = {
-	REGISTRY_KSX1001, REGISTRY_SENTINEL
+	REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
 };
 CONFIGDEF(kr, 0)
 
@@ -1072,7 +1075,7 @@
 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
 
 static const struct iso2022_designation iso2022_jp_2_designations[] = {
-	REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001,
+	REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
 	REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
 	REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
 };

Modified: python/branches/release24-maint/Modules/cjkcodecs/cjkcodecs.h
==============================================================================
--- python/branches/release24-maint/Modules/cjkcodecs/cjkcodecs.h	(original)
+++ python/branches/release24-maint/Modules/cjkcodecs/cjkcodecs.h	Thu Sep  7 15:06:10 2006
@@ -156,29 +156,32 @@
 #endif
 
 #define _TRYMAP_ENC(m, assi, val)				\
-	if ((m)->map != NULL && (val) >= (m)->bottom &&		\
+	((m)->map != NULL && (val) >= (m)->bottom &&		\
 	    (val)<= (m)->top && ((assi) = (m)->map[(val) -	\
 	    (m)->bottom]) != NOCHAR)
-#define TRYMAP_ENC(charset, assi, uni)				\
+#define TRYMAP_ENC_COND(charset, assi, uni)			\
 	_TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)
+#define TRYMAP_ENC(charset, assi, uni)				\
+	if TRYMAP_ENC_COND(charset, assi, uni)
+
 #define _TRYMAP_DEC(m, assi, val)				\
-	if ((m)->map != NULL && (val) >= (m)->bottom &&		\
+	((m)->map != NULL && (val) >= (m)->bottom &&		\
 	    (val)<= (m)->top && ((assi) = (m)->map[(val) -	\
 	    (m)->bottom]) != UNIINV)
 #define TRYMAP_DEC(charset, assi, c1, c2)			\
-	_TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
+	if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
 
 #define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val)	\
-	if ((m)->map != NULL && (val) >= (m)->bottom &&		\
+	((m)->map != NULL && (val) >= (m)->bottom &&		\
 	    (val)<= (m)->top &&					\
 	    ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \
 	    (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \
 	    (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1))
 #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni)	\
-	_TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \
+	if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \
 			   assplane, asshi, asslo, (uni) & 0xff)
 #define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2)		\
-	_TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2)
+	if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2)
 
 #if Py_UNICODE_SIZE == 2
 #define DECODE_SURROGATE(c)					\


More information about the Python-checkins mailing list