[Python-checkins] cpython: Issue #17693: CJK encoders now use the new Unicode API (PEP 393)

victor.stinner python-checkins at python.org
Sun Apr 14 02:06:47 CEST 2013


http://hg.python.org/cpython/rev/d621bdaed7c3
changeset:   83317:d621bdaed7c3
user:        Victor Stinner <victor.stinner at gmail.com>
date:        Sun Apr 14 02:06:32 2013 +0200
summary:
  Issue #17693: CJK encoders now use the new Unicode API (PEP 393)

files:
  Modules/cjkcodecs/_codecs_cn.c      |  135 ++++++------
  Modules/cjkcodecs/_codecs_hk.c      |   44 ++-
  Modules/cjkcodecs/_codecs_iso2022.c |  113 ++++-----
  Modules/cjkcodecs/_codecs_jp.c      |  168 ++++++++-------
  Modules/cjkcodecs/_codecs_kr.c      |   98 ++++----
  Modules/cjkcodecs/_codecs_tw.c      |   44 ++-
  Modules/cjkcodecs/cjkcodecs.h       |   68 ++---
  Modules/cjkcodecs/multibytecodec.c  |  172 ++++++++-------
  Modules/cjkcodecs/multibytecodec.h  |    6 +-
  9 files changed, 430 insertions(+), 418 deletions(-)


diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -42,16 +42,18 @@
 
 ENCODER(gb2312)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
         TRYMAP_ENC(gbcommon, code, c);
@@ -60,9 +62,9 @@
         if (code & 0x8000) /* MSB set: GBK */
             return 1;
 
-        OUT1((code >> 8) | 0x80)
-        OUT2((code & 0xFF) | 0x80)
-        NEXT(1, 2)
+        OUTBYTE1((code >> 8) | 0x80)
+        OUTBYTE2((code & 0xFF) | 0x80)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -80,7 +82,7 @@
         }
 
         REQUIRE_INBUF(2)
-        TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 ^ 0x80) {
+        TRYMAP_DEC(gb2312, writer, c ^ 0x80, INBYTE2 ^ 0x80) {
             NEXT_IN(2);
         }
         else return 1;
@@ -96,28 +98,30 @@
 
 ENCODER(gbk)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
 
         GBK_ENCODE(c, code)
         else return 1;
 
-        OUT1((code >> 8) | 0x80)
+        OUTBYTE1((code >> 8) | 0x80)
         if (code & 0x8000)
-            OUT2((code & 0xFF)) /* MSB set: GBK */
+            OUTBYTE2((code & 0xFF)) /* MSB set: GBK */
         else
-            OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
-        NEXT(1, 2)
+            OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
+        NEXT(1, 2);
     }
 
     return 0;
@@ -126,7 +130,7 @@
 DECODER(gbk)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -136,7 +140,7 @@
 
         REQUIRE_INBUF(2)
 
-        GBK_DECODE(c, IN2, writer)
+        GBK_DECODE(c, INBYTE2, writer)
         else return 1;
 
         NEXT_IN(2);
@@ -152,41 +156,31 @@
 
 ENCODER(gb18030)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1(c)
-            NEXT(1, 1)
+            WRITEBYTE1(c)
+            NEXT(1, 1);
             continue;
         }
 
-        DECODE_SURROGATE(c)
-        if (c > 0x10FFFF)
-#if Py_UNICODE_SIZE == 2
-            return 2; /* surrogates pair */
-#else
-            return 1;
-#endif
-        else if (c >= 0x10000) {
+        if (c >= 0x10000) {
             Py_UCS4 tc = c - 0x10000;
+            assert (c <= 0x10FFFF);
 
             REQUIRE_OUTBUF(4)
 
-            OUT4((unsigned char)(tc % 10) + 0x30)
+            OUTBYTE4((unsigned char)(tc % 10) + 0x30)
             tc /= 10;
-            OUT3((unsigned char)(tc % 126) + 0x81)
+            OUTBYTE3((unsigned char)(tc % 126) + 0x81)
             tc /= 126;
-            OUT2((unsigned char)(tc % 10) + 0x30)
+            OUTBYTE2((unsigned char)(tc % 10) + 0x30)
             tc /= 10;
-            OUT1((unsigned char)(tc + 0x90))
+            OUTBYTE1((unsigned char)(tc + 0x90))
 
-#if Py_UNICODE_SIZE == 2
-            NEXT(2, 4) /* surrogates pair */
-#else
-            NEXT(1, 4)
-#endif
+            NEXT(1, 4);
             continue;
         }
 
@@ -209,15 +203,15 @@
                     tc = c - utrrange->first +
                          utrrange->base;
 
-                    OUT4((unsigned char)(tc % 10) + 0x30)
+                    OUTBYTE4((unsigned char)(tc % 10) + 0x30)
                     tc /= 10;
-                    OUT3((unsigned char)(tc % 126) + 0x81)
+                    OUTBYTE3((unsigned char)(tc % 126) + 0x81)
                     tc /= 126;
-                    OUT2((unsigned char)(tc % 10) + 0x30)
+                    OUTBYTE2((unsigned char)(tc % 10) + 0x30)
                     tc /= 10;
-                    OUT1((unsigned char)tc + 0x81)
+                    OUTBYTE1((unsigned char)tc + 0x81)
 
-                    NEXT(1, 4)
+                    NEXT(1, 4);
                     break;
                 }
 
@@ -226,13 +220,13 @@
             continue;
         }
 
-        OUT1((code >> 8) | 0x80)
+        OUTBYTE1((code >> 8) | 0x80)
         if (code & 0x8000)
-            OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
+            OUTBYTE2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
         else
-            OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
+            OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
 
-        NEXT(1, 2)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -241,7 +235,7 @@
 DECODER(gb18030)
 {
     while (inleft > 0) {
-        unsigned char c = IN1, c2;
+        unsigned char c = INBYTE1, c2;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -251,15 +245,15 @@
 
         REQUIRE_INBUF(2)
 
-        c2 = IN2;
+        c2 = INBYTE2;
         if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
             const struct _gb18030_to_unibmp_ranges *utr;
             unsigned char c3, c4;
             Py_UCS4 lseq;
 
             REQUIRE_INBUF(4)
-            c3 = IN3;
-            c4 = IN4;
+            c3 = INBYTE3;
+            c4 = INBYTE4;
             if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
                 return 1;
             c -= 0x81;  c2 -= 0x30;
@@ -313,33 +307,34 @@
 ENCODER_RESET(hz)
 {
     if (state->i != 0) {
-        WRITE2('~', '}')
+        WRITEBYTE2('~', '}')
         state->i = 0;
-        NEXT_OUT(2)
+        NEXT_OUT(2);
     }
     return 0;
 }
 
 ENCODER(hz)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
             if (state->i == 0) {
-                WRITE1((unsigned char)c)
-                NEXT(1, 1)
+                WRITEBYTE1((unsigned char)c)
+                NEXT(1, 1);
             }
             else {
-                WRITE3('~', '}', (unsigned char)c)
-                NEXT(1, 3)
+                WRITEBYTE3('~', '}', (unsigned char)c)
+                NEXT(1, 3);
                 state->i = 0;
             }
             continue;
         }
 
-        UCS4INVALID(c)
+        if (c > 0xFFFF)
+            return 1;
 
         TRYMAP_ENC(gbcommon, code, c);
         else return 1;
@@ -348,13 +343,13 @@
             return 1;
 
         if (state->i == 0) {
-            WRITE4('~', '{', code >> 8, code & 0xff)
-            NEXT(1, 4)
+            WRITEBYTE4('~', '{', code >> 8, code & 0xff)
+            NEXT(1, 4);
             state->i = 1;
         }
         else {
-            WRITE2(code >> 8, code & 0xff)
-            NEXT(1, 2)
+            WRITEBYTE2(code >> 8, code & 0xff)
+            NEXT(1, 2);
         }
     }
 
@@ -376,10 +371,10 @@
 DECODER(hz)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c == '~') {
-            unsigned char c2 = IN2;
+            unsigned char c2 = INBYTE2;
 
             REQUIRE_INBUF(2)
             if (c2 == '~') {
@@ -408,7 +403,7 @@
         }
         else { /* GB mode */
             REQUIRE_INBUF(2)
-            TRYMAP_DEC(gb2312, writer, c, IN2) {
+            TRYMAP_DEC(gb2312, writer, c, INBYTE2) {
                 NEXT_IN(2);
             }
             else
diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c
--- a/Modules/cjkcodecs/_codecs_hk.c
+++ b/Modules/cjkcodecs/_codecs_hk.c
@@ -38,35 +38,39 @@
 
 ENCODER(big5hkscs)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = **inbuf;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
         Py_ssize_t insize;
 
         if (c < 0x80) {
             REQUIRE_OUTBUF(1)
             **outbuf = (unsigned char)c;
-            NEXT(1, 1)
+            NEXT(1, 1);
             continue;
         }
 
-        DECODE_SURROGATE(c)
-        insize = GET_INSIZE(c);
-
+        insize = 1;
         REQUIRE_OUTBUF(2)
 
         if (c < 0x10000) {
             TRYMAP_ENC(big5hkscs_bmp, code, c) {
                 if (code == MULTIC) {
-                    if (inleft >= 2 &&
+                    Py_UCS4 c2;
+                    if (inlen - *inpos >= 2)
+                        c2 = INCHAR2;
+                    else
+                        c2 = 0;
+
+                    if (inlen - *inpos >= 2 &&
                         ((c & 0xffdf) == 0x00ca) &&
-                        (((*inbuf)[1] & 0xfff7) == 0x0304)) {
+                        ((c2 & 0xfff7) == 0x0304)) {
                         code = big5hkscs_pairenc_table[
                             ((c >> 4) |
-                             ((*inbuf)[1] >> 3)) & 3];
+                             (c2 >> 3)) & 3];
                         insize = 2;
                     }
-                    else if (inleft < 2 &&
+                    else if (inlen - *inpos < 2 &&
                              !(flags & MBENC_FLUSH))
                         return MBERR_TOOFEW;
                     else {
@@ -89,9 +93,9 @@
         else
             return insize;
 
-        OUT1(code >> 8)
-        OUT2(code & 0xFF)
-        NEXT(insize, 2)
+        OUTBYTE1(code >> 8)
+        OUTBYTE2(code & 0xFF)
+        NEXT(insize, 2);
     }
 
     return 0;
@@ -102,7 +106,7 @@
 DECODER(big5hkscs)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
         Py_UCS4 decoded;
 
         if (c < 0x80) {
@@ -113,20 +117,20 @@
 
         REQUIRE_INBUF(2)
 
-        if (0xc6 > c || c > 0xc8 || (c < 0xc7 && IN2 < 0xa1)) {
-            TRYMAP_DEC(big5, writer, c, IN2) {
+        if (0xc6 > c || c > 0xc8 || (c < 0xc7 && INBYTE2 < 0xa1)) {
+            TRYMAP_DEC(big5, writer, c, INBYTE2) {
                 NEXT_IN(2);
                 continue;
             }
         }
 
-        TRYMAP_DEC_CHAR(big5hkscs, decoded, c, IN2)
+        TRYMAP_DEC_CHAR(big5hkscs, decoded, c, INBYTE2)
         {
-            int s = BH2S(c, IN2);
+            int s = BH2S(c, INBYTE2);
             const unsigned char *hintbase;
 
             assert(0x87 <= c && c <= 0xfe);
-            assert(0x40 <= IN2 && IN2 <= 0xfe);
+            assert(0x40 <= INBYTE2 && INBYTE2 <= 0xfe);
 
             if (BH2S(0x87, 0x40) <= s && s <= BH2S(0xa0, 0xfe)) {
                     hintbase = big5hkscs_phint_0;
@@ -154,7 +158,7 @@
             continue;
         }
 
-        switch ((c << 8) | IN2) {
+        switch ((c << 8) | INBYTE2) {
         case 0x8862: OUTCHAR2(0x00ca, 0x0304); break;
         case 0x8864: OUTCHAR2(0x00ca, 0x030c); break;
         case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break;
diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c
--- a/Modules/cjkcodecs/_codecs_iso2022.c
+++ b/Modules/cjkcodecs/_codecs_iso2022.c
@@ -141,13 +141,13 @@
 ENCODER_RESET(iso2022)
 {
     if (STATE_GETFLAG(F_SHIFTED)) {
-        WRITE1(SI)
-        NEXT_OUT(1)
+        WRITEBYTE1(SI)
+        NEXT_OUT(1);
         STATE_CLEARFLAG(F_SHIFTED)
     }
     if (STATE_G0 != CHARSET_ASCII) {
-        WRITE3(ESC, '(', 'B')
-        NEXT_OUT(3)
+        WRITEBYTE3(ESC, '(', 'B')
+        NEXT_OUT(3);
         STATE_SETG0(CHARSET_ASCII)
     }
     return 0;
@@ -155,30 +155,29 @@
 
 ENCODER(iso2022)
 {
-    while (inleft > 0) {
+    while (*inpos < inlen) {
         const struct iso2022_designation *dsg;
         DBCHAR encoded;
-        Py_UCS4 c = **inbuf;
+        Py_UCS4 c = INCHAR1;
         Py_ssize_t insize;
 
         if (c < 0x80) {
             if (STATE_G0 != CHARSET_ASCII) {
-                WRITE3(ESC, '(', 'B')
+                WRITEBYTE3(ESC, '(', 'B')
                 STATE_SETG0(CHARSET_ASCII)
-                NEXT_OUT(3)
+                NEXT_OUT(3);
             }
             if (STATE_GETFLAG(F_SHIFTED)) {
-                WRITE1(SI)
+                WRITEBYTE1(SI)
                 STATE_CLEARFLAG(F_SHIFTED)
-                NEXT_OUT(1)
+                NEXT_OUT(1);
             }
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
 
-        DECODE_SURROGATE(c)
-        insize = GET_INSIZE(c);
+        insize = 1;
 
         encoded = MAP_UNMAPPABLE;
         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
@@ -187,24 +186,14 @@
             if (encoded == MAP_MULTIPLE_AVAIL) {
                 /* this implementation won't work for pair
                  * of non-bmp characters. */
-                if (inleft < 2) {
+                if (inlen - *inpos < 2) {
                     if (!(flags & MBENC_FLUSH))
                         return MBERR_TOOFEW;
                     length = -1;
                 }
                 else
                     length = 2;
-#if Py_UNICODE_SIZE == 2
-                if (length == 2) {
-                    Py_UCS4 u4in[2];
-                    u4in[0] = (Py_UCS4)IN1;
-                    u4in[1] = (Py_UCS4)IN2;
-                    encoded = dsg->encoder(u4in, &length);
-                } else
-                    encoded = dsg->encoder(&c, &length);
-#else
                 encoded = dsg->encoder(&c, &length);
-#endif
                 if (encoded != MAP_UNMAPPABLE) {
                     insize = length;
                     break;
@@ -221,47 +210,47 @@
         switch (dsg->plane) {
         case 0: /* G0 */
             if (STATE_GETFLAG(F_SHIFTED)) {
-                WRITE1(SI)
+                WRITEBYTE1(SI)
                 STATE_CLEARFLAG(F_SHIFTED)
-                NEXT_OUT(1)
+                NEXT_OUT(1);
             }
             if (STATE_G0 != dsg->mark) {
                 if (dsg->width == 1) {
-                    WRITE3(ESC, '(', ESCMARK(dsg->mark))
+                    WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark))
                     STATE_SETG0(dsg->mark)
-                    NEXT_OUT(3)
+                    NEXT_OUT(3);
                 }
                 else if (dsg->mark == CHARSET_JISX0208) {
-                    WRITE3(ESC, '$', ESCMARK(dsg->mark))
+                    WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark))
                     STATE_SETG0(dsg->mark)
-                    NEXT_OUT(3)
+                    NEXT_OUT(3);
                 }
                 else {
-                    WRITE4(ESC, '$', '(',
+                    WRITEBYTE4(ESC, '$', '(',
                         ESCMARK(dsg->mark))
                     STATE_SETG0(dsg->mark)
-                    NEXT_OUT(4)
+                    NEXT_OUT(4);
                 }
             }
             break;
         case 1: /* G1 */
             if (STATE_G1 != dsg->mark) {
                 if (dsg->width == 1) {
-                    WRITE3(ESC, ')', ESCMARK(dsg->mark))
+                    WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark))
                     STATE_SETG1(dsg->mark)
-                    NEXT_OUT(3)
+                    NEXT_OUT(3);
                 }
                 else {
-                    WRITE4(ESC, '$', ')',
+                    WRITEBYTE4(ESC, '$', ')',
                         ESCMARK(dsg->mark))
                     STATE_SETG1(dsg->mark)
-                    NEXT_OUT(4)
+                    NEXT_OUT(4);
                 }
             }
             if (!STATE_GETFLAG(F_SHIFTED)) {
-                WRITE1(SO)
+                WRITEBYTE1(SO)
                 STATE_SETFLAG(F_SHIFTED)
-                NEXT_OUT(1)
+                NEXT_OUT(1);
             }
             break;
         default: /* G2 and G3 is not supported: no encoding in
@@ -270,14 +259,14 @@
         }
 
         if (dsg->width == 1) {
-            WRITE1((unsigned char)encoded)
-            NEXT_OUT(1)
+            WRITEBYTE1((unsigned char)encoded)
+            NEXT_OUT(1);
         }
         else {
-            WRITE2(encoded >> 8, encoded & 0xff)
-            NEXT_OUT(2)
+            WRITEBYTE2(encoded >> 8, encoded & 0xff)
+            NEXT_OUT(2);
         }
-        NEXT_IN(insize);
+        NEXT_INCHAR(insize);
     }
 
     return 0;
@@ -323,26 +312,26 @@
 
     switch (esclen) {
     case 3:
-        if (IN2 == '$') {
-            charset = IN3 | CHARSET_DBCS;
+        if (INBYTE2 == '$') {
+            charset = INBYTE3 | CHARSET_DBCS;
             designation = 0;
         }
         else {
-            charset = IN3;
-            if (IN2 == '(') designation = 0;
-            else if (IN2 == ')') designation = 1;
-            else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
+            charset = INBYTE3;
+            if (INBYTE2 == '(') designation = 0;
+            else if (INBYTE2 == ')') designation = 1;
+            else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
                 designation = 2;
             else return 3;
         }
         break;
     case 4:
-        if (IN2 != '$')
+        if (INBYTE2 != '$')
             return 4;
 
-        charset = IN4 | CHARSET_DBCS;
-        if (IN3 == '(') designation = 0;
-        else if (IN3 == ')') designation = 1;
+        charset = INBYTE4 | CHARSET_DBCS;
+        if (INBYTE3 == '(') designation = 0;
+        else if (INBYTE3 == ')') designation = 1;
         else return 4;
         break;
     case 6: /* designation with prefix */
@@ -395,18 +384,18 @@
     /* not written to use encoder, decoder functions because only few
      * encodings use G2 designations in CJKCodecs */
     if (STATE_G2 == CHARSET_ISO8859_1) {
-        if (IN3 < 0x80)
-            OUTCHAR(IN3 + 0x80);
+        if (INBYTE3 < 0x80)
+            OUTCHAR(INBYTE3 + 0x80);
         else
             return 3;
     }
     else if (STATE_G2 == CHARSET_ISO8859_7) {
-        ISO8859_7_DECODE(IN3 ^ 0x80, writer)
+        ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
         else return 3;
     }
     else if (STATE_G2 == CHARSET_ASCII) {
-        if (IN3 & 0x80) return 3;
-        else OUTCHAR(IN3);
+        if (INBYTE3 & 0x80) return 3;
+        else OUTCHAR(INBYTE3);
     }
     else
         return MBERR_INTERNAL;
@@ -421,7 +410,7 @@
     const struct iso2022_designation *dsgcache = NULL;
 
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
         Py_ssize_t err;
 
         if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
@@ -438,13 +427,13 @@
         switch (c) {
         case ESC:
             REQUIRE_INBUF(2)
-            if (IS_ISO2022ESC(IN2)) {
+            if (IS_ISO2022ESC(INBYTE2)) {
                 err = iso2022processesc(config, state,
                                         inbuf, &inleft);
                 if (err != 0)
                     return err;
             }
-            else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
+            else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
                 REQUIRE_INBUF(3)
                 err = iso2022processg2(config, state,
                                        inbuf, &inleft, writer);
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -19,38 +19,39 @@
 
 ENCODER(cp932)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
         unsigned char c1, c2;
 
         if (c <= 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
         else if (c >= 0xff61 && c <= 0xff9f) {
-            WRITE1(c - 0xfec0)
-            NEXT(1, 1)
+            WRITEBYTE1(c - 0xfec0)
+            NEXT(1, 1);
             continue;
         }
         else if (c >= 0xf8f0 && c <= 0xf8f3) {
             /* Windows compatibility */
             REQUIRE_OUTBUF(1)
             if (c == 0xf8f0)
-                OUT1(0xa0)
+                OUTBYTE1(0xa0)
             else
-                OUT1(c - 0xfef1 + 0xfd)
-            NEXT(1, 1)
+                OUTBYTE1(c - 0xfef1 + 0xfd)
+            NEXT(1, 1);
             continue;
         }
 
-        UCS4INVALID(c)
+        if (c > 0xFFFF)
+            return 1;
         REQUIRE_OUTBUF(2)
 
         TRYMAP_ENC(cp932ext, code, c) {
-            OUT1(code >> 8)
-            OUT2(code & 0xff)
+            OUTBYTE1(code >> 8)
+            OUTBYTE2(code & 0xff)
         }
         else TRYMAP_ENC(jisxcommon, code, c) {
             if (code & 0x8000) /* MSB set: JIS X 0212 */
@@ -61,20 +62,20 @@
             c2 = code & 0xff;
             c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
             c1 = (c1 - 0x21) >> 1;
-            OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
-            OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
+            OUTBYTE1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
+            OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
         }
         else if (c >= 0xe000 && c < 0xe758) {
             /* User-defined area */
             c1 = (Py_UCS4)(c - 0xe000) / 188;
             c2 = (Py_UCS4)(c - 0xe000) % 188;
-            OUT1(c1 + 0xf0)
-            OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
+            OUTBYTE1(c1 + 0xf0)
+            OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
         }
         else
             return 1;
 
-        NEXT(1, 2)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -83,7 +84,7 @@
 DECODER(cp932)
 {
     while (inleft > 0) {
-        unsigned char c = IN1, c2;
+        unsigned char c = INBYTE1, c2;
 
         if (c <= 0x80) {
             OUTCHAR(c);
@@ -106,7 +107,7 @@
         }
 
         REQUIRE_INBUF(2)
-        c2 = IN2;
+        c2 = INBYTE2;
 
         TRYMAP_DEC(cp932ext, writer, c, c2);
         else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
@@ -145,25 +146,24 @@
 
 ENCODER(euc_jis_2004)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
         Py_ssize_t insize;
 
         if (c < 0x80) {
-            WRITE1(c)
-            NEXT(1, 1)
+            WRITEBYTE1(c)
+            NEXT(1, 1);
             continue;
         }
 
-        DECODE_SURROGATE(c)
-        insize = GET_INSIZE(c);
+        insize = 1;
 
         if (c <= 0xFFFF) {
             EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
             else TRYMAP_ENC(jisx0213_bmp, code, c) {
                 if (code == MULTIC) {
-                    if (inleft < 2) {
+                    if (inlen - *inpos < 2) {
                         if (flags & MBENC_FLUSH) {
                             code = find_pairencmap(
                                 (ucs2_t)c, 0,
@@ -176,8 +176,9 @@
                             return MBERR_TOOFEW;
                     }
                     else {
+                        Py_UCS4 c2 = INCHAR2;
                         code = find_pairencmap(
-                            (ucs2_t)c, (*inbuf)[1],
+                            (ucs2_t)c, c2,
                             jisx0213_pair_encmap,
                             JISX0213_ENCPAIRS);
                         if (code == DBCINV) {
@@ -195,8 +196,8 @@
             else TRYMAP_ENC(jisxcommon, code, c);
             else if (c >= 0xff61 && c <= 0xff9f) {
                 /* JIS X 0201 half-width katakana */
-                WRITE2(0x8e, c - 0xfec0)
-                NEXT(1, 2)
+                WRITEBYTE2(0x8e, c - 0xfec0)
+                NEXT(1, 2);
                 continue;
             }
             else if (c == 0xff3c)
@@ -218,12 +219,12 @@
 
         if (code & 0x8000) {
             /* Codeset 2 */
-            WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
-            NEXT(insize, 3)
+            WRITEBYTE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
+            NEXT(insize, 3);
         } else {
             /* Codeset 1 */
-            WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
-            NEXT(insize, 2)
+            WRITEBYTE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
+            NEXT(insize, 2);
         }
     }
 
@@ -233,7 +234,7 @@
 DECODER(euc_jis_2004)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
         Py_UCS4 code;
 
         if (c < 0x80) {
@@ -247,7 +248,7 @@
             unsigned char c2;
 
             REQUIRE_INBUF(2)
-            c2 = IN2;
+            c2 = INBYTE2;
             if (c2 >= 0xa1 && c2 <= 0xdf) {
                 OUTCHAR(0xfec0 + c2);
                 NEXT_IN(2);
@@ -259,8 +260,8 @@
             unsigned char c2, c3;
 
             REQUIRE_INBUF(3)
-            c2 = IN2 ^ 0x80;
-            c3 = IN3 ^ 0x80;
+            c2 = INBYTE2 ^ 0x80;
+            c3 = INBYTE3 ^ 0x80;
 
             /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
             EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c2, c3)
@@ -279,7 +280,7 @@
 
             REQUIRE_INBUF(2)
             c ^= 0x80;
-            c2 = IN2 ^ 0x80;
+            c2 = INBYTE2 ^ 0x80;
 
             /* JIS X 0213 Plane 1 */
             EMULATE_JISX0213_2000_DECODE_PLANE1(writer, c, c2)
@@ -312,35 +313,36 @@
 
 ENCODER(euc_jp)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
 
-        UCS4INVALID(c)
+        if (c > 0xFFFF)
+            return 1;
 
         TRYMAP_ENC(jisxcommon, code, c);
         else if (c >= 0xff61 && c <= 0xff9f) {
             /* JIS X 0201 half-width katakana */
-            WRITE2(0x8e, c - 0xfec0)
-            NEXT(1, 2)
+            WRITEBYTE2(0x8e, c - 0xfec0)
+            NEXT(1, 2);
             continue;
         }
 #ifndef STRICT_BUILD
         else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
             code = 0x2140;
         else if (c == 0xa5) { /* YEN SIGN */
-            WRITE1(0x5c);
-            NEXT(1, 1)
+            WRITEBYTE1(0x5c);
+            NEXT(1, 1);
             continue;
         } else if (c == 0x203e) { /* OVERLINE */
-            WRITE1(0x7e);
-            NEXT(1, 1)
+            WRITEBYTE1(0x7e);
+            NEXT(1, 1);
             continue;
         }
 #endif
@@ -349,12 +351,12 @@
 
         if (code & 0x8000) {
             /* JIS X 0212 */
-            WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
-            NEXT(1, 3)
+            WRITEBYTE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
+            NEXT(1, 3);
         } else {
             /* JIS X 0208 */
-            WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
-            NEXT(1, 2)
+            WRITEBYTE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
+            NEXT(1, 2);
         }
     }
 
@@ -364,7 +366,7 @@
 DECODER(euc_jp)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -377,7 +379,7 @@
             unsigned char c2;
 
             REQUIRE_INBUF(2)
-            c2 = IN2;
+            c2 = INBYTE2;
             if (c2 >= 0xa1 && c2 <= 0xdf) {
                 OUTCHAR(0xfec0 + c2);
                 NEXT_IN(2);
@@ -389,8 +391,8 @@
             unsigned char c2, c3;
 
             REQUIRE_INBUF(3)
-            c2 = IN2;
-            c3 = IN3;
+            c2 = INBYTE2;
+            c3 = INBYTE3;
             /* JIS X 0212 */
             TRYMAP_DEC(jisx0212, writer, c2 ^ 0x80, c3 ^ 0x80) {
                 NEXT_IN(3);
@@ -402,7 +404,7 @@
             unsigned char c2;
 
             REQUIRE_INBUF(2)
-            c2 = IN2;
+            c2 = INBYTE2;
             /* JIS X 0208 */
 #ifndef STRICT_BUILD
             if (c == 0xa1 && c2 == 0xc0)
@@ -427,8 +429,8 @@
 
 ENCODER(shift_jis)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
         unsigned char c1, c2;
 
@@ -440,14 +442,16 @@
         else if (c == 0x203e) code = 0x7e; /* OVERLINE */
 #endif
         else JISX0201_K_ENCODE(c, code)
-        else UCS4INVALID(c)
-        else code = NOCHAR;
+        else if (c > 0xFFFF)
+            return 1;
+        else
+            code = NOCHAR;
 
         if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
             REQUIRE_OUTBUF(1)
 
-            OUT1((unsigned char)code)
-            NEXT(1, 1)
+            OUTBYTE1((unsigned char)code)
+            NEXT(1, 1);
             continue;
         }
 
@@ -470,9 +474,9 @@
         c2 = code & 0xff;
         c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
         c1 = (c1 - 0x21) >> 1;
-        OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
-        OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
-        NEXT(1, 2)
+        OUTBYTE1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
+        OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -481,7 +485,7 @@
 DECODER(shift_jis)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
 #ifdef STRICT_BUILD
         JISX0201_R_DECODE(c, writer)
@@ -493,7 +497,7 @@
             unsigned char c1, c2;
 
             REQUIRE_INBUF(2)
-            c2 = IN2;
+            c2 = INBYTE2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
                 return 1;
 
@@ -533,30 +537,29 @@
 
 ENCODER(shift_jis_2004)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code = NOCHAR;
         int c1, c2;
         Py_ssize_t insize;
 
         JISX0201_ENCODE(c, code)
-        else DECODE_SURROGATE(c)
 
         if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
-            WRITE1((unsigned char)code)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)code)
+            NEXT(1, 1);
             continue;
         }
 
         REQUIRE_OUTBUF(2)
-        insize = GET_INSIZE(c);
+        insize = 1;
 
         if (code == NOCHAR) {
             if (c <= 0xffff) {
                 EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
                 else TRYMAP_ENC(jisx0213_bmp, code, c) {
                     if (code == MULTIC) {
-                        if (inleft < 2) {
+                        if (inlen - *inpos < 2) {
                             if (flags & MBENC_FLUSH) {
                             code = find_pairencmap
                                 ((ucs2_t)c, 0,
@@ -569,8 +572,9 @@
                                 return MBERR_TOOFEW;
                         }
                         else {
+                            Py_UCS4 ch2 = INCHAR2;
                             code = find_pairencmap(
-                                (ucs2_t)c, IN2,
+                                (ucs2_t)c, ch2,
                               jisx0213_pair_encmap,
                                 JISX0213_ENCPAIRS);
                             if (code == DBCINV) {
@@ -615,10 +619,10 @@
 
         if (c1 & 1) c2 += 0x5e;
         c1 >>= 1;
-        OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
-        OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
+        OUTBYTE1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
+        OUTBYTE2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
 
-        NEXT(insize, 2)
+        NEXT(insize, 2);
     }
 
     return 0;
@@ -627,7 +631,7 @@
 DECODER(shift_jis_2004)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         JISX0201_DECODE(c, writer)
         else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
@@ -635,7 +639,7 @@
             Py_UCS4 code;
 
             REQUIRE_INBUF(2)
-            c2 = IN2;
+            c2 = INBYTE2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
                 return 1;
 
diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c
--- a/Modules/cjkcodecs/_codecs_kr.c
+++ b/Modules/cjkcodecs/_codecs_kr.c
@@ -33,16 +33,18 @@
 
 ENCODER(euc_kr)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
         TRYMAP_ENC(cp949, code, c);
@@ -50,9 +52,9 @@
 
         if ((code & 0x8000) == 0) {
             /* KS X 1001 coded character */
-            OUT1((code >> 8) | 0x80)
-            OUT2((code & 0xFF) | 0x80)
-            NEXT(1, 2)
+            OUTBYTE1((code >> 8) | 0x80)
+            OUTBYTE2((code & 0xFF) | 0x80)
+            NEXT(1, 2);
         }
         else {          /* Mapping is found in CP949 extension,
                  * but we encode it in KS X 1001:1998 Annex 3,
@@ -61,23 +63,23 @@
             REQUIRE_OUTBUF(8)
 
             /* syllable composition precedence */
-            OUT1(EUCKR_JAMO_FIRSTBYTE)
-            OUT2(EUCKR_JAMO_FILLER)
+            OUTBYTE1(EUCKR_JAMO_FIRSTBYTE)
+            OUTBYTE2(EUCKR_JAMO_FILLER)
 
             /* All codepoints in CP949 extension are in unicode
              * Hangul Syllable area. */
             assert(0xac00 <= c && c <= 0xd7a3);
             c -= 0xac00;
 
-            OUT3(EUCKR_JAMO_FIRSTBYTE)
-            OUT4(u2cgk_choseong[c / 588])
-            NEXT_OUT(4)
+            OUTBYTE3(EUCKR_JAMO_FIRSTBYTE)
+            OUTBYTE4(u2cgk_choseong[c / 588])
+            NEXT_OUT(4);
 
-            OUT1(EUCKR_JAMO_FIRSTBYTE)
-            OUT2(u2cgk_jungseong[(c / 28) % 21])
-            OUT3(EUCKR_JAMO_FIRSTBYTE)
-            OUT4(u2cgk_jongseong[c % 28])
-            NEXT(1, 4)
+            OUTBYTE1(EUCKR_JAMO_FIRSTBYTE)
+            OUTBYTE2(u2cgk_jungseong[(c / 28) % 21])
+            OUTBYTE3(EUCKR_JAMO_FIRSTBYTE)
+            OUTBYTE4(u2cgk_jongseong[c % 28])
+            NEXT(1, 4);
         }
     }
 
@@ -102,7 +104,7 @@
 DECODER(euc_kr)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -113,7 +115,7 @@
         REQUIRE_INBUF(2)
 
         if (c == EUCKR_JAMO_FIRSTBYTE &&
-            IN2 == EUCKR_JAMO_FILLER) {
+            INBYTE2 == EUCKR_JAMO_FILLER) {
             /* KS X 1001:1998 Annex 3 make-up sequence */
             DBCHAR cho, jung, jong;
 
@@ -146,7 +148,7 @@
             OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
             NEXT_IN(8);
         }
-        else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80) {
+        else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, INBYTE2 ^ 0x80) {
             NEXT_IN(2);
         }
         else
@@ -164,27 +166,29 @@
 
 ENCODER(cp949)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
         TRYMAP_ENC(cp949, code, c);
         else return 1;
 
-        OUT1((code >> 8) | 0x80)
+        OUTBYTE1((code >> 8) | 0x80)
         if (code & 0x8000)
-            OUT2(code & 0xFF) /* MSB set: CP949 */
+            OUTBYTE2(code & 0xFF) /* MSB set: CP949 */
         else
-            OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
-        NEXT(1, 2)
+            OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
+        NEXT(1, 2);
     }
 
     return 0;
@@ -193,7 +197,7 @@
 DECODER(cp949)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -202,8 +206,8 @@
         }
 
         REQUIRE_INBUF(2)
-        TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80);
-        else TRYMAP_DEC(cp949ext, writer, c, IN2);
+        TRYMAP_DEC(ksx1001, writer, c ^ 0x80, INBYTE2 ^ 0x80);
+        else TRYMAP_DEC(cp949ext, writer, c, INBYTE2);
         else return 1;
 
         NEXT_IN(2);
@@ -246,16 +250,18 @@
 
 ENCODER(johab)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
 
@@ -281,9 +287,9 @@
                 t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
                           (c1 - 0x21 + 0x197));
                 t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
-                OUT1(t1 >> 1)
-                OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
-                NEXT(1, 2)
+                OUTBYTE1(t1 >> 1)
+                OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
+                NEXT(1, 2);
                 continue;
             }
             else
@@ -292,9 +298,9 @@
         else
             return 1;
 
-        OUT1(code >> 8)
-        OUT2(code & 0xff)
-        NEXT(1, 2)
+        OUTBYTE1(code >> 8)
+        OUTBYTE2(code & 0xff)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -344,7 +350,7 @@
 DECODER(johab)
 {
     while (inleft > 0) {
-        unsigned char    c = IN1, c2;
+        unsigned char    c = INBYTE1, c2;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -353,7 +359,7 @@
         }
 
         REQUIRE_INBUF(2)
-        c2 = IN2;
+        c2 = INBYTE2;
 
         if (c < 0xd8) {
             /* johab hangul */
diff --git a/Modules/cjkcodecs/_codecs_tw.c b/Modules/cjkcodecs/_codecs_tw.c
--- a/Modules/cjkcodecs/_codecs_tw.c
+++ b/Modules/cjkcodecs/_codecs_tw.c
@@ -13,26 +13,28 @@
 
 ENCODER(big5)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = **inbuf;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
             REQUIRE_OUTBUF(1)
             **outbuf = (unsigned char)c;
-            NEXT(1, 1)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
 
         TRYMAP_ENC(big5, code, c);
         else return 1;
 
-        OUT1(code >> 8)
-        OUT2(code & 0xFF)
-        NEXT(1, 2)
+        OUTBYTE1(code >> 8)
+        OUTBYTE2(code & 0xFF)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -41,7 +43,7 @@
 DECODER(big5)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -50,7 +52,7 @@
         }
 
         REQUIRE_INBUF(2)
-        TRYMAP_DEC(big5, writer, c, IN2) {
+        TRYMAP_DEC(big5, writer, c, INBYTE2) {
             NEXT_IN(2);
         }
         else return 1;
@@ -66,25 +68,27 @@
 
 ENCODER(cp950)
 {
-    while (inleft > 0) {
-        Py_UCS4 c = IN1;
+    while (*inpos < inlen) {
+        Py_UCS4 c = INCHAR1;
         DBCHAR code;
 
         if (c < 0x80) {
-            WRITE1((unsigned char)c)
-            NEXT(1, 1)
+            WRITEBYTE1((unsigned char)c)
+            NEXT(1, 1);
             continue;
         }
-        UCS4INVALID(c)
+
+        if (c > 0xFFFF)
+            return 1;
 
         REQUIRE_OUTBUF(2)
         TRYMAP_ENC(cp950ext, code, c);
         else TRYMAP_ENC(big5, code, c);
         else return 1;
 
-        OUT1(code >> 8)
-        OUT2(code & 0xFF)
-        NEXT(1, 2)
+        OUTBYTE1(code >> 8)
+        OUTBYTE2(code & 0xFF)
+        NEXT(1, 2);
     }
 
     return 0;
@@ -93,7 +97,7 @@
 DECODER(cp950)
 {
     while (inleft > 0) {
-        unsigned char c = IN1;
+        unsigned char c = INBYTE1;
 
         if (c < 0x80) {
             OUTCHAR(c);
@@ -103,8 +107,8 @@
 
         REQUIRE_INBUF(2)
 
-        TRYMAP_DEC(cp950ext, writer, c, IN2);
-        else TRYMAP_DEC(big5, writer, c, IN2);
+        TRYMAP_DEC(cp950ext, writer, c, INBYTE2);
+        else TRYMAP_DEC(big5, writer, c, INBYTE2);
         else return 1;
 
         NEXT_IN(2);
diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h
--- a/Modules/cjkcodecs/cjkcodecs.h
+++ b/Modules/cjkcodecs/cjkcodecs.h
@@ -72,7 +72,8 @@
 #define ENCODER(encoding)                                               \
     static Py_ssize_t encoding##_encode(                                \
         MultibyteCodec_State *state, const void *config,                \
-        const Py_UNICODE **inbuf, Py_ssize_t inleft,                    \
+        int kind, void *data,                          \
+        Py_ssize_t *inpos, Py_ssize_t inlen,                            \
         unsigned char **outbuf, Py_ssize_t outleft, int flags)
 #define ENCODER_RESET(encoding)                                         \
     static Py_ssize_t encoding##_encode_reset(                          \
@@ -91,25 +92,25 @@
     static Py_ssize_t encoding##_decode_reset(                          \
         MultibyteCodec_State *state, const void *config)
 
-#if Py_UNICODE_SIZE == 4
-#define UCS4INVALID(code)       \
-    if ((code) > 0xFFFF)        \
-    return 1;
-#else
-#define UCS4INVALID(code)       \
-    if (0) ;
-#endif
-
 #define NEXT_IN(i)                              \
     do {                                        \
         (*inbuf) += (i);                        \
         (inleft) -= (i);                        \
     } while (0)
+#define NEXT_INCHAR(i)                          \
+    do {                                        \
+        (*inpos) += (i);                        \
+    } while (0)
 #define NEXT_OUT(o)                             \
-    (*outbuf) += (o);                           \
-    (outleft) -= (o);
+    do {                                        \
+        (*outbuf) += (o);                       \
+        (outleft) -= (o);                       \
+    } while (0)
 #define NEXT(i, o)                              \
-    NEXT_IN(i); NEXT_OUT(o)
+    do {                                        \
+        NEXT_INCHAR(i);                        \
+        NEXT_OUT(o);                        \
+    } while (0)
 
 #define REQUIRE_INBUF(n)                        \
     if (inleft < (n))                           \
@@ -118,10 +119,13 @@
     if (outleft < (n))                          \
         return MBERR_TOOSMALL;
 
-#define IN1 ((*inbuf)[0])
-#define IN2 ((*inbuf)[1])
-#define IN3 ((*inbuf)[2])
-#define IN4 ((*inbuf)[3])
+#define INBYTE1 ((*inbuf)[0])
+#define INBYTE2 ((*inbuf)[1])
+#define INBYTE3 ((*inbuf)[2])
+#define INBYTE4 ((*inbuf)[3])
+
+#define INCHAR1 PyUnicode_READ(kind, data, *inpos)
+#define INCHAR2 PyUnicode_READ(kind, data, *inpos + 1)
 
 #define OUTCHAR(c)                                                         \
     do {                                                                   \
@@ -140,24 +144,24 @@
         writer->pos += 2;                                                  \
     } while (0)
 
-#define OUT1(c) ((*outbuf)[0]) = (c);
-#define OUT2(c) ((*outbuf)[1]) = (c);
-#define OUT3(c) ((*outbuf)[2]) = (c);
-#define OUT4(c) ((*outbuf)[3]) = (c);
+#define OUTBYTE1(c) ((*outbuf)[0]) = (c);
+#define OUTBYTE2(c) ((*outbuf)[1]) = (c);
+#define OUTBYTE3(c) ((*outbuf)[2]) = (c);
+#define OUTBYTE4(c) ((*outbuf)[3]) = (c);
 
-#define WRITE1(c1)              \
+#define WRITEBYTE1(c1)              \
     REQUIRE_OUTBUF(1)           \
     (*outbuf)[0] = (c1);
-#define WRITE2(c1, c2)          \
+#define WRITEBYTE2(c1, c2)          \
     REQUIRE_OUTBUF(2)           \
     (*outbuf)[0] = (c1);        \
     (*outbuf)[1] = (c2);
-#define WRITE3(c1, c2, c3)      \
+#define WRITEBYTE3(c1, c2, c3)      \
     REQUIRE_OUTBUF(3)           \
     (*outbuf)[0] = (c1);        \
     (*outbuf)[1] = (c2);        \
     (*outbuf)[2] = (c3);
-#define WRITE4(c1, c2, c3, c4)  \
+#define WRITEBYTE4(c1, c2, c3, c4)  \
     REQUIRE_OUTBUF(4)           \
     (*outbuf)[0] = (c1);        \
     (*outbuf)[1] = (c2);        \
@@ -209,20 +213,6 @@
 #define TRYMAP_DEC_MPLANE(charset, writer, plane, c1, c2)         \
     if _TRYMAP_DEC(&charset##_decmap[plane][c1], writer, c2)
 
-#if Py_UNICODE_SIZE == 2
-#define DECODE_SURROGATE(c)                                     \
-    if (Py_UNICODE_IS_HIGH_SURROGATE(c)) {                      \
-        REQUIRE_INBUF(2)                                        \
-        if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) {                 \
-            c = Py_UNICODE_JOIN_SURROGATES(c, IN2);             \
-        }                                                       \
-    }
-#define GET_INSIZE(c)   ((c) > 0xffff ? 2 : 1)
-#else
-#define DECODE_SURROGATE(c) {;}
-#define GET_INSIZE(c)   1
-#endif
-
 #define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = {
 #define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL},
 #define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap},
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c
--- a/Modules/cjkcodecs/multibytecodec.c
+++ b/Modules/cjkcodecs/multibytecodec.c
@@ -10,7 +10,8 @@
 #include "multibytecodec.h"
 
 typedef struct {
-    const Py_UNICODE    *inbuf, *inbuf_top, *inbuf_end;
+    PyObject            *inobj;
+    Py_ssize_t          inpos, inlen;
     unsigned char       *outbuf, *outbuf_end;
     PyObject            *excobj, *outobj;
 } MultibyteEncodeBuffer;
@@ -45,7 +46,7 @@
 static char *streamkwarglist[] = {"stream", "errors", NULL};
 
 static PyObject *multibytecodec_encode(MultibyteCodec *,
-                MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t,
+                MultibyteCodec_State *, PyObject *, Py_ssize_t *,
                 PyObject *, int);
 
 #define MBENC_RESET     MBENC_MAX<<1 /* reset after an encoding session */
@@ -224,7 +225,7 @@
             return 0; /* retry it */
         case MBERR_TOOFEW:
             reason = "incomplete multibyte sequence";
-            esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
+            esize = (Py_ssize_t)buf->inpos;
             break;
         case MBERR_INTERNAL:
             PyErr_SetString(PyExc_RuntimeError,
@@ -238,14 +239,24 @@
     }
 
     if (errors == ERROR_REPLACE) {
-        const Py_UNICODE replchar = '?', *inbuf = &replchar;
+        PyObject *replchar;
         Py_ssize_t r;
+        Py_ssize_t inpos;
+        int kind;
+        void *data;
 
+        replchar = PyUnicode_FromOrdinal('?');
+        if (replchar == NULL)
+            goto errorexit;
+        kind = PyUnicode_KIND(replchar);
+        data = PyUnicode_DATA(replchar);
+
+        inpos = 0;
         for (;;) {
-            Py_ssize_t outleft;
+            Py_ssize_t outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf);
 
-            outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf);
-            r = codec->encode(state, codec->config, &inbuf, 1,
+            r = codec->encode(state, codec->config,
+                              kind, data, &inpos, 1,
                               &buf->outbuf, outleft, 0);
             if (r == MBERR_TOOSMALL) {
                 REQUIRE_ENCODEBUFFER(buf, -1);
@@ -255,25 +266,27 @@
                 break;
         }
 
+        Py_DECREF(replchar);
+
         if (r != 0) {
             REQUIRE_ENCODEBUFFER(buf, 1);
             *buf->outbuf++ = '?';
         }
     }
     if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) {
-        buf->inbuf += esize;
+        buf->inpos += esize;
         return 0;
     }
 
-    start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top);
+    start = (Py_ssize_t)buf->inpos;
     end = start + esize;
 
     /* use cached exception object if available */
     if (buf->excobj == NULL) {
-        buf->excobj = PyUnicodeEncodeError_Create(codec->encoding,
-                        buf->inbuf_top,
-                        buf->inbuf_end - buf->inbuf_top,
-                        start, end, reason);
+        buf->excobj =  PyObject_CallFunction(PyExc_UnicodeEncodeError,
+                                             "sOnns",
+                                             codec->encoding, buf->inobj,
+                                             start, end, reason);
         if (buf->excobj == NULL)
             goto errorexit;
     }
@@ -302,10 +315,10 @@
     }
 
     if (PyUnicode_Check(tobj)) {
-        const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj);
+        Py_ssize_t inpos;
 
-        retstr = multibytecodec_encode(codec, state, &uraw,
-                        PyUnicode_GET_SIZE(tobj), ERROR_STRICT,
+        retstr = multibytecodec_encode(codec, state, tobj,
+                        &inpos, ERROR_STRICT,
                         MBENC_FLUSH);
         if (retstr == NULL)
             goto errorexit;
@@ -324,15 +337,15 @@
 
     newpos = PyLong_AsSsize_t(PyTuple_GET_ITEM(retobj, 1));
     if (newpos < 0 && !PyErr_Occurred())
-        newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top);
-    if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) {
+        newpos += (Py_ssize_t)buf->inlen;
+    if (newpos < 0 || newpos > buf->inlen) {
         PyErr_Clear();
         PyErr_Format(PyExc_IndexError,
                      "position %zd from error handler out of bounds",
                      newpos);
         goto errorexit;
     }
-    buf->inbuf = buf->inbuf_top + newpos;
+    buf->inpos = newpos;
 
     Py_DECREF(retobj);
     Py_DECREF(retstr);
@@ -449,19 +462,29 @@
 static PyObject *
 multibytecodec_encode(MultibyteCodec *codec,
                       MultibyteCodec_State *state,
-                      const Py_UNICODE **data, Py_ssize_t datalen,
+                      PyObject *text, Py_ssize_t *inpos_t,
                       PyObject *errors, int flags)
 {
     MultibyteEncodeBuffer buf;
     Py_ssize_t finalsize, r = 0;
+    Py_ssize_t datalen;
+    int kind;
+    void *data;
+
+    if (PyUnicode_READY(text) < 0)
+        return NULL;
+    datalen = PyUnicode_GET_LENGTH(text);
 
     if (datalen == 0 && !(flags & MBENC_RESET))
         return PyBytes_FromStringAndSize(NULL, 0);
 
     buf.excobj = NULL;
     buf.outobj = NULL;
-    buf.inbuf = buf.inbuf_top = *data;
-    buf.inbuf_end = buf.inbuf_top + datalen;
+    buf.inobj = text;   /* borrowed reference */
+    buf.inpos = 0;
+    buf.inlen = datalen;
+    kind = PyUnicode_KIND(buf.inobj);
+    data = PyUnicode_DATA(buf.inobj);
 
     if (datalen > (PY_SSIZE_T_MAX - 16) / 2) {
         PyErr_NoMemory();
@@ -474,14 +497,14 @@
     buf.outbuf = (unsigned char *)PyBytes_AS_STRING(buf.outobj);
     buf.outbuf_end = buf.outbuf + PyBytes_GET_SIZE(buf.outobj);
 
-    while (buf.inbuf < buf.inbuf_end) {
-        Py_ssize_t inleft, outleft;
-
+    while (buf.inpos < buf.inlen) {
         /* we don't reuse inleft and outleft here.
          * error callbacks can relocate the cursor anywhere on buffer*/
-        inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf);
-        outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf);
-        r = codec->encode(state, codec->config, &buf.inbuf, inleft,
+        Py_ssize_t outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf);
+
+        r = codec->encode(state, codec->config,
+                          kind, data,
+                          &buf.inpos, buf.inlen,
                           &buf.outbuf, outleft, flags);
         if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH)))
             break;
@@ -512,7 +535,8 @@
         if (_PyBytes_Resize(&buf.outobj, finalsize) == -1)
             goto errorexit;
 
-    *data = buf.inbuf;
+    if (inpos_t)
+        *inpos_t = buf.inpos;
     Py_XDECREF(buf.excobj);
     return buf.outobj;
 
@@ -527,7 +551,6 @@
                       PyObject *args, PyObject *kwargs)
 {
     MultibyteCodec_State state;
-    Py_UNICODE *data;
     PyObject *errorcb, *r, *arg, *ucvt;
     const char *errors = NULL;
     Py_ssize_t datalen;
@@ -550,11 +573,11 @@
         }
     }
 
-    data = PyUnicode_AsUnicodeAndSize(arg, &datalen);
-    if (data == NULL) {
+    if (PyUnicode_READY(arg) < 0) {
         Py_XDECREF(ucvt);
         return NULL;
     }
+    datalen = PyUnicode_GET_LENGTH(arg);
 
     errorcb = internal_error_callback(errors);
     if (errorcb == NULL) {
@@ -566,7 +589,7 @@
         self->codec->encinit(&state, self->codec->config) != 0)
         goto errorexit;
     r = multibytecodec_encode(self->codec, &state,
-                    (const Py_UNICODE **)&data, datalen, errorcb,
+                    arg, NULL, errorcb,
                     MBENC_FLUSH | MBENC_RESET);
     if (r == NULL)
         goto errorexit;
@@ -712,8 +735,9 @@
                         PyObject *unistr, int final)
 {
     PyObject *ucvt, *r = NULL;
-    Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL;
-    Py_ssize_t datalen, origpending;
+    PyObject *inbuf = NULL;
+    Py_ssize_t inpos, datalen;
+    PyObject *origpending = NULL;
     wchar_t *data;
 
     if (PyUnicode_Check(unistr))
@@ -733,66 +757,64 @@
     data = PyUnicode_AsUnicodeAndSize(unistr, &datalen);
     if (data == NULL)
         goto errorexit;
-    origpending = ctx->pendingsize;
 
-    if (origpending > 0) {
-        if (datalen > PY_SSIZE_T_MAX - ctx->pendingsize) {
-            PyErr_NoMemory();
-            /* inbuf_tmp == NULL */
-            goto errorexit;
-        }
-        inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize);
+    if (ctx->pending) {
+        PyObject *inbuf_tmp;
+
+        Py_INCREF(ctx->pending);
+        origpending = ctx->pending;
+
+        Py_INCREF(ctx->pending);
+        inbuf_tmp = ctx->pending;
+        PyUnicode_Append(&inbuf_tmp, unistr);
         if (inbuf_tmp == NULL)
             goto errorexit;
-        memcpy(inbuf_tmp, ctx->pending,
-            Py_UNICODE_SIZE * ctx->pendingsize);
-        memcpy(inbuf_tmp + ctx->pendingsize,
-            PyUnicode_AS_UNICODE(unistr),
-            Py_UNICODE_SIZE * datalen);
-        datalen += ctx->pendingsize;
-        ctx->pendingsize = 0;
+        Py_CLEAR(ctx->pending);
         inbuf = inbuf_tmp;
     }
-    else
-        inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr);
+    else {
+        origpending = NULL;
 
-    inbuf_end = inbuf + datalen;
+        Py_INCREF(unistr);
+        inbuf = unistr;
+    }
+    if (PyUnicode_READY(inbuf) < 0)
+        goto errorexit;
+    inpos = 0;
+    datalen = PyUnicode_GET_LENGTH(inbuf);
 
     r = multibytecodec_encode(ctx->codec, &ctx->state,
-                    (const Py_UNICODE **)&inbuf, datalen,
-                    ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0);
+                              inbuf, &inpos,
+                              ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0);
     if (r == NULL) {
         /* recover the original pending buffer */
-        if (origpending > 0)
-            memcpy(ctx->pending, inbuf_tmp,
-                Py_UNICODE_SIZE * origpending);
-        ctx->pendingsize = origpending;
+        Py_CLEAR(ctx->pending);
+        ctx->pending = origpending;
+        origpending = NULL;
         goto errorexit;
     }
 
-    if (inbuf < inbuf_end) {
-        ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf);
-        if (ctx->pendingsize > MAXENCPENDING) {
+    if (inpos < datalen) {
+        if (datalen - inpos > MAXENCPENDING) {
             /* normal codecs can't reach here */
-            ctx->pendingsize = 0;
             PyErr_SetString(PyExc_UnicodeError,
                             "pending buffer overflow");
             goto errorexit;
         }
-        memcpy(ctx->pending, inbuf,
-            ctx->pendingsize * Py_UNICODE_SIZE);
+        ctx->pending = PyUnicode_Substring(inbuf, inpos, datalen);
+        if (ctx->pending == NULL) {
+            /* normal codecs can't reach here */
+            goto errorexit;
+        }
     }
 
-    if (inbuf_tmp != NULL)
-        PyMem_Del(inbuf_tmp);
     Py_XDECREF(ucvt);
     return r;
 
 errorexit:
-    if (inbuf_tmp != NULL)
-        PyMem_Del(inbuf_tmp);
     Py_XDECREF(r);
     Py_XDECREF(ucvt);
+    Py_XDECREF(origpending);
     return NULL;
 }
 
@@ -876,7 +898,7 @@
         if (r != 0)
             return NULL;
     }
-    self->pendingsize = 0;
+    Py_CLEAR(self->pending);
     Py_RETURN_NONE;
 }
 
@@ -912,7 +934,7 @@
     }
 
     self->codec = ((MultibyteCodecObject *)codec)->codec;
-    self->pendingsize = 0;
+    self->pending = NULL;
     self->errors = internal_error_callback(errors);
     if (self->errors == NULL)
         goto errorexit;
@@ -1598,18 +1620,16 @@
 static PyObject *
 mbstreamwriter_reset(MultibyteStreamWriterObject *self)
 {
-    const Py_UNICODE *pending;
     PyObject *pwrt;
 
-    pending = self->pending;
     pwrt = multibytecodec_encode(self->codec, &self->state,
-                    &pending, self->pendingsize, self->errors,
+                    self->pending, NULL, self->errors,
                     MBENC_FLUSH | MBENC_RESET);
     /* some pending buffer can be truncated when UnicodeEncodeError is
      * raised on 'strict' mode. but, 'reset' method is designed to
      * reset the pending buffer or states so failed string sequence
      * ought to be missed */
-    self->pendingsize = 0;
+    Py_CLEAR(self->pending);
     if (pwrt == NULL)
         return NULL;
 
@@ -1655,7 +1675,7 @@
     self->codec = ((MultibyteCodecObject *)codec)->codec;
     self->stream = stream;
     Py_INCREF(stream);
-    self->pendingsize = 0;
+    self->pending = NULL;
     self->errors = internal_error_callback(errors);
     if (self->errors == NULL)
         goto errorexit;
diff --git a/Modules/cjkcodecs/multibytecodec.h b/Modules/cjkcodecs/multibytecodec.h
--- a/Modules/cjkcodecs/multibytecodec.h
+++ b/Modules/cjkcodecs/multibytecodec.h
@@ -27,7 +27,8 @@
 typedef int (*mbcodec_init)(const void *config);
 typedef Py_ssize_t (*mbencode_func)(MultibyteCodec_State *state,
                         const void *config,
-                        const Py_UNICODE **inbuf, Py_ssize_t inleft,
+                        int kind, void *data,
+                        Py_ssize_t *inpos, Py_ssize_t inlen,
                         unsigned char **outbuf, Py_ssize_t outleft,
                         int flags);
 typedef int (*mbencodeinit_func)(MultibyteCodec_State *state,
@@ -75,8 +76,7 @@
 #define MAXENCPENDING   2
 #define _MultibyteStatefulEncoder_HEAD          \
     _MultibyteStatefulCodec_HEAD                \
-    Py_UNICODE pending[MAXENCPENDING];          \
-    Py_ssize_t pendingsize;
+    PyObject *pending;
 typedef struct {
     _MultibyteStatefulEncoder_HEAD
 } MultibyteStatefulEncoderContext;

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list