[pypy-commit] pypy testing-cleanup-py3k: Update _sre.c to CPython 3.3.5 (doesn't compile because we're missing most of the PEP393 API)

rlamy pypy.commits at gmail.com
Tue Jun 21 11:42:54 EDT 2016


Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: testing-cleanup-py3k
Changeset: r85309:b60d0188fb25
Date: 2016-06-21 16:41 +0100
http://bitbucket.org/pypy/pypy/changeset/b60d0188fb25/

Log:	Update _sre.c to CPython 3.3.5 (doesn't compile because we're
	missing most of the PEP393 API)

diff --git a/pypy/module/cpyext/test/_sre.c b/pypy/module/cpyext/test/_sre.c
--- a/pypy/module/cpyext/test/_sre.c
+++ b/pypy/module/cpyext/test/_sre.c
@@ -164,21 +164,17 @@
 
 /* unicode-specific character predicates */
 
-#if defined(HAVE_UNICODE)
-
-#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
-#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
-#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
-#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
-#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
+#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
+#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
+#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
+#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
+#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
 
 static unsigned int sre_lower_unicode(unsigned int ch)
 {
-    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
+    return (unsigned int) Py_UNICODE_TOLOWER(ch);
 }
 
-#endif
-
 LOCAL(int)
 sre_category(SRE_CODE category, unsigned int ch)
 {
@@ -206,7 +202,6 @@
     case SRE_CATEGORY_LOC_NOT_WORD:
         return !SRE_LOC_IS_WORD(ch);
 
-#if defined(HAVE_UNICODE)
     case SRE_CATEGORY_UNI_DIGIT:
         return SRE_UNI_IS_DIGIT(ch);
     case SRE_CATEGORY_UNI_NOT_DIGIT:
@@ -223,24 +218,6 @@
         return SRE_UNI_IS_LINEBREAK(ch);
     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
         return !SRE_UNI_IS_LINEBREAK(ch);
-#else
-    case SRE_CATEGORY_UNI_DIGIT:
-        return SRE_IS_DIGIT(ch);
-    case SRE_CATEGORY_UNI_NOT_DIGIT:
-        return !SRE_IS_DIGIT(ch);
-    case SRE_CATEGORY_UNI_SPACE:
-        return SRE_IS_SPACE(ch);
-    case SRE_CATEGORY_UNI_NOT_SPACE:
-        return !SRE_IS_SPACE(ch);
-    case SRE_CATEGORY_UNI_WORD:
-        return SRE_LOC_IS_WORD(ch);
-    case SRE_CATEGORY_UNI_NOT_WORD:
-        return !SRE_LOC_IS_WORD(ch);
-    case SRE_CATEGORY_UNI_LINEBREAK:
-        return SRE_IS_LINEBREAK(ch);
-    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
-        return !SRE_IS_LINEBREAK(ch);
-#endif
     }
     return 0;
 }
@@ -266,7 +243,7 @@
     if (cursize < minsize) {
         void* stack;
         cursize = minsize+minsize/4+1024;
-        TRACE(("allocate/grow stack %d\n", cursize));
+        TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
         stack = PyMem_REALLOC(state->data_stack, cursize);
         if (!stack) {
             data_stack_dealloc(state);
@@ -281,6 +258,7 @@
 /* generate 8-bit version */
 
 #define SRE_CHAR unsigned char
+#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
 #define SRE_AT sre_at
 #define SRE_COUNT sre_count
 #define SRE_CHARSET sre_charset
@@ -288,15 +266,11 @@
 #define SRE_MATCH sre_match
 #define SRE_MATCH_CONTEXT sre_match_context
 #define SRE_SEARCH sre_search
-#define SRE_LITERAL_TEMPLATE sre_literal_template
-
-#if defined(HAVE_UNICODE)
 
 #define SRE_RECURSIVE
 #include "_sre.c"
 #undef SRE_RECURSIVE
 
-#undef SRE_LITERAL_TEMPLATE
 #undef SRE_SEARCH
 #undef SRE_MATCH
 #undef SRE_MATCH_CONTEXT
@@ -305,10 +279,15 @@
 #undef SRE_COUNT
 #undef SRE_AT
 #undef SRE_CHAR
-
-/* generate 16-bit unicode version */
-
-#define SRE_CHAR Py_UNICODE
+#undef SRE_CHARGET
+
+/* generate 8/16/32-bit unicode version */
+
+#define SRE_CHAR void
+#define SRE_CHARGET(state, buf, index) \
+    ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
+     (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
+     ((Py_UCS4*)buf)[index])
 #define SRE_AT sre_uat
 #define SRE_COUNT sre_ucount
 #define SRE_CHARSET sre_ucharset
@@ -316,8 +295,6 @@
 #define SRE_MATCH sre_umatch
 #define SRE_MATCH_CONTEXT sre_umatch_context
 #define SRE_SEARCH sre_usearch
-#define SRE_LITERAL_TEMPLATE sre_uliteral_template
-#endif
 
 #endif /* SRE_RECURSIVE */
 
@@ -328,7 +305,7 @@
    settings */
 
 LOCAL(int)
-SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
+SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
 {
     /* check if pointer is at given position */
 
@@ -342,16 +319,16 @@
 
     case SRE_AT_BEGINNING_LINE:
         return ((void*) ptr == state->beginning ||
-                SRE_IS_LINEBREAK((int) ptr[-1]));
+                SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
 
     case SRE_AT_END:
-        return (((void*) (ptr+1) == state->end &&
-                 SRE_IS_LINEBREAK((int) ptr[0])) ||
+        return (((void*) (ptr+state->charsize) == state->end &&
+                 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
                 ((void*) ptr == state->end));
 
     case SRE_AT_END_LINE:
         return ((void*) ptr == state->end ||
-                SRE_IS_LINEBREAK((int) ptr[0]));
+                SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
 
     case SRE_AT_END_STRING:
         return ((void*) ptr == state->end);
@@ -360,57 +337,55 @@
         if (state->beginning == state->end)
             return 0;
         thatp = ((void*) ptr > state->beginning) ?
-            SRE_IS_WORD((int) ptr[-1]) : 0;
+            SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
         thisp = ((void*) ptr < state->end) ?
-            SRE_IS_WORD((int) ptr[0]) : 0;
+            SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
         return thisp != thatp;
 
     case SRE_AT_NON_BOUNDARY:
         if (state->beginning == state->end)
             return 0;
         thatp = ((void*) ptr > state->beginning) ?
-            SRE_IS_WORD((int) ptr[-1]) : 0;
+            SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
         thisp = ((void*) ptr < state->end) ?
-            SRE_IS_WORD((int) ptr[0]) : 0;
+            SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
         return thisp == thatp;
 
     case SRE_AT_LOC_BOUNDARY:
         if (state->beginning == state->end)
             return 0;
         thatp = ((void*) ptr > state->beginning) ?
-            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+            SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
         thisp = ((void*) ptr < state->end) ?
-            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+            SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
         return thisp != thatp;
 
     case SRE_AT_LOC_NON_BOUNDARY:
         if (state->beginning == state->end)
             return 0;
         thatp = ((void*) ptr > state->beginning) ?
-            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+            SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
         thisp = ((void*) ptr < state->end) ?
-            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+            SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
         return thisp == thatp;
 
-#if defined(HAVE_UNICODE)
     case SRE_AT_UNI_BOUNDARY:
         if (state->beginning == state->end)
             return 0;
         thatp = ((void*) ptr > state->beginning) ?
-            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+            SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
         thisp = ((void*) ptr < state->end) ?
-            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+            SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
         return thisp != thatp;
 
     case SRE_AT_UNI_NON_BOUNDARY:
         if (state->beginning == state->end)
             return 0;
         thatp = ((void*) ptr > state->beginning) ?
-            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+            SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
         thisp = ((void*) ptr < state->end) ?
-            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+            SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
         return thisp == thatp;
-#endif
 
     }
 
@@ -513,28 +488,29 @@
 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
 {
     SRE_CODE chr;
-    SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
-    SRE_CHAR* end = (SRE_CHAR *)state->end;
+    char* ptr = (char *)state->ptr;
+    char* end = (char *)state->end;
     Py_ssize_t i;
 
     /* adjust end */
-    if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
-        end = ptr + maxcount;
+    if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
+        end = ptr + maxcount*state->charsize;
 
     switch (pattern[0]) {
 
     case SRE_OP_IN:
         /* repeated set */
         TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
-        while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
-            ptr++;
+        while (ptr < end &&
+               SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
+            ptr += state->charsize;
         break;
 
     case SRE_OP_ANY:
         /* repeated dot wildcard. */
         TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
-        while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
-            ptr++;
+        while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
+            ptr += state->charsize;
         break;
 
     case SRE_OP_ANY_ALL:
@@ -548,51 +524,52 @@
         /* repeated literal */
         chr = pattern[1];
         TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
-        while (ptr < end && (SRE_CODE) *ptr == chr)
-            ptr++;
+        while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
+            ptr += state->charsize;
         break;
 
     case SRE_OP_LITERAL_IGNORE:
         /* repeated literal */
         chr = pattern[1];
         TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
-        while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
-            ptr++;
+        while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
+            ptr += state->charsize;
         break;
 
     case SRE_OP_NOT_LITERAL:
         /* repeated non-literal */
         chr = pattern[1];
         TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
-        while (ptr < end && (SRE_CODE) *ptr != chr)
-            ptr++;
+        while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
+            ptr += state->charsize;
         break;
 
     case SRE_OP_NOT_LITERAL_IGNORE:
         /* repeated non-literal */
         chr = pattern[1];
         TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
-        while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
-            ptr++;
+        while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
+            ptr += state->charsize;
         break;
 
     default:
         /* repeated single character pattern */
         TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
-        while ((SRE_CHAR*) state->ptr < end) {
+        while ((char*) state->ptr < end) {
             i = SRE_MATCH(state, pattern);
             if (i < 0)
                 return i;
             if (!i)
                 break;
         }
-        TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
-               (SRE_CHAR*) state->ptr - ptr));
-        return (SRE_CHAR*) state->ptr - ptr;
+        TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
+               ((char*)state->ptr - ptr)/state->charsize));
+        return ((char*)state->ptr - ptr)/state->charsize;
     }
 
-    TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
-    return ptr - (SRE_CHAR*) state->ptr;
+    TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
+           (ptr - (char*) state->ptr)/state->charsize));
+    return (ptr - (char*) state->ptr)/state->charsize;
 }
 
 #if 0 /* not used in this release */
@@ -603,19 +580,19 @@
        returns the number of SRE_CODE objects to skip if successful, 0
        if no match */
 
-    SRE_CHAR* end = state->end;
-    SRE_CHAR* ptr = state->ptr;
+    char* end = state->end;
+    char* ptr = state->ptr;
     Py_ssize_t i;
 
     /* check minimal length */
-    if (pattern[3] && (end - ptr) < pattern[3])
+    if (pattern[3] && (end - ptr)/state->charsize < pattern[3])
         return 0;
 
     /* check known prefix */
     if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
         /* <length> <skip> <prefix data> <overlap data> */
         for (i = 0; i < pattern[5]; i++)
-            if ((SRE_CODE) ptr[i] != pattern[7 + i])
+            if ((SRE_CODE) SRE_CHARGET(state, ptr, i) != pattern[7 + i])
                 return 0;
         return pattern[0] + 2 * pattern[6];
     }
@@ -678,9 +655,10 @@
 #define DATA_STACK_ALLOC(state, type, ptr) \
 do { \
     alloc_pos = state->data_stack_base; \
-    TRACE(("allocating %s in %d (%d)\n", \
+    TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
+           "(%" PY_FORMAT_SIZE_T "d)\n", \
            SFY(type), alloc_pos, sizeof(type))); \
-    if (state->data_stack_size < alloc_pos+sizeof(type)) { \
+    if (sizeof(type) > state->data_stack_size - alloc_pos) { \
         int j = data_stack_grow(state, sizeof(type)); \
         if (j < 0) return j; \
         if (ctx_pos != -1) \
@@ -692,15 +670,16 @@
 
 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
 do { \
-    TRACE(("looking up %s at %d\n", SFY(type), pos)); \
+    TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
     ptr = (type*)(state->data_stack+pos); \
 } while (0)
 
 #define DATA_STACK_PUSH(state, data, size) \
 do { \
-    TRACE(("copy data in %p to %d (%d)\n", \
+    TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
+           "(%" PY_FORMAT_SIZE_T "d)\n", \
            data, state->data_stack_base, size)); \
-    if (state->data_stack_size < state->data_stack_base+size) { \
+    if (size > state->data_stack_size - state->data_stack_base) { \
         int j = data_stack_grow(state, size); \
         if (j < 0) return j; \
         if (ctx_pos != -1) \
@@ -712,7 +691,8 @@
 
 #define DATA_STACK_POP(state, data, size, discard) \
 do { \
-    TRACE(("copy data to %p from %d (%d)\n", \
+    TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
+           "(%" PY_FORMAT_SIZE_T "d)\n", \
            data, state->data_stack_base-size, size)); \
     memcpy(data, state->data_stack+state->data_stack_base-size, size); \
     if (discard) \
@@ -721,7 +701,8 @@
 
 #define DATA_STACK_POP_DISCARD(state, size) \
 do { \
-    TRACE(("discard data from %d (%d)\n", \
+    TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
+           "(%" PY_FORMAT_SIZE_T "d)\n", \
            state->data_stack_base-size, size)); \
     state->data_stack_base -= size; \
 } while(0)
@@ -784,7 +765,7 @@
 typedef struct {
     Py_ssize_t last_ctx_pos;
     Py_ssize_t jump;
-    SRE_CHAR* ptr;
+    char* ptr;
     SRE_CODE* pattern;
     Py_ssize_t count;
     Py_ssize_t lastmark;
@@ -800,7 +781,7 @@
 LOCAL(Py_ssize_t)
 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
 {
-    SRE_CHAR* end = (SRE_CHAR *)state->end;
+    char* end = (char*)state->end;
     Py_ssize_t alloc_pos, ctx_pos = -1;
     Py_ssize_t i, ret = 0;
     Py_ssize_t jump;
@@ -819,14 +800,16 @@
 
 entrance:
 
-    ctx->ptr = (SRE_CHAR *)state->ptr;
+    ctx->ptr = (char *)state->ptr;
 
     if (ctx->pattern[0] == SRE_OP_INFO) {
         /* optimization info block */
         /* <INFO> <1=skip> <2=flags> <3=min> ... */
-        if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
-            TRACE(("reject (got %d chars, need %d)\n",
-                   (end - ctx->ptr), ctx->pattern[3]));
+        if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
+            TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
+                   "need %" PY_FORMAT_SIZE_T "d)\n",
+                   (end - ctx->ptr)/state->charsize,
+                   (Py_ssize_t) ctx->pattern[3]));
             RETURN_FAILURE;
         }
         ctx->pattern += ctx->pattern[1] + 1;
@@ -866,10 +849,10 @@
             /* <LITERAL> <code> */
             TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
                    ctx->ptr, *ctx->pattern));
-            if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
+            if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
                 RETURN_FAILURE;
             ctx->pattern++;
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_NOT_LITERAL:
@@ -877,10 +860,10 @@
             /* <NOT_LITERAL> <code> */
             TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
                    ctx->ptr, *ctx->pattern));
-            if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
+            if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
                 RETURN_FAILURE;
             ctx->pattern++;
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_SUCCESS:
@@ -903,19 +886,19 @@
             /* <CATEGORY> <code> */
             TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
                    ctx->ptr, *ctx->pattern));
-            if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
+            if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
                 RETURN_FAILURE;
             ctx->pattern++;
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_ANY:
             /* match anything (except a newline) */
             /* <ANY> */
             TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
-            if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
-                RETURN_FAILURE;
-            ctx->ptr++;
+                if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
+                    RETURN_FAILURE;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_ANY_ALL:
@@ -924,47 +907,47 @@
             TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
             if (ctx->ptr >= end)
                 RETURN_FAILURE;
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_IN:
             /* match set member (or non_member) */
             /* <IN> <skip> <set> */
             TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
-            if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
-                RETURN_FAILURE;
+                if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
+                    RETURN_FAILURE;
             ctx->pattern += ctx->pattern[0];
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_LITERAL_IGNORE:
             TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
                    ctx->pattern, ctx->ptr, ctx->pattern[0]));
             if (ctx->ptr >= end ||
-                state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
+                state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
                 RETURN_FAILURE;
             ctx->pattern++;
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_NOT_LITERAL_IGNORE:
             TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
                    ctx->pattern, ctx->ptr, *ctx->pattern));
             if (ctx->ptr >= end ||
-                state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
+                state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
                 RETURN_FAILURE;
             ctx->pattern++;
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_IN_IGNORE:
             TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
             if (ctx->ptr >= end
                 || !SRE_CHARSET(ctx->pattern+1,
-                                (SRE_CODE)state->lower(*ctx->ptr)))
+                                (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
                 RETURN_FAILURE;
             ctx->pattern += ctx->pattern[0];
-            ctx->ptr++;
+            ctx->ptr += state->charsize;
             break;
 
         case SRE_OP_JUMP:
@@ -987,11 +970,11 @@
             for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
                 if (ctx->pattern[1] == SRE_OP_LITERAL &&
                     (ctx->ptr >= end ||
-                     (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
+                     (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
                     continue;
                 if (ctx->pattern[1] == SRE_OP_IN &&
                     (ctx->ptr >= end ||
-                     !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
+                     !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
                     continue;
                 state->ptr = ctx->ptr;
                 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
@@ -1022,7 +1005,7 @@
             TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
                    ctx->pattern[1], ctx->pattern[2]));
 
-            if (ctx->ptr + ctx->pattern[1] > end)
+            if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
                 RETURN_FAILURE; /* cannot match */
 
             state->ptr = ctx->ptr;
@@ -1031,7 +1014,7 @@
             RETURN_ON_ERROR(ret);
             DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
             ctx->count = ret;
-            ctx->ptr += ctx->count;
+            ctx->ptr += state->charsize * ctx->count;
 
             /* when we arrive here, count contains the number of
                matches, and ctx->ptr points to the tail of the target
@@ -1055,8 +1038,9 @@
                 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
                 for (;;) {
                     while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
-                           (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
-                        ctx->ptr--;
+                           (ctx->ptr >= end ||
+                            SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
+                        ctx->ptr -= state->charsize;
                         ctx->count--;
                     }
                     if (ctx->count < (Py_ssize_t) ctx->pattern[1])
@@ -1071,7 +1055,7 @@
 
                     LASTMARK_RESTORE();
 
-                    ctx->ptr--;
+                    ctx->ptr -= state->charsize;
                     ctx->count--;
                 }
 
@@ -1085,7 +1069,7 @@
                         RETURN_ON_ERROR(ret);
                         RETURN_SUCCESS;
                     }
-                    ctx->ptr--;
+                    ctx->ptr -= state->charsize;
                     ctx->count--;
                     LASTMARK_RESTORE();
                 }
@@ -1105,7 +1089,7 @@
             TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
                    ctx->pattern[1], ctx->pattern[2]));
 
-            if (ctx->ptr + ctx->pattern[1] > end)
+            if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
                 RETURN_FAILURE; /* cannot match */
 
             state->ptr = ctx->ptr;
@@ -1122,7 +1106,7 @@
                     RETURN_FAILURE;
                 /* advance past minimum matches of repeat */
                 ctx->count = ret;
-                ctx->ptr += ctx->count;
+                ctx->ptr += state->charsize * ctx->count;
             }
 
             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
@@ -1149,7 +1133,7 @@
                     if (ret == 0)
                         break;
                     assert(ret == 1);
-                    ctx->ptr++;
+                    ctx->ptr += state->charsize;
                     ctx->count++;
                     LASTMARK_RESTORE();
                 }
@@ -1201,10 +1185,10 @@
 
             ctx->count = ctx->u.rep->count+1;
 
-            TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
+            TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
                    ctx->ptr, ctx->count));
 
-            if (ctx->count < ctx->u.rep->pattern[1]) {
+            if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
                 /* not enough matches */
                 ctx->u.rep->count = ctx->count;
                 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
@@ -1218,7 +1202,7 @@
                 RETURN_FAILURE;
             }
 
-            if ((ctx->count < ctx->u.rep->pattern[2] ||
+            if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
                 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
                 state->ptr != ctx->u.rep->last_ptr) {
                 /* we may have enough matches, but if we can
@@ -1264,10 +1248,10 @@
 
             ctx->count = ctx->u.rep->count+1;
 
-            TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
+            TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
                    ctx->ptr, ctx->count, ctx->u.rep->pattern));
 
-            if (ctx->count < ctx->u.rep->pattern[1]) {
+            if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
                 /* not enough matches */
                 ctx->u.rep->count = ctx->count;
                 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
@@ -1296,7 +1280,7 @@
 
             LASTMARK_RESTORE();
 
-            if ((ctx->count >= ctx->u.rep->pattern[2]
+            if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
                 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
                 state->ptr == ctx->u.rep->last_ptr)
                 RETURN_FAILURE;
@@ -1326,14 +1310,16 @@
                 if (groupref >= state->lastmark) {
                     RETURN_FAILURE;
                 } else {
-                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
-                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
+                    char* p = (char*) state->mark[groupref];
+                    char* e = (char*) state->mark[groupref+1];
                     if (!p || !e || e < p)
                         RETURN_FAILURE;
                     while (p < e) {
-                        if (ctx->ptr >= end || *ctx->ptr != *p)
+                        if (ctx->ptr >= end ||
+                            SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
                             RETURN_FAILURE;
-                        p++; ctx->ptr++;
+                        p += state->charsize;
+                        ctx->ptr += state->charsize;
                     }
                 }
             }
@@ -1350,15 +1336,17 @@
                 if (groupref >= state->lastmark) {
                     RETURN_FAILURE;
                 } else {
-                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
-                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
+                    char* p = (char*) state->mark[groupref];
+                    char* e = (char*) state->mark[groupref+1];
                     if (!p || !e || e < p)
                         RETURN_FAILURE;
                     while (p < e) {
                         if (ctx->ptr >= end ||
-                            state->lower(*ctx->ptr) != state->lower(*p))
+                            state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
+                            state->lower(SRE_CHARGET(state, p, 0)))
                             RETURN_FAILURE;
-                        p++; ctx->ptr++;
+                        p += state->charsize;
+                        ctx->ptr += state->charsize;
                     }
                 }
             }
@@ -1392,7 +1380,7 @@
             /* <ASSERT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
                    ctx->ptr, ctx->pattern[1]));
-            state->ptr = ctx->ptr - ctx->pattern[1];
+            state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
             if (state->ptr < state->beginning)
                 RETURN_FAILURE;
             DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
@@ -1405,7 +1393,7 @@
             /* <ASSERT_NOT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
                    ctx->ptr, ctx->pattern[1]));
-            state->ptr = ctx->ptr - ctx->pattern[1];
+            state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
             if (state->ptr >= state->beginning) {
                 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
                 if (ret) {
@@ -1477,7 +1465,8 @@
             TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
             goto jump_assert_not;
         case JUMP_NONE:
-            TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
+            TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
+                   ctx->ptr, ret));
             break;
     }
 
@@ -1487,8 +1476,8 @@
 LOCAL(Py_ssize_t)
 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
 {
-    SRE_CHAR* ptr = (SRE_CHAR *)state->start;
-    SRE_CHAR* end = (SRE_CHAR *)state->end;
+    char* ptr = (char*)state->start;
+    char* end = (char*)state->end;
     Py_ssize_t status = 0;
     Py_ssize_t prefix_len = 0;
     Py_ssize_t prefix_skip = 0;
@@ -1506,9 +1495,9 @@
         if (pattern[3] > 1) {
             /* adjust end point (but make sure we leave at least one
                character in there, so literal search will work) */
-            end -= pattern[3]-1;
+            end -= (pattern[3]-1) * state->charsize;
             if (end <= ptr)
-                end = ptr+1;
+                end = ptr + state->charsize;
         }
 
         if (flags & SRE_INFO_PREFIX) {
@@ -1526,7 +1515,8 @@
         pattern += 1 + pattern[1];
     }
 
-    TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
+    TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
+           prefix, prefix_len, prefix_skip));
     TRACE(("charset = %p\n", charset));
 
 #if defined(USE_FAST_SEARCH)
@@ -1534,10 +1524,10 @@
         /* pattern starts with a known prefix.  use the overlap
            table to skip forward as fast as we possibly can */
         Py_ssize_t i = 0;
-        end = (SRE_CHAR *)state->end;
+        end = (char *)state->end;
         while (ptr < end) {
             for (;;) {
-                if ((SRE_CODE) ptr[0] != prefix[i]) {
+                if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
                     if (!i)
                         break;
                     else
@@ -1546,8 +1536,8 @@
                     if (++i == prefix_len) {
                         /* found a potential match */
                         TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
-                        state->start = ptr + 1 - prefix_len;
-                        state->ptr = ptr + 1 - prefix_len + prefix_skip;
+                        state->start = ptr - (prefix_len - 1) * state->charsize;
+                        state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
                         if (flags & SRE_INFO_LITERAL)
                             return 1; /* we got all of it */
                         status = SRE_MATCH(state, pattern + 2*prefix_skip);
@@ -1559,7 +1549,7 @@
                     break;
                 }
             }
-            ptr++;
+            ptr += state->charsize;
         }
         return 0;
     }
@@ -1569,15 +1559,16 @@
         /* pattern starts with a literal character.  this is used
            for short prefixes, and if fast search is disabled */
         SRE_CODE chr = pattern[1];
-        end = (SRE_CHAR *)state->end;
+        end = (char*)state->end;
         for (;;) {
-            while (ptr < end && (SRE_CODE) ptr[0] != chr)
-                ptr++;
+            while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
+                ptr += state->charsize;
             if (ptr >= end)
                 return 0;
             TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
             state->start = ptr;
-            state->ptr = ++ptr;
+            ptr += state->charsize;
+            state->ptr = ptr;
             if (flags & SRE_INFO_LITERAL)
                 return 1; /* we got all of it */
             status = SRE_MATCH(state, pattern + 2);
@@ -1586,10 +1577,10 @@
         }
     } else if (charset) {
         /* pattern starts with a character from a known set */
-        end = (SRE_CHAR *)state->end;
+        end = (char*)state->end;
         for (;;) {
-            while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
-                ptr++;
+            while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
+                ptr += state->charsize;
             if (ptr >= end)
                 return 0;
             TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
@@ -1598,13 +1589,14 @@
             status = SRE_MATCH(state, pattern);
             if (status != 0)
                 break;
-            ptr++;
+            ptr += state->charsize;
         }
     } else
         /* general case */
         while (ptr <= end) {
             TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
-            state->start = state->ptr = ptr++;
+            state->start = state->ptr = ptr;
+            ptr += state->charsize;
             status = SRE_MATCH(state, pattern);
             if (status != 0)
                 break;
@@ -1613,25 +1605,32 @@
     return status;
 }
 
-LOCAL(int)
-SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
+#if !defined(SRE_RECURSIVE)
+
+/* -------------------------------------------------------------------- */
+/* factories and destructors */
+
+/* see sre.h for object declarations */
+static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
+static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
+
+static int
+sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
 {
     /* check if given string is a literal template (i.e. no escapes) */
-    while (len-- > 0)
-        if (*ptr++ == '\\')
+    struct {
+        int charsize;
+    } state = {
+        charsize
+    };
+    while (len-- > 0) {
+        if (SRE_CHARGET((&state), ptr, 0) == '\\')
             return 0;
+        ptr += charsize;
+    }
     return 1;
 }
 
-#if !defined(SRE_RECURSIVE)
-
-/* -------------------------------------------------------------------- */
-/* factories and destructors */
-
-/* see sre.h for object declarations */
-static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
-static PyObject*pattern_scanner(PatternObject*, PyObject*);
-
 static PyObject *
 sre_codesize(PyObject* self, PyObject *unused)
 {
@@ -1647,11 +1646,7 @@
     if (flags & SRE_FLAG_LOCALE)
         return Py_BuildValue("i", sre_lower_locale(character));
     if (flags & SRE_FLAG_UNICODE)
-#if defined(HAVE_UNICODE)
         return Py_BuildValue("i", sre_lower_unicode(character));
-#else
-        return Py_BuildValue("i", sre_lower_locale(character));
-#endif
     return Py_BuildValue("i", sre_lower(character));
 }
 
@@ -1670,7 +1665,9 @@
 }
 
 static void*
-getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize, Py_buffer *view)
+getstring(PyObject* string, Py_ssize_t* p_length,
+          int* p_logical_charsize, int* p_charsize,
+          Py_buffer *view)
 {
     /* given a python object, return a data pointer, a length (in
        characters), and a character size.  return NULL if the object
@@ -1684,13 +1681,16 @@
     /* Unicode objects do not support the buffer API. So, get the data
        directly instead. */
     if (PyUnicode_Check(string)) {
-        ptr = (void *)PyUnicode_AS_DATA(string);
-        *p_length = PyUnicode_GET_SIZE(string);
-        *p_charsize = sizeof(Py_UNICODE);
+        if (PyUnicode_READY(string) == -1)
+            return NULL;
+        ptr = PyUnicode_DATA(string);
+        *p_length = PyUnicode_GET_LENGTH(string);
+        *p_charsize = PyUnicode_KIND(string);
+        *p_logical_charsize = 4;
         return ptr;
     }
 
-    /* get pointer to string buffer */
+    /* get pointer to byte string buffer */
     view->len = -1;
     buffer = Py_TYPE(string)->tp_as_buffer;
     if (!buffer || !buffer->bf_getbuffer ||
@@ -1713,10 +1713,6 @@
 
     if (PyBytes_Check(string) || bytes == size)
         charsize = 1;
-#if defined(HAVE_UNICODE)
-    else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
-        charsize = sizeof(Py_UNICODE);
-#endif
     else {
         PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
         goto err;
@@ -1724,6 +1720,7 @@
 
     *p_length = size;
     *p_charsize = charsize;
+    *p_logical_charsize = charsize;
 
     if (ptr == NULL) {
             PyErr_SetString(PyExc_ValueError,
@@ -1744,7 +1741,7 @@
     /* prepare state object */
 
     Py_ssize_t length;
-    int charsize;
+    int logical_charsize, charsize;
     void* ptr;
 
     memset(state, 0, sizeof(SRE_STATE));
@@ -1753,18 +1750,18 @@
     state->lastindex = -1;
 
     state->buffer.buf = NULL;
-    ptr = getstring(string, &length, &charsize, &state->buffer);
+    ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
     if (!ptr)
         goto err;
 
-    if (charsize == 1 && pattern->charsize > 1) {
+    if (logical_charsize == 1 && pattern->logical_charsize > 1) {
         PyErr_SetString(PyExc_TypeError,
-			"can't use a string pattern on a bytes-like object");
+                        "can't use a string pattern on a bytes-like object");
         goto err;
     }
-    if (charsize > 1 && pattern->charsize == 1) {
+    if (logical_charsize > 1 && pattern->logical_charsize == 1) {
         PyErr_SetString(PyExc_TypeError,
-			"can't use a bytes pattern on a string-like object");
+                        "can't use a bytes pattern on a string-like object");
         goto err;
     }
 
@@ -1779,6 +1776,7 @@
     else if (end > length)
         end = length;
 
+    state->logical_charsize = logical_charsize;
     state->charsize = charsize;
 
     state->beginning = ptr;
@@ -1794,11 +1792,7 @@
     if (pattern->flags & SRE_FLAG_LOCALE)
         state->lower = sre_lower_locale;
     else if (pattern->flags & SRE_FLAG_UNICODE)
-#if defined(HAVE_UNICODE)
         state->lower = sre_lower_unicode;
-#else
-        state->lower = sre_lower_locale;
-#endif
     else
         state->lower = sre_lower;
 
@@ -1905,12 +1899,10 @@
 
     TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
 
-    if (state.charsize == 1) {
+    if (state.logical_charsize == 1) {
         status = sre_match(&state, PatternObject_GetCode(self));
     } else {
-#if defined(HAVE_UNICODE)
         status = sre_umatch(&state, PatternObject_GetCode(self));
-#endif
     }
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
@@ -1942,12 +1934,10 @@
 
     TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
 
-    if (state.charsize == 1) {
+    if (state.logical_charsize == 1) {
         status = sre_search(&state, PatternObject_GetCode(self));
     } else {
-#if defined(HAVE_UNICODE)
         status = sre_usearch(&state, PatternObject_GetCode(self));
-#endif
     }
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
@@ -2089,16 +2079,14 @@
 
         state.ptr = state.start;
 
-        if (state.charsize == 1) {
+        if (state.logical_charsize == 1) {
             status = sre_search(&state, PatternObject_GetCode(self));
         } else {
-#if defined(HAVE_UNICODE)
             status = sre_usearch(&state, PatternObject_GetCode(self));
-#endif
         }
 
-	if (PyErr_Occurred())
-	    goto error;
+        if (PyErr_Occurred())
+            goto error;
 
         if (status <= 0) {
             if (status == 0)
@@ -2160,13 +2148,13 @@
 
 #if PY_VERSION_HEX >= 0x02020000
 static PyObject*
-pattern_finditer(PatternObject* pattern, PyObject* args)
+pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
 {
     PyObject* scanner;
     PyObject* search;
     PyObject* iterator;
 
-    scanner = pattern_scanner(pattern, args);
+    scanner = pattern_scanner(pattern, args, kw);
     if (!scanner)
         return NULL;
 
@@ -2219,16 +2207,14 @@
 
         state.ptr = state.start;
 
-        if (state.charsize == 1) {
+        if (state.logical_charsize == 1) {
             status = sre_search(&state, PatternObject_GetCode(self));
         } else {
-#if defined(HAVE_UNICODE)
             status = sre_usearch(&state, PatternObject_GetCode(self));
-#endif
         }
 
-	if (PyErr_Occurred())
-	    goto error;
+        if (PyErr_Occurred())
+            goto error;
 
         if (status <= 0) {
             if (status == 0)
@@ -2309,7 +2295,7 @@
     int status;
     Py_ssize_t n;
     Py_ssize_t i, b, e;
-    int bint;
+    int logical_charsize, charsize;
     int filter_is_callable;
     Py_buffer view;
 
@@ -2322,16 +2308,10 @@
         /* if not callable, check if it's a literal string */
         int literal;
         view.buf = NULL;
-        ptr = getstring(ptemplate, &n, &bint, &view);
-        b = bint;
+        ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
+        b = charsize;
         if (ptr) {
-            if (b == 1) {
-		    literal = sre_literal_template((unsigned char *)ptr, n);
-            } else {
-#if defined(HAVE_UNICODE)
-		    literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
-#endif
-            }
+            literal = sre_literal_template(b, ptr, n);
         } else {
             PyErr_Clear();
             literal = 0;
@@ -2375,16 +2355,14 @@
 
         state.ptr = state.start;
 
-        if (state.charsize == 1) {
+        if (state.logical_charsize == 1) {
             status = sre_search(&state, PatternObject_GetCode(self));
         } else {
-#if defined(HAVE_UNICODE)
             status = sre_usearch(&state, PatternObject_GetCode(self));
-#endif
         }
 
-	if (PyErr_Occurred())
-	    goto error;
+        if (PyErr_Occurred())
+            goto error;
 
         if (status <= 0) {
             if (status == 0)
@@ -2565,35 +2543,35 @@
 }
 
 PyDoc_STRVAR(pattern_match_doc,
-"match(string[, pos[, endpos]]) -> match object or None.\n\n\
+"match(string[, pos[, endpos]]) -> match object or None.\n\
     Matches zero or more characters at the beginning of the string");
 
 PyDoc_STRVAR(pattern_search_doc,
-"search(string[, pos[, endpos]]) -> match object or None.\n\n\
+"search(string[, pos[, endpos]]) -> match object or None.\n\
     Scan through string looking for a match, and return a corresponding\n\
     match object instance. Return None if no position in the string matches.");
 
 PyDoc_STRVAR(pattern_split_doc,
-"split(string[, maxsplit = 0])  -> list.\n\n\
+"split(string[, maxsplit = 0])  -> list.\n\
     Split string by the occurrences of pattern.");
 
 PyDoc_STRVAR(pattern_findall_doc,
-"findall(string[, pos[, endpos]]) -> list.\n\n\
+"findall(string[, pos[, endpos]]) -> list.\n\
    Return a list of all non-overlapping matches of pattern in string.");
 
 PyDoc_STRVAR(pattern_finditer_doc,
-"finditer(string[, pos[, endpos]]) -> iterator.\n\n\
+"finditer(string[, pos[, endpos]]) -> iterator.\n\
     Return an iterator over all non-overlapping matches for the \n\
     RE pattern in string. For each match, the iterator returns a\n\
     match object.");
 
 PyDoc_STRVAR(pattern_sub_doc,
-"sub(repl, string[, count = 0]) -> newstring.\n\n\
+"sub(repl, string[, count = 0]) -> newstring.\n\
     Return the string obtained by replacing the leftmost non-overlapping\n\
     occurrences of pattern in string by the replacement repl.");
 
 PyDoc_STRVAR(pattern_subn_doc,
-"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\n\
+"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
     Return the tuple (new_string, number_of_subs_made) found by replacing\n\
     the leftmost non-overlapping occurrences of pattern with the\n\
     replacement repl.");
@@ -2602,22 +2580,22 @@
 
 static PyMethodDef pattern_methods[] = {
     {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
-	pattern_match_doc},
+        pattern_match_doc},
     {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
-	pattern_search_doc},
+        pattern_search_doc},
     {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
-	pattern_sub_doc},
+        pattern_sub_doc},
     {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
-	pattern_subn_doc},
+        pattern_subn_doc},
     {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
-	pattern_split_doc},
+        pattern_split_doc},
     {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
-	pattern_findall_doc},
+        pattern_findall_doc},
 #if PY_VERSION_HEX >= 0x02020000
-    {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
-	pattern_finditer_doc},
+    {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
+        pattern_finditer_doc},
 #endif
-    {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
+    {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
     {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
     {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
     {NULL, NULL}
@@ -2636,31 +2614,31 @@
     PyVarObject_HEAD_INIT(NULL, 0)
     "_" SRE_MODULE ".SRE_Pattern",
     sizeof(PatternObject), sizeof(SRE_CODE),
-    (destructor)pattern_dealloc,	/* tp_dealloc */
-    0,					/* tp_print */
-    0,					/* tp_getattr */
-    0,					/* tp_setattr */
-    0,					/* tp_reserved */
-    0,					/* tp_repr */
-    0,					/* tp_as_number */
-    0,					/* tp_as_sequence */
-    0,					/* tp_as_mapping */
-    0,					/* tp_hash */
-    0,					/* tp_call */
-    0,					/* tp_str */
-    0,					/* tp_getattro */
-    0,					/* tp_setattro */
-    0,					/* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,			/* tp_flags */
-    pattern_doc,			/* tp_doc */
-    0,					/* tp_traverse */
-    0,					/* tp_clear */
-    0,					/* tp_richcompare */
-    offsetof(PatternObject, weakreflist),	/* tp_weaklistoffset */
-    0,					/* tp_iter */
-    0,					/* tp_iternext */
-    pattern_methods,			/* tp_methods */
-    pattern_members,			/* tp_members */
+    (destructor)pattern_dealloc,        /* tp_dealloc */
+    0,                                  /* tp_print */
+    0,                                  /* tp_getattr */
+    0,                                  /* tp_setattr */
+    0,                                  /* tp_reserved */
+    0,                                  /* tp_repr */
+    0,                                  /* tp_as_number */
+    0,                                  /* tp_as_sequence */
+    0,                                  /* tp_as_mapping */
+    0,                                  /* tp_hash */
+    0,                                  /* tp_call */
+    0,                                  /* tp_str */
+    0,                                  /* tp_getattro */
+    0,                                  /* tp_setattro */
+    0,                                  /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
+    pattern_doc,                        /* tp_doc */
+    0,                                  /* tp_traverse */
+    0,                                  /* tp_clear */
+    0,                                  /* tp_richcompare */
+    offsetof(PatternObject, weakreflist),       /* tp_weaklistoffset */
+    0,                                  /* tp_iter */
+    0,                                  /* tp_iternext */
+    pattern_methods,                    /* tp_methods */
+    pattern_members,                    /* tp_members */
 };
 
 static int _validate(PatternObject *self); /* Forward */
@@ -2701,13 +2679,6 @@
     for (i = 0; i < n; i++) {
         PyObject *o = PyList_GET_ITEM(code, i);
         unsigned long value = PyLong_AsUnsignedLong(o);
-        if (value == (unsigned long)-1 && PyErr_Occurred()) {
-            if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
-                PyErr_SetString(PyExc_OverflowError,
-                                "regular expression code size limit exceeded");
-            }
-            break;
-        }
         self->code[i] = (SRE_CODE) value;
         if ((unsigned long) self->code[i] != value) {
             PyErr_SetString(PyExc_OverflowError,
@@ -2721,11 +2692,14 @@
         return NULL;
     }
 
-    if (pattern == Py_None)
+    if (pattern == Py_None) {
+        self->logical_charsize = -1;
         self->charsize = -1;
+    }
     else {
         Py_ssize_t p_length;
-        if (!getstring(pattern, &p_length, &self->charsize, &self->view)) {
+        if (!getstring(pattern, &p_length, &self->logical_charsize,
+                       &self->charsize, &self->view)) {
             Py_DECREF(self);
             return NULL;
         }
@@ -2776,8 +2750,7 @@
     \_________\_____/        /
                \____________/
 
-   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
-   bytes wide (the latter if Python is compiled for "wide" unicode support).
+   It also helps that SRE_CODE is always an unsigned type.
 */
 
 /* Defining this one enables tracing of the validator */
@@ -2815,7 +2788,7 @@
         skip = *code;                                   \
         VTRACE(("%lu (skip to %p)\n",                   \
                (unsigned long)skip, code+skip));        \
-        if (code+skip-adj < code || code+skip-adj > end)\
+        if (skip-adj > end-code)                        \
             FAIL;                                       \
         code++;                                         \
     } while (0)
@@ -2848,7 +2821,7 @@
 
         case SRE_OP_CHARSET:
             offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
-            if (code+offset < code || code+offset > end)
+            if (offset > end-code)
                 FAIL;
             code += offset;
             break;
@@ -2856,7 +2829,7 @@
         case SRE_OP_BIGCHARSET:
             GET_ARG; /* Number of blocks */
             offset = 256/sizeof(SRE_CODE); /* 256-byte table */
-            if (code+offset < code || code+offset > end)
+            if (offset > end-code)
                 FAIL;
             /* Make sure that each byte points to a valid block */
             for (i = 0; i < 256; i++) {
@@ -2865,7 +2838,7 @@
             }
             code += offset;
             offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
-            if (code+offset < code || code+offset > end)
+            if (offset > end-code)
                 FAIL;
             code += offset;
             break;
@@ -2990,13 +2963,13 @@
                    <INFO> <1=skip> <2=flags> <3=min> <4=max>;
                    If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
                    more follows. */
-                SRE_CODE flags, min, max, i;
+                SRE_CODE flags, i;
                 SRE_CODE *newcode;
                 GET_SKIP;
                 newcode = code+skip-1;
                 GET_ARG; flags = arg;
-                GET_ARG; min = arg;
-                GET_ARG; max = arg;
+                GET_ARG;
+                GET_ARG;
                 /* Check that only valid flags are present */
                 if ((flags & ~(SRE_INFO_PREFIX |
                                SRE_INFO_LITERAL |
@@ -3012,15 +2985,15 @@
                     FAIL;
                 /* Validate the prefix */
                 if (flags & SRE_INFO_PREFIX) {
-                    SRE_CODE prefix_len, prefix_skip;
+                    SRE_CODE prefix_len;
                     GET_ARG; prefix_len = arg;
-                    GET_ARG; prefix_skip = arg;
+                    GET_ARG;
                     /* Here comes the prefix string */
-                    if (code+prefix_len < code || code+prefix_len > newcode)
+                    if (prefix_len > newcode-code)
                         FAIL;
                     code += prefix_len;
                     /* And here comes the overlap table */
-                    if (code+prefix_len < code || code+prefix_len > newcode)
+                    if (prefix_len > newcode-code)
                         FAIL;
                     /* Each overlap value should be < prefix_len */
                     for (i = 0; i < prefix_len; i++) {
@@ -3149,7 +3122,7 @@
                to allow arbitrary jumps anywhere in the code; so we just look
                for a JUMP opcode preceding our skip target.
             */
-            if (skip >= 3 && code+skip-3 >= code &&
+            if (skip >= 3 && skip-3 < end-code &&
                 code[skip-3] == SRE_OP_JUMP)
             {
                 VTRACE(("both then and else parts present\n"));
@@ -3261,8 +3234,8 @@
     Py_ssize_t i;
 
     if (index == NULL)
-	/* Default value */
-	return 0;
+        /* Default value */
+        return 0;
 
     if (PyLong_Check(index))
         return PyLong_AsSsize_t(index);
@@ -3590,36 +3563,36 @@
 Match objects always have a boolean value of True.");
 
 PyDoc_STRVAR(match_group_doc,
-"group([group1, ...]) -> str or tuple.\n\n\
+"group([group1, ...]) -> str or tuple.\n\
     Return subgroup(s) of the match by indices or names.\n\
     For 0 returns the entire match.");
 
 PyDoc_STRVAR(match_start_doc,
-"start([group=0]) -> int.\n\n\
+"start([group=0]) -> int.\n\
     Return index of the start of the substring matched by group.");
 
 PyDoc_STRVAR(match_end_doc,
-"end([group=0]) -> int.\n\n\
+"end([group=0]) -> int.\n\
     Return index of the end of the substring matched by group.");
 
 PyDoc_STRVAR(match_span_doc,
-"span([group]) -> tuple.\n\n\
+"span([group]) -> tuple.\n\
     For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
 
 PyDoc_STRVAR(match_groups_doc,
-"groups([default=None]) -> tuple.\n\n\
+"groups([default=None]) -> tuple.\n\
     Return a tuple containing all the subgroups of the match, from 1.\n\
     The default argument is used for groups\n\
     that did not participate in the match");
 
 PyDoc_STRVAR(match_groupdict_doc,
-"groupdict([default=None]) -> dict.\n\n\
+"groupdict([default=None]) -> dict.\n\
     Return a dictionary containing all the named subgroups of the match,\n\
     keyed by the subgroup name. The default argument is used for groups\n\
     that did not participate in the match");
 
 PyDoc_STRVAR(match_expand_doc,
-"expand(template) -> str.\n\n\
+"expand(template) -> str.\n\
     Return the string obtained by doing backslash substitution\n\
     on the string template, as done by the sub() method.");
 
@@ -3695,32 +3668,32 @@
     PyVarObject_HEAD_INIT(NULL,0)
     "_" SRE_MODULE ".SRE_Match",
     sizeof(MatchObject), sizeof(Py_ssize_t),
-    (destructor)match_dealloc,	/* tp_dealloc */
-    0,				/* tp_print */
-    0,				/* tp_getattr */
-    0,				/* tp_setattr */
-    0,				/* tp_reserved */
-    0,				/* tp_repr */
-    0,				/* tp_as_number */
-    0,				/* tp_as_sequence */
-    0,				/* tp_as_mapping */
-    0,				/* tp_hash */
-    0,				/* tp_call */
-    0,				/* tp_str */
-    0,				/* tp_getattro */
-    0,				/* tp_setattro */
-    0,				/* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,		/* tp_flags */
-    match_doc,			/* tp_doc */
-    0,				/* tp_traverse */
-    0,				/* tp_clear */
-    0,				/* tp_richcompare */
-    0,				/* tp_weaklistoffset */
-    0,				/* tp_iter */
-    0,				/* tp_iternext */
-    match_methods,		/* tp_methods */
-    match_members,		/* tp_members */
-    match_getset,  	        /* tp_getset */
+    (destructor)match_dealloc,  /* tp_dealloc */
+    0,                          /* tp_print */
+    0,                          /* tp_getattr */
+    0,                          /* tp_setattr */
+    0,                          /* tp_reserved */
+    0,                          /* tp_repr */
+    0,                          /* tp_as_number */
+    0,                          /* tp_as_sequence */
+    0,                          /* tp_as_mapping */
+    0,                          /* tp_hash */
+    0,                          /* tp_call */
+    0,                          /* tp_str */
+    0,                          /* tp_getattro */
+    0,                          /* tp_setattro */
+    0,                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,         /* tp_flags */
+    match_doc,                  /* tp_doc */
+    0,                          /* tp_traverse */
+    0,                          /* tp_clear */
+    0,                          /* tp_richcompare */
+    0,                          /* tp_weaklistoffset */
+    0,                          /* tp_iter */
+    0,                          /* tp_iternext */
+    match_methods,              /* tp_methods */
+    match_members,              /* tp_members */
+    match_getset,               /* tp_getset */
 };
 
 static PyObject*
@@ -3809,12 +3782,10 @@
 
     state->ptr = state->start;
 
-    if (state->charsize == 1) {
+    if (state->logical_charsize == 1) {
         status = sre_match(state, PatternObject_GetCode(self->pattern));
     } else {
-#if defined(HAVE_UNICODE)
         status = sre_umatch(state, PatternObject_GetCode(self->pattern));
-#endif
     }
     if (PyErr_Occurred())
         return NULL;
@@ -3842,12 +3813,10 @@
 
     state->ptr = state->start;
 
-    if (state->charsize == 1) {
+    if (state->logical_charsize == 1) {
         status = sre_search(state, PatternObject_GetCode(self->pattern));
     } else {
-#if defined(HAVE_UNICODE)
         status = sre_usearch(state, PatternObject_GetCode(self->pattern));
-#endif
     }
     if (PyErr_Occurred())
         return NULL;
@@ -3871,7 +3840,7 @@
 
 #define SCAN_OFF(x) offsetof(ScannerObject, x)
 static PyMemberDef scanner_members[] = {
-    {"pattern",	T_OBJECT,	SCAN_OFF(pattern),	READONLY},
+    {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
     {NULL}  /* Sentinel */
 };
 
@@ -3880,35 +3849,35 @@
     "_" SRE_MODULE ".SRE_Scanner",
     sizeof(ScannerObject), 0,
     (destructor)scanner_dealloc,/* tp_dealloc */
-    0,				/* tp_print */
-    0,				/* tp_getattr */
-    0,				/* tp_setattr */
-    0,				/* tp_reserved */
-    0,				/* tp_repr */
-    0,				/* tp_as_number */
-    0,				/* tp_as_sequence */
-    0,				/* tp_as_mapping */
-    0,				/* tp_hash */
-    0,				/* tp_call */
-    0,				/* tp_str */
-    0,				/* tp_getattro */
-    0,				/* tp_setattro */
-    0,				/* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,		/* tp_flags */
-    0,				/* tp_doc */
-    0,				/* tp_traverse */
-    0,				/* tp_clear */
-    0,				/* tp_richcompare */
-    0,				/* tp_weaklistoffset */
-    0,				/* tp_iter */
-    0,				/* tp_iternext */
-    scanner_methods,		/* tp_methods */
-    scanner_members,		/* tp_members */
-    0,				/* tp_getset */
+    0,                          /* tp_print */
+    0,                          /* tp_getattr */
+    0,                          /* tp_setattr */
+    0,                          /* tp_reserved */
+    0,                          /* tp_repr */
+    0,                          /* tp_as_number */
+    0,                          /* tp_as_sequence */
+    0,                          /* tp_as_mapping */
+    0,                          /* tp_hash */
+    0,                          /* tp_call */
+    0,                          /* tp_str */
+    0,                          /* tp_getattro */
+    0,                          /* tp_setattro */
+    0,                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,         /* tp_flags */
+    0,                          /* tp_doc */
+    0,                          /* tp_traverse */
+    0,                          /* tp_clear */
+    0,                          /* tp_richcompare */
+    0,                          /* tp_weaklistoffset */
+    0,                          /* tp_iter */
+    0,                          /* tp_iternext */
+    scanner_methods,            /* tp_methods */
+    scanner_members,            /* tp_members */
+    0,                          /* tp_getset */
 };
 
 static PyObject*
-pattern_scanner(PatternObject* pattern, PyObject* args)
+pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
 {
     /* create search state object */
 
@@ -3917,7 +3886,9 @@
     PyObject* string;
     Py_ssize_t start = 0;
     Py_ssize_t end = PY_SSIZE_T_MAX;
-    if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
+    static char* kwlist[] = { "source", "pos", "endpos", NULL };
+    if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
+                                     &string, &start, &end))
         return NULL;
 
     /* create scanner object */
@@ -3946,15 +3917,15 @@
 };
 
 static struct PyModuleDef sremodule = {
-	PyModuleDef_HEAD_INIT,
-	"_" SRE_MODULE,
-	NULL,
-	-1,
-	_functions,
-	NULL,
-	NULL,
-	NULL,
-	NULL
+        PyModuleDef_HEAD_INIT,
+        "_" SRE_MODULE,
+        NULL,
+        -1,
+        _functions,
+        NULL,
+        NULL,
+        NULL,
+        NULL
 };
 
 PyMODINIT_FUNC PyInit__sre(void)
@@ -3970,7 +3941,7 @@
 
     m = PyModule_Create(&sremodule);
     if (m == NULL)
-    	return NULL;
+        return NULL;
     d = PyModule_GetDict(m);
 
     x = PyLong_FromLong(SRE_MAGIC);


More information about the pypy-commit mailing list