Python's handling of unicode surrogates

Sat Apr 21 02:18:55 EDT 2007

Paul Boddie:

> Do we have a volunteer? ;-)

    I won't volunteer to do a real implementation - the Unicode type in 
Python is currently around 7000 lines long and there is other code to 
change in, for example, regular expressions. Here's a demonstration C++ 
implementation that stores an array of surrogate positions for indexing. 
  For text in which every character is a surrogate, this could lead to 
requiring 3 times as much storage (with a size_t requiring 64 bits for 
each 32 bit surrogate pair). Indexing is reasonably efficient, using a 
binary search through the surrogate positions so is proportional to 
log(number of surrogates).

    Code (not thoroughly tested):

/** @file surstr.cxx
** A simple Unicode string class that stores in UTF-16
** but indexes by character.
**/
// Copyright 2007 by Neil Hodgson <neilh at scintilla.org>
// This source code is public domain.

#include <string.h>
#include <stdio.h>

class surstr {
public:

     typedef wchar_t codePoint;
     enum { SURROGATE_LEAD_FIRST = 0xD800 };
     enum { SURROGATE_LEAD_LAST = 0xDBFF };
     enum { measure_length=0xffffffffU};

     codePoint *text;
     size_t length;
     size_t *surrogates;
     // Memory use would be decreased by allocating text and
     // surrogates as one block but the code is clearer this way
     size_t lengthSurrogates;

     static bool IsLead(codePoint cp) {
         return cp >= SURROGATE_LEAD_FIRST &&
             cp <= SURROGATE_LEAD_LAST;
     }

     void FindSurrogates() {
         lengthSurrogates = 0;
         for (size_t i=0; i < length; i++) {
             if (IsLead(text[i])) {
                 lengthSurrogates++;
             }
         }
         surrogates = new size_t[lengthSurrogates];
         size_t surr = 0;
         for (size_t i=0; i < length; i++) {
             if (IsLead(text[i])) {
                 surrogates[surr] = i - surr;
                 surr++;
             }
         }
     }

     size_t LinearIndexFromPosition(size_t position) const {
         // For checking that the binary search version works
         for (size_t i=0; i<lengthSurrogates; i++) {
             if (surrogates[i] >= position) {
                 return position + i;
             }
         }
         return position + lengthSurrogates;
     }

     size_t IndexFromPosition(size_t position) const {
         // Use a binary search to find index in log(lengthSurrogates)
         if (lengthSurrogates == 0)
             return position;
         if (position > surrogates[lengthSurrogates - 1])
             return position + lengthSurrogates;
         size_t lower = 0;
         size_t upper = lengthSurrogates-1;
         do {
             size_t middle = (upper + lower + 1) / 2; 	// Round high
             size_t posMiddle = surrogates[middle];
             if (position < posMiddle) {
                 upper = middle - 1;
             } else {
                 lower = middle;
             }
         } while (lower < upper);
         if (surrogates[lower] >= position)
             return position + lower;
         else
             return position + lower + 1;
     }

     size_t Length() const {
         return length - lengthSurrogates;
     }

     surstr() : text(0), length(0), surrogates(0), lengthSurrogates(0) {}

     // start and end are in code points
     surstr(codePoint *text_,
         size_t start=0, size_t end=measure_length) :
         text(0), length(0), surrogates(0), lengthSurrogates(0) {
         // Assert text_[start:end] only contains whole surrogate pairs
         if (end == measure_length) {
             end = 0;
             while (text_[end])
                 end++;
         }
         length = end - start;
         text = new codePoint[length];
         memcpy(text, text_, sizeof(codePoint) * length);
         FindSurrogates();
     }
     // start and end are in characters
     surstr(const surstr &source,
         size_t start=0, size_t end=measure_length) {
         size_t startIndex = source.IndexFromPosition(start);
         size_t endIndex;
         if (end == measure_length)
             endIndex = source.IndexFromPosition(source.Length());
         else
             endIndex = source.IndexFromPosition(end);

         length = endIndex - startIndex;
         text = new codePoint[length];
         memcpy(text, source.text + startIndex,
             sizeof(codePoint) * length);
         if (start == 0 && end == measure_length) {
             surrogates = new size_t[source.lengthSurrogates];
             memcpy(surrogates, source.surrogates,
                 sizeof(size_t) * source.lengthSurrogates);
             lengthSurrogates = source.lengthSurrogates;
         } else {
             FindSurrogates();
         }
     }
     ~surstr() {
         delete []text;
         text = 0;
         delete []surrogates;
         surrogates = 0;
     }
     void print() {
         for (size_t i=0;i<length;i++) {
             if (text[i] < 0x7f) {
                 printf("%c", text[i]);
             } else {
                 printf("\\u%04x", text[i]);
             }
         }
     }
};

void slicer(surstr &a) {
     printf("Length in characters = %d, code units = %d ==> ",
         a.Length(), a.length);
     a.print();
     printf("\n");
     for (size_t pos = 0; pos < a.Length(); pos++) {
         if (a.IndexFromPosition(pos) !=
             a.LinearIndexFromPosition(pos)) {
                 printf("  Failed at position %d -> %d",
                     pos, a.IndexFromPosition(pos));
         }
         printf("  [%0d] ", pos);
         surstr b(a, pos, pos+1);
         b.print();
         printf("\n");
     }
}

int main() {
     surstr n(L"");
     slicer(n);
     surstr a(L"a");
     slicer(a);
     surstr b(L"a\u6C348\U0001D11E-\U00010338!");
     slicer(b);
     printf("\n");
     surstr c(L"a\u6C348\U0001D11E\U0001D11E-\U00010338!"
         L"\U0001D11E\U0001D11Ea\u6C348\U0001D11E-\U00010338!");
     slicer(c);
     printf("\n");
}

    Test run:

Length in characters = 0, code units = 0 ==>
Length in characters = 1, code units = 1 ==> a
   [0] a
Length in characters = 7, code units = 9 ==> 
a\u6c348\ud834\udd1e-\ud800\udf38!
   [0] a
   [1] \u6c34
   [2] 8
   [3] \ud834\udd1e
   [4] -
   [5] \ud800\udf38
   [6] !

Length in characters = 17, code units = 24 ==> 
a\u6c348\ud834\udd1e\ud834\udd1e-\ud800\udf38!\ud834\udd1e\ud834\udd1ea\u6c348\ud834\udd1e-\ud800\udf38!
   [0] a
   [1] \u6c34
   [2] 8
   [3] \ud834\udd1e
   [4] \ud834\udd1e
   [5] -
   [6] \ud800\udf38
   [7] !
   [8] \ud834\udd1e
   [9] \ud834\udd1e
   [10] a
   [11] \u6c34
   [12] 8
   [13] \ud834\udd1e
   [14] -
   [15] \ud800\udf38
   [16] !

    Neil