Python's handling of unicode surrogates
Neil Hodgson
nyamatongwe+thunder at gmail.com
Sat Apr 21 02:18:55 EDT 2007
Paul Boddie:
> Do we have a volunteer? ;-)
I won't volunteer to do a real implementation - the Unicode type in
Python is currently around 7000 lines long and there is other code to
change in, for example, regular expressions. Here's a demonstration C++
implementation that stores an array of surrogate positions for indexing.
For text in which every character is a surrogate, this could lead to
requiring 3 times as much storage (with a size_t requiring 64 bits for
each 32 bit surrogate pair). Indexing is reasonably efficient, using a
binary search through the surrogate positions so is proportional to
log(number of surrogates).
Code (not thoroughly tested):
/** @file surstr.cxx
** A simple Unicode string class that stores in UTF-16
** but indexes by character.
**/
// Copyright 2007 by Neil Hodgson <neilh at scintilla.org>
// This source code is public domain.
#include <string.h>
#include <stdio.h>
class surstr {
public:
typedef wchar_t codePoint;
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_LEAD_LAST = 0xDBFF };
enum { measure_length=0xffffffffU};
codePoint *text;
size_t length;
size_t *surrogates;
// Memory use would be decreased by allocating text and
// surrogates as one block but the code is clearer this way
size_t lengthSurrogates;
static bool IsLead(codePoint cp) {
return cp >= SURROGATE_LEAD_FIRST &&
cp <= SURROGATE_LEAD_LAST;
}
void FindSurrogates() {
lengthSurrogates = 0;
for (size_t i=0; i < length; i++) {
if (IsLead(text[i])) {
lengthSurrogates++;
}
}
surrogates = new size_t[lengthSurrogates];
size_t surr = 0;
for (size_t i=0; i < length; i++) {
if (IsLead(text[i])) {
surrogates[surr] = i - surr;
surr++;
}
}
}
size_t LinearIndexFromPosition(size_t position) const {
// For checking that the binary search version works
for (size_t i=0; i<lengthSurrogates; i++) {
if (surrogates[i] >= position) {
return position + i;
}
}
return position + lengthSurrogates;
}
size_t IndexFromPosition(size_t position) const {
// Use a binary search to find index in log(lengthSurrogates)
if (lengthSurrogates == 0)
return position;
if (position > surrogates[lengthSurrogates - 1])
return position + lengthSurrogates;
size_t lower = 0;
size_t upper = lengthSurrogates-1;
do {
size_t middle = (upper + lower + 1) / 2; // Round high
size_t posMiddle = surrogates[middle];
if (position < posMiddle) {
upper = middle - 1;
} else {
lower = middle;
}
} while (lower < upper);
if (surrogates[lower] >= position)
return position + lower;
else
return position + lower + 1;
}
size_t Length() const {
return length - lengthSurrogates;
}
surstr() : text(0), length(0), surrogates(0), lengthSurrogates(0) {}
// start and end are in code points
surstr(codePoint *text_,
size_t start=0, size_t end=measure_length) :
text(0), length(0), surrogates(0), lengthSurrogates(0) {
// Assert text_[start:end] only contains whole surrogate pairs
if (end == measure_length) {
end = 0;
while (text_[end])
end++;
}
length = end - start;
text = new codePoint[length];
memcpy(text, text_, sizeof(codePoint) * length);
FindSurrogates();
}
// start and end are in characters
surstr(const surstr &source,
size_t start=0, size_t end=measure_length) {
size_t startIndex = source.IndexFromPosition(start);
size_t endIndex;
if (end == measure_length)
endIndex = source.IndexFromPosition(source.Length());
else
endIndex = source.IndexFromPosition(end);
length = endIndex - startIndex;
text = new codePoint[length];
memcpy(text, source.text + startIndex,
sizeof(codePoint) * length);
if (start == 0 && end == measure_length) {
surrogates = new size_t[source.lengthSurrogates];
memcpy(surrogates, source.surrogates,
sizeof(size_t) * source.lengthSurrogates);
lengthSurrogates = source.lengthSurrogates;
} else {
FindSurrogates();
}
}
~surstr() {
delete []text;
text = 0;
delete []surrogates;
surrogates = 0;
}
void print() {
for (size_t i=0;i<length;i++) {
if (text[i] < 0x7f) {
printf("%c", text[i]);
} else {
printf("\\u%04x", text[i]);
}
}
}
};
void slicer(surstr &a) {
printf("Length in characters = %d, code units = %d ==> ",
a.Length(), a.length);
a.print();
printf("\n");
for (size_t pos = 0; pos < a.Length(); pos++) {
if (a.IndexFromPosition(pos) !=
a.LinearIndexFromPosition(pos)) {
printf(" Failed at position %d -> %d",
pos, a.IndexFromPosition(pos));
}
printf(" [%0d] ", pos);
surstr b(a, pos, pos+1);
b.print();
printf("\n");
}
}
int main() {
surstr n(L"");
slicer(n);
surstr a(L"a");
slicer(a);
surstr b(L"a\u6C348\U0001D11E-\U00010338!");
slicer(b);
printf("\n");
surstr c(L"a\u6C348\U0001D11E\U0001D11E-\U00010338!"
L"\U0001D11E\U0001D11Ea\u6C348\U0001D11E-\U00010338!");
slicer(c);
printf("\n");
}
Test run:
Length in characters = 0, code units = 0 ==>
Length in characters = 1, code units = 1 ==> a
[0] a
Length in characters = 7, code units = 9 ==>
a\u6c348\ud834\udd1e-\ud800\udf38!
[0] a
[1] \u6c34
[2] 8
[3] \ud834\udd1e
[4] -
[5] \ud800\udf38
[6] !
Length in characters = 17, code units = 24 ==>
a\u6c348\ud834\udd1e\ud834\udd1e-\ud800\udf38!\ud834\udd1e\ud834\udd1ea\u6c348\ud834\udd1e-\ud800\udf38!
[0] a
[1] \u6c34
[2] 8
[3] \ud834\udd1e
[4] \ud834\udd1e
[5] -
[6] \ud800\udf38
[7] !
[8] \ud834\udd1e
[9] \ud834\udd1e
[10] a
[11] \u6c34
[12] 8
[13] \ud834\udd1e
[14] -
[15] \ud800\udf38
[16] !
Neil
More information about the Python-list
mailing list