[pypy-commit] pypy unicode-utf8: Fix test, improve logic
arigo
pypy.commits at gmail.com
Fri Aug 25 06:32:29 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92260:7193602c9384
Date: 2017-08-25 12:30 +0200
http://bitbucket.org/pypy/pypy/changeset/7193602c9384/
Log: Fix test, improve logic
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -93,6 +93,7 @@
"""Gives the position of the previous codepoint.
'pos' must not be zero.
"""
+ pos = r_uint(pos)
pos -= 1
chr1 = ord(code[pos])
if chr1 <= 0x7F:
@@ -142,6 +143,43 @@
(ordch4 & 0x3F)) # 0b00111111
assert False, "unreachable"
+def codepoint_before_pos(code, pos):
+ """Give a codepoint in code at the position immediately before pos
+ - assumes valid utf8, no checking!
+ """
+ pos = r_uint(pos)
+ ordch1 = ord(code[pos-1])
+ if ordch1 <= 0x7F:
+ return ordch1
+
+ ordch2 = ordch1
+ ordch1 = ord(code[pos-2])
+ if ordch1 >= 0xC0:
+ # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+ return (((ordch1 & 0x1F) << 6) + # 0b00011111
+ (ordch2 & 0x3F)) # 0b00111111
+
+ ordch3 = ordch2
+ ordch2 = ordch1
+ ordch1 = ord(code[pos-3])
+ if ordch1 >= 0xC0:
+ # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+ return (((ordch1 & 0x0F) << 12) + # 0b00001111
+ ((ordch2 & 0x3F) << 6) + # 0b00111111
+ (ordch3 & 0x3F)) # 0b00111111
+
+ ordch4 = ordch3
+ ordch3 = ordch2
+ ordch2 = ordch1
+ ordch1 = ord(code[pos-4])
+ if True:
+ # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+ return (((ordch1 & 0x07) << 18) + # 0b00000111
+ ((ordch2 & 0x3F) << 12) + # 0b00111111
+ ((ordch3 & 0x3F) << 6) + # 0b00111111
+ (ordch4 & 0x3F)) # 0b00111111
+ assert False, "unreachable"
+
class CheckError(Exception):
def __init__(self, pos):
self.pos = pos
@@ -312,25 +350,32 @@
UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
'utf8_loc',
- ('index', lltype.Signed),
+ ('baseindex', lltype.Signed),
('ofs', lltype.FixedSizeArray(lltype.Char, 16))
))
-EMPTY_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True)
+ASCII_INDEX_STORAGE_BLOCKS = 5
+ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE,
+ ASCII_INDEX_STORAGE_BLOCKS,
+ immortal=True)
+for _i in range(ASCII_INDEX_STORAGE_BLOCKS):
+ ASCII_INDEX_STORAGE[_i].baseindex = _i * 64
+ for _j in range(16):
+ ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
def create_utf8_index_storage(utf8, utf8len):
""" Create an index storage which stores index of each 4th character
in utf8 encoded unicode string.
"""
- if utf8len == 0:
- return EMPTY_INDEX_STORAGE
+ if len(utf8) == utf8len <= ASCII_INDEX_STORAGE_BLOCKS * 64:
+ return ASCII_INDEX_STORAGE
arraysize = (utf8len + 63) // 64
storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
baseindex = 0
current = 0
- next = 0
while True:
- storage[current].index = baseindex
+ storage[current].baseindex = baseindex
+ next = baseindex
for i in range(16):
next = next_codepoint_pos(utf8, next)
storage[current].ofs[i] = chr(next - baseindex)
@@ -339,7 +384,7 @@
break
next = next_codepoint_pos(utf8, next)
next = next_codepoint_pos(utf8, next)
- next = next_codepoint_pos(utf8, next)
+ next = next_codepoint_pos(utf8, next)
else:
current += 1
baseindex = next
@@ -349,11 +394,13 @@
def codepoint_position_at_index(utf8, storage, index):
""" Return byte index of a character inside utf8 encoded string, given
- storage of type UTF8_INDEX_STORAGE
+ storage of type UTF8_INDEX_STORAGE. The index must be smaller than
+ the utf8 length: if needed, check explicitly before calling this
+ function.
"""
current = index >> 6
- ofs = ord(storage[current].ofs[(index >> 2) & 15])
- bytepos = storage[current].index + ofs
+ ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
+ bytepos = storage[current].baseindex + ofs
index &= 0x3
if index == 0:
return prev_codepoint_pos(utf8, bytepos)
@@ -368,5 +415,15 @@
""" Return codepoint of a character inside utf8 encoded string, given
storage of type UTF8_INDEX_STORAGE
"""
- bytepos = codepoint_position_at_index(utf8, storage, index)
+ current = index >> 6
+ ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
+ bytepos = storage[current].baseindex + ofs
+ index &= 0x3
+ if index == 0:
+ return codepoint_before_pos(utf8, bytepos)
+ if index == 3:
+ bytepos = next_codepoint_pos(utf8, bytepos)
+ index = 2 # fall-through to the next case
+ if index == 2:
+ bytepos = next_codepoint_pos(utf8, bytepos)
return codepoint_at_pos(utf8, bytepos)
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -89,4 +89,12 @@
def test_utf8_index_storage(u):
index = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
for i, item in enumerate(u):
- rutf8.codepoint_at_index(u.encode('utf8'), index, i) == item.encode('utf8')
+ assert (rutf8.codepoint_at_index(u.encode('utf8'), index, i) ==
+ ord(item))
+
+@given(strategies.text())
+def test_codepoint_position_at_index(u):
+ index = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
+ for i in range(len(u)):
+ assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
+ len(u[:i].encode('utf8')))
More information about the pypy-commit
mailing list