Performance of int/long in Python 3
Neil Hodgson
nhodgson at iinet.net.au
Wed Apr 3 07:05:55 EDT 2013
Dave Angel:
> That would seem to imply that the speed regression on your data is NOT
> caused by the differing size encodings. Perhaps it is the difference in
> MSC compiler version, or other changes made between 3.2 and 3.3
Its not caused by there actually being different size encodings but
that the code is checking encoding size 2-4 times for each character.
Back in 3.2 the comparison loop looked like:
while (len1 > 0 && len2 > 0) {
Py_UNICODE c1, c2;
c1 = *s1++;
c2 = *s2++;
if (c1 != c2)
return (c1 < c2) ? -1 : 1;
len1--; len2--;
}
For 3.3 this has changed to
for (i = 0; i < len1 && i < len2; ++i) {
Py_UCS4 c1, c2;
c1 = PyUnicode_READ(kind1, data1, i);
c2 = PyUnicode_READ(kind2, data2, i);
if (c1 != c2)
return (c1 < c2) ? -1 : 1;
}
with PyUnicode_READ being
#define PyUnicode_READ(kind, data, index) \
((Py_UCS4) \
((kind) == PyUnicode_1BYTE_KIND ? \
((const Py_UCS1 *)(data))[(index)] : \
((kind) == PyUnicode_2BYTE_KIND ? \
((const Py_UCS2 *)(data))[(index)] : \
((const Py_UCS4 *)(data))[(index)] \
) \
))
There are either 1 or 2 kind checks in each call to PyUnicode_READ
and 2 calls to PyUnicode_READ inside the loop. A compiler may decide to
move the kind checks out of the loop and specialize the loop but MSVC
2010 appears to not do so. The assembler (32-bit build) for each
PyUnicode_READ looks like
mov ecx, DWORD PTR _kind1$[ebp]
cmp ecx, 1
jne SHORT $LN17 at unicode_co@2
lea ecx, DWORD PTR [ebx+eax]
movzx edx, BYTE PTR [ecx+edx]
jmp SHORT $LN16 at unicode_co@2
$LN17 at unicode_co@2:
cmp ecx, 2
jne SHORT $LN15 at unicode_co@2
movzx edx, WORD PTR [ebx+edi]
jmp SHORT $LN16 at unicode_co@2
$LN15 at unicode_co@2:
mov edx, DWORD PTR [ebx+esi]
$LN16 at unicode_co@2:
The kind1/kind2 variables aren't even going into registers and at
least one test+branch and a jump are executed for every character. Two
tests for 2 and 4 byte kinds. len1 and len2 don't get to go into
registers either.
Here's the full assembler output for unicode_compare:
; COMDAT _unicode_compare
_TEXT SEGMENT
_kind2$ = -20 ; size = 4
_kind1$ = -16 ; size = 4
_len2$ = -12 ; size = 4
_len1$ = -8 ; size = 4
_data2$ = -4 ; size = 4
_unicode_compare PROC ; COMDAT
; _str1$ = ecx
; _str2$ = eax
; 10417: {
push ebp
mov ebp, esp
sub esp, 20 ; 00000014H
push ebx
push esi
mov esi, eax
; 10418: int kind1, kind2;
; 10419: void *data1, *data2;
; 10420: Py_ssize_t len1, len2, i;
; 10421:
; 10422: kind1 = PyUnicode_KIND(str1);
mov eax, DWORD PTR [ecx+16]
mov edx, eax
shr edx, 2
and edx, 7
push edi
mov DWORD PTR _kind1$[ebp], edx
; 10423: kind2 = PyUnicode_KIND(str2);
mov edx, DWORD PTR [esi+16]
mov edi, edx
shr edi, 2
and edi, 7
mov DWORD PTR _kind2$[ebp], edi
; 10424: data1 = PyUnicode_DATA(str1);
test al, 32 ; 00000020H
je SHORT $LN9 at unicode_co@2
test al, 64 ; 00000040H
je SHORT $LN7 at unicode_co@2
lea ebx, DWORD PTR [ecx+24]
jmp SHORT $LN10 at unicode_co@2
$LN7 at unicode_co@2:
lea ebx, DWORD PTR [ecx+36]
jmp SHORT $LN10 at unicode_co@2
$LN9 at unicode_co@2:
mov ebx, DWORD PTR [ecx+36]
$LN10 at unicode_co@2:
; 10425: data2 = PyUnicode_DATA(str2);
test dl, 32 ; 00000020H
je SHORT $LN13 at unicode_co@2
test dl, 64 ; 00000040H
je SHORT $LN11 at unicode_co@2
lea edx, DWORD PTR [esi+24]
jmp SHORT $LN30 at unicode_co@2
$LN11 at unicode_co@2:
lea eax, DWORD PTR [esi+36]
mov DWORD PTR _data2$[ebp], eax
mov edx, eax
jmp SHORT $LN14 at unicode_co@2
$LN13 at unicode_co@2:
mov edx, DWORD PTR [esi+36]
$LN30 at unicode_co@2:
mov DWORD PTR _data2$[ebp], edx
$LN14 at unicode_co@2:
; 10426: len1 = PyUnicode_GET_LENGTH(str1);
mov edi, DWORD PTR [ecx+8]
; 10427: len2 = PyUnicode_GET_LENGTH(str2);
mov ecx, DWORD PTR [esi+8]
; 10428:
; 10429: for (i = 0; i < len1 && i < len2; ++i) {
xor eax, eax
mov DWORD PTR _len1$[ebp], edi
mov DWORD PTR _len2$[ebp], ecx
test edi, edi
jle SHORT $LN2 at unicode_co@2
; 10426: len1 = PyUnicode_GET_LENGTH(str1);
mov esi, edx
mov edi, edx
; 10428:
; 10429: for (i = 0; i < len1 && i < len2; ++i) {
sub ebx, edx
jmp SHORT $LN4 at unicode_co@2
$LL28 at unicode_co@2:
mov edx, DWORD PTR _data2$[ebp]
$LN4 at unicode_co@2:
cmp eax, ecx
jge SHORT $LN29 at unicode_co@2
; 10430: Py_UCS4 c1, c2;
; 10431: c1 = PyUnicode_READ(kind1, data1, i);
mov ecx, DWORD PTR _kind1$[ebp]
cmp ecx, 1
jne SHORT $LN17 at unicode_co@2
lea ecx, DWORD PTR [ebx+eax]
movzx edx, BYTE PTR [ecx+edx]
jmp SHORT $LN16 at unicode_co@2
$LN17 at unicode_co@2:
cmp ecx, 2
jne SHORT $LN15 at unicode_co@2
movzx edx, WORD PTR [ebx+edi]
jmp SHORT $LN16 at unicode_co@2
$LN15 at unicode_co@2:
mov edx, DWORD PTR [ebx+esi]
$LN16 at unicode_co@2:
; 10432: c2 = PyUnicode_READ(kind2, data2, i);
mov ecx, DWORD PTR _kind2$[ebp]
cmp ecx, 1
jne SHORT $LN21 at unicode_co@2
mov ecx, DWORD PTR _data2$[ebp]
movzx ecx, BYTE PTR [eax+ecx]
jmp SHORT $LN20 at unicode_co@2
$LN21 at unicode_co@2:
cmp ecx, 2
jne SHORT $LN19 at unicode_co@2
movzx ecx, WORD PTR [edi]
jmp SHORT $LN20 at unicode_co@2
$LN19 at unicode_co@2:
mov ecx, DWORD PTR [esi]
$LN20 at unicode_co@2:
; 10433:
; 10434: if (c1 != c2)
cmp edx, ecx
jne SHORT $LN31 at unicode_co@2
mov ecx, DWORD PTR _len2$[ebp]
inc eax
add edi, 2
add esi, 4
cmp eax, DWORD PTR _len1$[ebp]
jl SHORT $LL28 at unicode_co@2
$LN29 at unicode_co@2:
mov edi, DWORD PTR _len1$[ebp]
$LN2 at unicode_co@2:
; 10436: }
; 10437:
; 10438: return (len1 < len2) ? -1 : (len1 != len2);
cmp edi, ecx
jge SHORT $LN23 at unicode_co@2
pop edi
pop esi
or eax, -1
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
$LN31 at unicode_co@2:
; 10435: return (c1 < c2) ? -1 : 1;
sbb eax, eax
pop edi
and eax, -2 ; fffffffeH
pop esi
inc eax
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
$LN23 at unicode_co@2:
; 10436: }
; 10437:
; 10438: return (len1 < len2) ? -1 : (len1 != len2);
xor eax, eax
cmp edi, ecx
pop edi
pop esi
setne al
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
_unicode_compare ENDP
Neil
More information about the Python-list
mailing list