Performance of int/long in Python 3
Dave Angel
davea at davea.name
Wed Apr 3 07:52:42 EDT 2013
On 04/03/2013 07:05 AM, Neil Hodgson wrote:
> Dave Angel:
>
>> That would seem to imply that the speed regression on your data is NOT
>> caused by the differing size encodings. Perhaps it is the difference in
>> MSC compiler version, or other changes made between 3.2 and 3.3
>
> Its not caused by there actually being different size encodings but
> that the code is checking encoding size 2-4 times for each character.
>
> Back in 3.2 the comparison loop looked like:
>
> while (len1 > 0 && len2 > 0) {
> Py_UNICODE c1, c2;
>
> c1 = *s1++;
> c2 = *s2++;
>
> if (c1 != c2)
> return (c1 < c2) ? -1 : 1;
>
> len1--; len2--;
> }
>
> For 3.3 this has changed to
>
> for (i = 0; i < len1 && i < len2; ++i) {
> Py_UCS4 c1, c2;
> c1 = PyUnicode_READ(kind1, data1, i);
> c2 = PyUnicode_READ(kind2, data2, i);
>
> if (c1 != c2)
> return (c1 < c2) ? -1 : 1;
> }
>
> with PyUnicode_READ being
>
> #define PyUnicode_READ(kind, data, index) \
> ((Py_UCS4) \
> ((kind) == PyUnicode_1BYTE_KIND ? \
> ((const Py_UCS1 *)(data))[(index)] : \
> ((kind) == PyUnicode_2BYTE_KIND ? \
> ((const Py_UCS2 *)(data))[(index)] : \
> ((const Py_UCS4 *)(data))[(index)] \
> ) \
> ))
>
> There are either 1 or 2 kind checks in each call to PyUnicode_READ
> and 2 calls to PyUnicode_READ inside the loop. A compiler may decide to
> move the kind checks out of the loop and specialize the loop but MSVC
> 2010 appears to not do so.
I don't know how good MSC's template logic is, but it seems this would
be a good case for an explicit template, typed on the 'kind's values.
Or are all C++ features disabled when compiling Python? Failing that,
just code up 9 cases, and do a switch on the kinds.
I'm also puzzled. I thought that the sort algorithm used a hash of all
the items to be sorted, and only reverted to a raw comparison of the
original values when the hash collided. Is that not the case? Or is
the code you post here only used when the hash collides?
The assembler (32-bit build) for each
> PyUnicode_READ looks like
>
> mov ecx, DWORD PTR _kind1$[ebp]
> cmp ecx, 1
> jne SHORT $LN17 at unicode_co@2
> lea ecx, DWORD PTR [ebx+eax]
> movzx edx, BYTE PTR [ecx+edx]
> jmp SHORT $LN16 at unicode_co@2
> $LN17 at unicode_co@2:
> cmp ecx, 2
> jne SHORT $LN15 at unicode_co@2
> movzx edx, WORD PTR [ebx+edi]
> jmp SHORT $LN16 at unicode_co@2
> $LN15 at unicode_co@2:
> mov edx, DWORD PTR [ebx+esi]
> $LN16 at unicode_co@2:
It appears that the compiler is keeping the three pointers in three
separate registers (eax, esi and edi) even though those are 3 aliases
for the same pointer. This is preventing it from putting other values
in those registers.
It'd probably do better if the C code manipulated the pointers, rather
than using an index i each time. But if it did, perhaps gcc would
generate worse code.
If I were coding the assembler by hand (Intel only), I'd be able to
avoid the multiple cmp operations, simply by comparing first to 2, then
doing a jne and a ja. I dunno whether the compiler would notice if I
coded the equivalent in C. (make both comparisons to 2, one for less,
and one for more)
>
> The kind1/kind2 variables aren't even going into registers and at
> least one test+branch and a jump are executed for every character. Two
> tests for 2 and 4 byte kinds. len1 and len2 don't get to go into
> registers either.
>
> Here's the full assembler output for unicode_compare:
>
> ; COMDAT _unicode_compare
> _TEXT SEGMENT
> _kind2$ = -20 ; size = 4
> _kind1$ = -16 ; size = 4
> _len2$ = -12 ; size = 4
> _len1$ = -8 ; size = 4
> _data2$ = -4 ; size = 4
> _unicode_compare PROC ; COMDAT
> ; _str1$ = ecx
> ; _str2$ = eax
>
> ; 10417: {
>
> push ebp
> mov ebp, esp
> sub esp, 20 ; 00000014H
> push ebx
> push esi
> mov esi, eax
>
> ; 10418: int kind1, kind2;
> ; 10419: void *data1, *data2;
> ; 10420: Py_ssize_t len1, len2, i;
> ; 10421:
> ; 10422: kind1 = PyUnicode_KIND(str1);
>
> mov eax, DWORD PTR [ecx+16]
> mov edx, eax
> shr edx, 2
> and edx, 7
> push edi
> mov DWORD PTR _kind1$[ebp], edx
>
> ; 10423: kind2 = PyUnicode_KIND(str2);
>
> mov edx, DWORD PTR [esi+16]
> mov edi, edx
> shr edi, 2
> and edi, 7
> mov DWORD PTR _kind2$[ebp], edi
>
> ; 10424: data1 = PyUnicode_DATA(str1);
>
> test al, 32 ; 00000020H
> je SHORT $LN9 at unicode_co@2
> test al, 64 ; 00000040H
> je SHORT $LN7 at unicode_co@2
> lea ebx, DWORD PTR [ecx+24]
> jmp SHORT $LN10 at unicode_co@2
> $LN7 at unicode_co@2:
> lea ebx, DWORD PTR [ecx+36]
> jmp SHORT $LN10 at unicode_co@2
> $LN9 at unicode_co@2:
> mov ebx, DWORD PTR [ecx+36]
> $LN10 at unicode_co@2:
>
> ; 10425: data2 = PyUnicode_DATA(str2);
>
> test dl, 32 ; 00000020H
> je SHORT $LN13 at unicode_co@2
> test dl, 64 ; 00000040H
> je SHORT $LN11 at unicode_co@2
> lea edx, DWORD PTR [esi+24]
> jmp SHORT $LN30 at unicode_co@2
> $LN11 at unicode_co@2:
> lea eax, DWORD PTR [esi+36]
> mov DWORD PTR _data2$[ebp], eax
> mov edx, eax
> jmp SHORT $LN14 at unicode_co@2
> $LN13 at unicode_co@2:
> mov edx, DWORD PTR [esi+36]
> $LN30 at unicode_co@2:
> mov DWORD PTR _data2$[ebp], edx
> $LN14 at unicode_co@2:
>
> ; 10426: len1 = PyUnicode_GET_LENGTH(str1);
>
> mov edi, DWORD PTR [ecx+8]
>
> ; 10427: len2 = PyUnicode_GET_LENGTH(str2);
>
> mov ecx, DWORD PTR [esi+8]
>
> ; 10428:
> ; 10429: for (i = 0; i < len1 && i < len2; ++i) {
>
> xor eax, eax
> mov DWORD PTR _len1$[ebp], edi
> mov DWORD PTR _len2$[ebp], ecx
> test edi, edi
> jle SHORT $LN2 at unicode_co@2
>
> ; 10426: len1 = PyUnicode_GET_LENGTH(str1);
>
> mov esi, edx
> mov edi, edx
>
> ; 10428:
> ; 10429: for (i = 0; i < len1 && i < len2; ++i) {
>
> sub ebx, edx
> jmp SHORT $LN4 at unicode_co@2
> $LL28 at unicode_co@2:
> mov edx, DWORD PTR _data2$[ebp]
> $LN4 at unicode_co@2:
> cmp eax, ecx
> jge SHORT $LN29 at unicode_co@2
>
> ; 10430: Py_UCS4 c1, c2;
> ; 10431: c1 = PyUnicode_READ(kind1, data1, i);
>
> mov ecx, DWORD PTR _kind1$[ebp]
> cmp ecx, 1
> jne SHORT $LN17 at unicode_co@2
> lea ecx, DWORD PTR [ebx+eax]
> movzx edx, BYTE PTR [ecx+edx]
> jmp SHORT $LN16 at unicode_co@2
> $LN17 at unicode_co@2:
> cmp ecx, 2
> jne SHORT $LN15 at unicode_co@2
> movzx edx, WORD PTR [ebx+edi]
> jmp SHORT $LN16 at unicode_co@2
> $LN15 at unicode_co@2:
> mov edx, DWORD PTR [ebx+esi]
> $LN16 at unicode_co@2:
>
> ; 10432: c2 = PyUnicode_READ(kind2, data2, i);
>
> mov ecx, DWORD PTR _kind2$[ebp]
> cmp ecx, 1
> jne SHORT $LN21 at unicode_co@2
> mov ecx, DWORD PTR _data2$[ebp]
> movzx ecx, BYTE PTR [eax+ecx]
> jmp SHORT $LN20 at unicode_co@2
> $LN21 at unicode_co@2:
> cmp ecx, 2
> jne SHORT $LN19 at unicode_co@2
> movzx ecx, WORD PTR [edi]
> jmp SHORT $LN20 at unicode_co@2
> $LN19 at unicode_co@2:
> mov ecx, DWORD PTR [esi]
> $LN20 at unicode_co@2:
>
> ; 10433:
> ; 10434: if (c1 != c2)
>
> cmp edx, ecx
> jne SHORT $LN31 at unicode_co@2
> mov ecx, DWORD PTR _len2$[ebp]
> inc eax
> add edi, 2
> add esi, 4
> cmp eax, DWORD PTR _len1$[ebp]
> jl SHORT $LL28 at unicode_co@2
> $LN29 at unicode_co@2:
> mov edi, DWORD PTR _len1$[ebp]
> $LN2 at unicode_co@2:
>
> ; 10436: }
> ; 10437:
> ; 10438: return (len1 < len2) ? -1 : (len1 != len2);
>
> cmp edi, ecx
> jge SHORT $LN23 at unicode_co@2
> pop edi
> pop esi
> or eax, -1
> pop ebx
>
> ; 10439: }
>
> mov esp, ebp
> pop ebp
> ret 0
> $LN31 at unicode_co@2:
>
> ; 10435: return (c1 < c2) ? -1 : 1;
>
> sbb eax, eax
> pop edi
> and eax, -2 ; fffffffeH
> pop esi
> inc eax
> pop ebx
>
> ; 10439: }
>
> mov esp, ebp
> pop ebp
> ret 0
> $LN23 at unicode_co@2:
>
> ; 10436: }
> ; 10437:
> ; 10438: return (len1 < len2) ? -1 : (len1 != len2);
>
> xor eax, eax
> cmp edi, ecx
> pop edi
> pop esi
> setne al
> pop ebx
>
> ; 10439: }
>
> mov esp, ebp
> pop ebp
> ret 0
> _unicode_compare ENDP
>
> Neil
--
DaveA
More information about the Python-list
mailing list