Performance of int/long in Python 3

Wed Apr 3 07:52:42 EDT 2013

On 04/03/2013 07:05 AM, Neil Hodgson wrote:
> Dave Angel:
>
>> That would seem to imply that the speed regression on your data is NOT
>> caused by the differing size encodings. Perhaps it is the difference in
>> MSC compiler version, or other changes made between 3.2 and 3.3
>
>     Its not caused by there actually being different size encodings but
> that the code is checking encoding size 2-4 times for each character.
>
>     Back in 3.2 the comparison loop looked like:
>
>      while (len1 > 0 && len2 > 0) {
>          Py_UNICODE c1, c2;
>
>          c1 = *s1++;
>          c2 = *s2++;
>
>          if (c1 != c2)
>              return (c1 < c2) ? -1 : 1;
>
>          len1--; len2--;
>      }
>
>     For 3.3 this has changed to
>
>      for (i = 0; i < len1 && i < len2; ++i) {
>          Py_UCS4 c1, c2;
>          c1 = PyUnicode_READ(kind1, data1, i);
>          c2 = PyUnicode_READ(kind2, data2, i);
>
>          if (c1 != c2)
>              return (c1 < c2) ? -1 : 1;
>      }
>
>      with PyUnicode_READ being
>
> #define PyUnicode_READ(kind, data, index) \
>      ((Py_UCS4) \
>      ((kind) == PyUnicode_1BYTE_KIND ? \
>          ((const Py_UCS1 *)(data))[(index)] : \
>          ((kind) == PyUnicode_2BYTE_KIND ? \
>              ((const Py_UCS2 *)(data))[(index)] : \
>              ((const Py_UCS4 *)(data))[(index)] \
>          ) \
>      ))
>
>      There are either 1 or 2 kind checks in each call to PyUnicode_READ
> and 2 calls to PyUnicode_READ inside the loop. A compiler may decide to
> move the kind checks out of the loop and specialize the loop but MSVC
> 2010 appears to not do so.

I don't know how good MSC's template logic is, but it seems this would 
be a good case for an explicit template, typed on the 'kind's values. 
Or are all C++ features disabled when compiling Python?  Failing that, 
just code up 9 cases, and do a switch on the kinds.

I'm also puzzled.  I thought that the sort algorithm used a hash of all 
the items to be sorted, and only reverted to a raw comparison of the 
original values when the hash collided.  Is that not the case?  Or is 
the code you post here only used when the hash collides?

  The assembler (32-bit build) for each
> PyUnicode_READ looks like
>
>      mov    ecx, DWORD PTR _kind1$[ebp]
>      cmp    ecx, 1
>      jne    SHORT $LN17 at unicode_co@2
>      lea    ecx, DWORD PTR [ebx+eax]
>      movzx    edx, BYTE PTR [ecx+edx]
>      jmp    SHORT $LN16 at unicode_co@2
> $LN17 at unicode_co@2:
>      cmp    ecx, 2
>      jne    SHORT $LN15 at unicode_co@2
>      movzx    edx, WORD PTR [ebx+edi]
>      jmp    SHORT $LN16 at unicode_co@2
> $LN15 at unicode_co@2:
>      mov    edx, DWORD PTR [ebx+esi]
> $LN16 at unicode_co@2:

It appears that the compiler is keeping the three pointers in three 
separate registers (eax, esi and edi) even though those are 3 aliases 
for the same pointer.   This is preventing it from putting other values 
in those registers.

It'd probably do better if the C code manipulated the pointers, rather 
than using an index i each time.  But if it did, perhaps gcc would 
generate worse code.

If I were coding the assembler by hand (Intel only), I'd be able to 
avoid the multiple cmp operations, simply by comparing first to 2, then 
doing a jne and a ja.  I dunno whether the compiler would notice if I 
coded the equivalent in C.  (make both comparisons to 2, one for less, 
and one for more)

>
>     The kind1/kind2 variables aren't even going into registers and at
> least one test+branch and a jump are executed for every character. Two
> tests for 2 and 4 byte kinds. len1 and len2 don't get to go into
> registers either.
>
>     Here's the full assembler output for unicode_compare:
>
> ;    COMDAT _unicode_compare
> _TEXT    SEGMENT
> _kind2$ = -20                        ; size = 4
> _kind1$ = -16                        ; size = 4
> _len2$ = -12                        ; size = 4
> _len1$ = -8                        ; size = 4
> _data2$ = -4                        ; size = 4
> _unicode_compare PROC                    ; COMDAT
> ; _str1$ = ecx
> ; _str2$ = eax
>
> ; 10417: {
>
>      push    ebp
>      mov    ebp, esp
>      sub    esp, 20                    ; 00000014H
>      push    ebx
>      push    esi
>      mov    esi, eax
>
> ; 10418:     int kind1, kind2;
> ; 10419:     void *data1, *data2;
> ; 10420:     Py_ssize_t len1, len2, i;
> ; 10421:
> ; 10422:     kind1 = PyUnicode_KIND(str1);
>
>      mov    eax, DWORD PTR [ecx+16]
>      mov    edx, eax
>      shr    edx, 2
>      and    edx, 7
>      push    edi
>      mov    DWORD PTR _kind1$[ebp], edx
>
> ; 10423:     kind2 = PyUnicode_KIND(str2);
>
>      mov    edx, DWORD PTR [esi+16]
>      mov    edi, edx
>      shr    edi, 2
>      and    edi, 7
>      mov    DWORD PTR _kind2$[ebp], edi
>
> ; 10424:     data1 = PyUnicode_DATA(str1);
>
>      test    al, 32                    ; 00000020H
>      je    SHORT $LN9 at unicode_co@2
>      test    al, 64                    ; 00000040H
>      je    SHORT $LN7 at unicode_co@2
>      lea    ebx, DWORD PTR [ecx+24]
>      jmp    SHORT $LN10 at unicode_co@2
> $LN7 at unicode_co@2:
>      lea    ebx, DWORD PTR [ecx+36]
>      jmp    SHORT $LN10 at unicode_co@2
> $LN9 at unicode_co@2:
>      mov    ebx, DWORD PTR [ecx+36]
> $LN10 at unicode_co@2:
>
> ; 10425:     data2 = PyUnicode_DATA(str2);
>
>      test    dl, 32                    ; 00000020H
>      je    SHORT $LN13 at unicode_co@2
>      test    dl, 64                    ; 00000040H
>      je    SHORT $LN11 at unicode_co@2
>      lea    edx, DWORD PTR [esi+24]
>      jmp    SHORT $LN30 at unicode_co@2
> $LN11 at unicode_co@2:
>      lea    eax, DWORD PTR [esi+36]
>      mov    DWORD PTR _data2$[ebp], eax
>      mov    edx, eax
>      jmp    SHORT $LN14 at unicode_co@2
> $LN13 at unicode_co@2:
>      mov    edx, DWORD PTR [esi+36]
> $LN30 at unicode_co@2:
>      mov    DWORD PTR _data2$[ebp], edx
> $LN14 at unicode_co@2:
>
> ; 10426:     len1 = PyUnicode_GET_LENGTH(str1);
>
>      mov    edi, DWORD PTR [ecx+8]
>
> ; 10427:     len2 = PyUnicode_GET_LENGTH(str2);
>
>      mov    ecx, DWORD PTR [esi+8]
>
> ; 10428:
> ; 10429:     for (i = 0; i < len1 && i < len2; ++i) {
>
>      xor    eax, eax
>      mov    DWORD PTR _len1$[ebp], edi
>      mov    DWORD PTR _len2$[ebp], ecx
>      test    edi, edi
>      jle    SHORT $LN2 at unicode_co@2
>
> ; 10426:     len1 = PyUnicode_GET_LENGTH(str1);
>
>      mov    esi, edx
>      mov    edi, edx
>
> ; 10428:
> ; 10429:     for (i = 0; i < len1 && i < len2; ++i) {
>
>      sub    ebx, edx
>      jmp    SHORT $LN4 at unicode_co@2
> $LL28 at unicode_co@2:
>      mov    edx, DWORD PTR _data2$[ebp]
> $LN4 at unicode_co@2:
>      cmp    eax, ecx
>      jge    SHORT $LN29 at unicode_co@2
>
> ; 10430:         Py_UCS4 c1, c2;
> ; 10431:         c1 = PyUnicode_READ(kind1, data1, i);
>
>      mov    ecx, DWORD PTR _kind1$[ebp]
>      cmp    ecx, 1
>      jne    SHORT $LN17 at unicode_co@2
>      lea    ecx, DWORD PTR [ebx+eax]
>      movzx    edx, BYTE PTR [ecx+edx]
>      jmp    SHORT $LN16 at unicode_co@2
> $LN17 at unicode_co@2:
>      cmp    ecx, 2
>      jne    SHORT $LN15 at unicode_co@2
>      movzx    edx, WORD PTR [ebx+edi]
>      jmp    SHORT $LN16 at unicode_co@2
> $LN15 at unicode_co@2:
>      mov    edx, DWORD PTR [ebx+esi]
> $LN16 at unicode_co@2:
>
> ; 10432:         c2 = PyUnicode_READ(kind2, data2, i);
>
>      mov    ecx, DWORD PTR _kind2$[ebp]
>      cmp    ecx, 1
>      jne    SHORT $LN21 at unicode_co@2
>      mov    ecx, DWORD PTR _data2$[ebp]
>      movzx    ecx, BYTE PTR [eax+ecx]
>      jmp    SHORT $LN20 at unicode_co@2
> $LN21 at unicode_co@2:
>      cmp    ecx, 2
>      jne    SHORT $LN19 at unicode_co@2
>      movzx    ecx, WORD PTR [edi]
>      jmp    SHORT $LN20 at unicode_co@2
> $LN19 at unicode_co@2:
>      mov    ecx, DWORD PTR [esi]
> $LN20 at unicode_co@2:
>
> ; 10433:
> ; 10434:         if (c1 != c2)
>
>      cmp    edx, ecx
>      jne    SHORT $LN31 at unicode_co@2
>      mov    ecx, DWORD PTR _len2$[ebp]
>      inc    eax
>      add    edi, 2
>      add    esi, 4
>      cmp    eax, DWORD PTR _len1$[ebp]
>      jl    SHORT $LL28 at unicode_co@2
> $LN29 at unicode_co@2:
>      mov    edi, DWORD PTR _len1$[ebp]
> $LN2 at unicode_co@2:
>
> ; 10436:     }
> ; 10437:
> ; 10438:     return (len1 < len2) ? -1 : (len1 != len2);
>
>      cmp    edi, ecx
>      jge    SHORT $LN23 at unicode_co@2
>      pop    edi
>      pop    esi
>      or    eax, -1
>      pop    ebx
>
> ; 10439: }
>
>      mov    esp, ebp
>      pop    ebp
>      ret    0
> $LN31 at unicode_co@2:
>
> ; 10435:             return (c1 < c2) ? -1 : 1;
>
>      sbb    eax, eax
>      pop    edi
>      and    eax, -2                    ; fffffffeH
>      pop    esi
>      inc    eax
>      pop    ebx
>
> ; 10439: }
>
>      mov    esp, ebp
>      pop    ebp
>      ret    0
> $LN23 at unicode_co@2:
>
> ; 10436:     }
> ; 10437:
> ; 10438:     return (len1 < len2) ? -1 : (len1 != len2);
>
>      xor    eax, eax
>      cmp    edi, ecx
>      pop    edi
>      pop    esi
>      setne    al
>      pop    ebx
>
> ; 10439: }
>
>      mov    esp, ebp
>      pop    ebp
>      ret    0
> _unicode_compare ENDP
>
>     Neil

-- 
DaveA