Performance of int/long in Python 3

Wed Apr 3 07:05:55 EDT 2013

Dave Angel:

> That would seem to imply that the speed regression on your data is NOT
> caused by the differing size encodings. Perhaps it is the difference in
> MSC compiler version, or other changes made between 3.2 and 3.3

    Its not caused by there actually being different size encodings but 
that the code is checking encoding size 2-4 times for each character.

    Back in 3.2 the comparison loop looked like:

     while (len1 > 0 && len2 > 0) {
         Py_UNICODE c1, c2;

         c1 = *s1++;
         c2 = *s2++;

         if (c1 != c2)
             return (c1 < c2) ? -1 : 1;

         len1--; len2--;
     }

    For 3.3 this has changed to

     for (i = 0; i < len1 && i < len2; ++i) {
         Py_UCS4 c1, c2;
         c1 = PyUnicode_READ(kind1, data1, i);
         c2 = PyUnicode_READ(kind2, data2, i);

         if (c1 != c2)
             return (c1 < c2) ? -1 : 1;
     }

	with PyUnicode_READ being

#define PyUnicode_READ(kind, data, index) \
     ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? \
         ((const Py_UCS1 *)(data))[(index)] : \
         ((kind) == PyUnicode_2BYTE_KIND ? \
             ((const Py_UCS2 *)(data))[(index)] : \
             ((const Py_UCS4 *)(data))[(index)] \
         ) \
     ))

     There are either 1 or 2 kind checks in each call to PyUnicode_READ 
and 2 calls to PyUnicode_READ inside the loop. A compiler may decide to 
move the kind checks out of the loop and specialize the loop but MSVC 
2010 appears to not do so. The assembler (32-bit build) for each 
PyUnicode_READ looks like

	mov	ecx, DWORD PTR _kind1$[ebp]
	cmp	ecx, 1
	jne	SHORT $LN17 at unicode_co@2
	lea	ecx, DWORD PTR [ebx+eax]
	movzx	edx, BYTE PTR [ecx+edx]
	jmp	SHORT $LN16 at unicode_co@2
$LN17 at unicode_co@2:
	cmp	ecx, 2
	jne	SHORT $LN15 at unicode_co@2
	movzx	edx, WORD PTR [ebx+edi]
	jmp	SHORT $LN16 at unicode_co@2
$LN15 at unicode_co@2:
	mov	edx, DWORD PTR [ebx+esi]
$LN16 at unicode_co@2:

    The kind1/kind2 variables aren't even going into registers and at 
least one test+branch and a jump are executed for every character. Two 
tests for 2 and 4 byte kinds. len1 and len2 don't get to go into 
registers either.

    Here's the full assembler output for unicode_compare:

;	COMDAT _unicode_compare
_TEXT	SEGMENT
_kind2$ = -20						; size = 4
_kind1$ = -16						; size = 4
_len2$ = -12						; size = 4
_len1$ = -8						; size = 4
_data2$ = -4						; size = 4
_unicode_compare PROC					; COMDAT
; _str1$ = ecx
; _str2$ = eax

; 10417: {

	push	ebp
	mov	ebp, esp
	sub	esp, 20					; 00000014H
	push	ebx
	push	esi
	mov	esi, eax

; 10418:     int kind1, kind2;
; 10419:     void *data1, *data2;
; 10420:     Py_ssize_t len1, len2, i;
; 10421:
; 10422:     kind1 = PyUnicode_KIND(str1);

	mov	eax, DWORD PTR [ecx+16]
	mov	edx, eax
	shr	edx, 2
	and	edx, 7
	push	edi
	mov	DWORD PTR _kind1$[ebp], edx

; 10423:     kind2 = PyUnicode_KIND(str2);

	mov	edx, DWORD PTR [esi+16]
	mov	edi, edx
	shr	edi, 2
	and	edi, 7
	mov	DWORD PTR _kind2$[ebp], edi

; 10424:     data1 = PyUnicode_DATA(str1);

	test	al, 32					; 00000020H
	je	SHORT $LN9 at unicode_co@2
	test	al, 64					; 00000040H
	je	SHORT $LN7 at unicode_co@2
	lea	ebx, DWORD PTR [ecx+24]
	jmp	SHORT $LN10 at unicode_co@2
$LN7 at unicode_co@2:
	lea	ebx, DWORD PTR [ecx+36]
	jmp	SHORT $LN10 at unicode_co@2
$LN9 at unicode_co@2:
	mov	ebx, DWORD PTR [ecx+36]
$LN10 at unicode_co@2:

; 10425:     data2 = PyUnicode_DATA(str2);

	test	dl, 32					; 00000020H
	je	SHORT $LN13 at unicode_co@2
	test	dl, 64					; 00000040H
	je	SHORT $LN11 at unicode_co@2
	lea	edx, DWORD PTR [esi+24]
	jmp	SHORT $LN30 at unicode_co@2
$LN11 at unicode_co@2:
	lea	eax, DWORD PTR [esi+36]
	mov	DWORD PTR _data2$[ebp], eax
	mov	edx, eax
	jmp	SHORT $LN14 at unicode_co@2
$LN13 at unicode_co@2:
	mov	edx, DWORD PTR [esi+36]
$LN30 at unicode_co@2:
	mov	DWORD PTR _data2$[ebp], edx
$LN14 at unicode_co@2:

; 10426:     len1 = PyUnicode_GET_LENGTH(str1);

	mov	edi, DWORD PTR [ecx+8]

; 10427:     len2 = PyUnicode_GET_LENGTH(str2);

	mov	ecx, DWORD PTR [esi+8]

; 10428:
; 10429:     for (i = 0; i < len1 && i < len2; ++i) {

	xor	eax, eax
	mov	DWORD PTR _len1$[ebp], edi
	mov	DWORD PTR _len2$[ebp], ecx
	test	edi, edi
	jle	SHORT $LN2 at unicode_co@2

; 10426:     len1 = PyUnicode_GET_LENGTH(str1);

	mov	esi, edx
	mov	edi, edx

; 10428:
; 10429:     for (i = 0; i < len1 && i < len2; ++i) {

	sub	ebx, edx
	jmp	SHORT $LN4 at unicode_co@2
$LL28 at unicode_co@2:
	mov	edx, DWORD PTR _data2$[ebp]
$LN4 at unicode_co@2:
	cmp	eax, ecx
	jge	SHORT $LN29 at unicode_co@2

; 10430:         Py_UCS4 c1, c2;
; 10431:         c1 = PyUnicode_READ(kind1, data1, i);

	mov	ecx, DWORD PTR _kind1$[ebp]
	cmp	ecx, 1
	jne	SHORT $LN17 at unicode_co@2
	lea	ecx, DWORD PTR [ebx+eax]
	movzx	edx, BYTE PTR [ecx+edx]
	jmp	SHORT $LN16 at unicode_co@2
$LN17 at unicode_co@2:
	cmp	ecx, 2
	jne	SHORT $LN15 at unicode_co@2
	movzx	edx, WORD PTR [ebx+edi]
	jmp	SHORT $LN16 at unicode_co@2
$LN15 at unicode_co@2:
	mov	edx, DWORD PTR [ebx+esi]
$LN16 at unicode_co@2:

; 10432:         c2 = PyUnicode_READ(kind2, data2, i);

	mov	ecx, DWORD PTR _kind2$[ebp]
	cmp	ecx, 1
	jne	SHORT $LN21 at unicode_co@2
	mov	ecx, DWORD PTR _data2$[ebp]
	movzx	ecx, BYTE PTR [eax+ecx]
	jmp	SHORT $LN20 at unicode_co@2
$LN21 at unicode_co@2:
	cmp	ecx, 2
	jne	SHORT $LN19 at unicode_co@2
	movzx	ecx, WORD PTR [edi]
	jmp	SHORT $LN20 at unicode_co@2
$LN19 at unicode_co@2:
	mov	ecx, DWORD PTR [esi]
$LN20 at unicode_co@2:

; 10433:
; 10434:         if (c1 != c2)

	cmp	edx, ecx
	jne	SHORT $LN31 at unicode_co@2
	mov	ecx, DWORD PTR _len2$[ebp]
	inc	eax
	add	edi, 2
	add	esi, 4
	cmp	eax, DWORD PTR _len1$[ebp]
	jl	SHORT $LL28 at unicode_co@2
$LN29 at unicode_co@2:
	mov	edi, DWORD PTR _len1$[ebp]
$LN2 at unicode_co@2:

; 10436:     }
; 10437:
; 10438:     return (len1 < len2) ? -1 : (len1 != len2);

	cmp	edi, ecx
	jge	SHORT $LN23 at unicode_co@2
	pop	edi
	pop	esi
	or	eax, -1
	pop	ebx

; 10439: }

	mov	esp, ebp
	pop	ebp
	ret	0
$LN31 at unicode_co@2:

; 10435:             return (c1 < c2) ? -1 : 1;

	sbb	eax, eax
	pop	edi
	and	eax, -2					; fffffffeH
	pop	esi
	inc	eax
	pop	ebx

; 10439: }

	mov	esp, ebp
	pop	ebp
	ret	0
$LN23 at unicode_co@2:

; 10436:     }
; 10437:
; 10438:     return (len1 < len2) ? -1 : (len1 != len2);

	xor	eax, eax
	cmp	edi, ecx
	pop	edi
	pop	esi
	setne	al
	pop	ebx

; 10439: }

	mov	esp, ebp
	pop	ebp
	ret	0
_unicode_compare ENDP

    Neil