[Python-checkins] bpo-38530: Refactor and improve AttributeError suggestions (GH-25776)

pablogsal webhook-mailer at python.org
Mon May 3 11:47:36 EDT 2021


https://github.com/python/cpython/commit/80a2a4ed7d090fff2584302f07315d567109bca9
commit: 80a2a4ed7d090fff2584302f07315d567109bca9
branch: master
author: Dennis Sweeney <36520290+sweeneyde at users.noreply.github.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2021-05-03T16:47:27+01:00
summary:

bpo-38530: Refactor and improve AttributeError suggestions (GH-25776)

- Make case-swaps half the cost of any other edit
- Refactor Levenshtein code to not use memory allocator, and to bail early on no match.
- Add comments to Levenshtein distance code
- Add test cases for Levenshtein distance behind a debug macro
- Set threshold to `(name_size + item_size + 3) * MOVE_COST / 6`.
  - Reasoning: similar to `difflib.SequenceMatcher.ratio()` >= 2/3:
```
"Multiset Jaccard similarity" >= 2/3
matching letters / total letters >= 2/3
(name_size - distance + item_size - distance) / (name_size + item_size) >= 2/3
1 - (2*distance) / (name_size + item_size) >= 2/3
1/3 >= (2*distance) / (name_size + item_size)
(name_size + item_size) / 6 >= distance
With rounding:
(name_size + item_size + 3) // 6 >= distance
```

Co-authored-by: Pablo Galindo <pablogsal at gmail.com>

files:
M Include/internal/pycore_pyerrors.h
M Lib/test/test_exceptions.py
M Modules/_testinternalcapi.c
M Python/suggestions.c

diff --git a/Include/internal/pycore_pyerrors.h b/Include/internal/pycore_pyerrors.h
index d1af8e91b3b90..a5e97fe23fb05 100644
--- a/Include/internal/pycore_pyerrors.h
+++ b/Include/internal/pycore_pyerrors.h
@@ -87,6 +87,8 @@ PyAPI_FUNC(int) _PyErr_CheckSignalsTstate(PyThreadState *tstate);
 PyAPI_FUNC(void) _Py_DumpExtensionModules(int fd, PyInterpreterState *interp);
 
 extern PyObject* _Py_Offer_Suggestions(PyObject* exception);
+PyAPI_FUNC(Py_ssize_t) _Py_UTF8_Edit_Cost(PyObject *str_a, PyObject *str_b,
+                                          Py_ssize_t max_cost);
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 3810108e35663..bc0404ea4b04d 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -1486,13 +1486,13 @@ def func():
 
     def test_name_error_suggestions_from_builtins(self):
         def func():
-            print(AttributeErrop)
+            print(ZeroDivisionErrrrr)
         try:
             func()
         except NameError as exc:
             with support.captured_stderr() as err:
                 sys.__excepthook__(*sys.exc_info())
-        self.assertIn("'AttributeError'?", err.getvalue())
+        self.assertIn("'ZeroDivisionError'?", err.getvalue())
 
     def test_name_error_suggestions_do_not_trigger_for_long_names(self):
         def f():
@@ -1565,21 +1565,87 @@ def test_name_error_bad_suggestions_do_not_trigger_for_small_names(self):
     def test_name_error_suggestions_do_not_trigger_for_too_many_locals(self):
         def f():
             # Mutating locals() is unreliable, so we need to do it by hand
-            a1 = a2 = a3 = a4 = a5 = a6 = a7 = a8 = a9 = a10 = a11 = a12 = a13 = \
-            a14 = a15 = a16 = a17 = a18 = a19 = a20 = a21 = a22 = a23 = a24 = a25 = \
-            a26 = a27 = a28 = a29 = a30 = a31 = a32 = a33 = a34 = a35 = a36 = a37 = \
-            a38 = a39 = a40 = a41 = a42 = a43 = a44 = a45 = a46 = a47 = a48 = a49 = \
-            a50 = a51 = a52 = a53 = a54 = a55 = a56 = a57 = a58 = a59 = a60 = a61 = \
-            a62 = a63 = a64 = a65 = a66 = a67 = a68 = a69 = a70 = a71 = a72 = a73 = \
-            a74 = a75 = a76 = a77 = a78 = a79 = a80 = a81 = a82 = a83 = a84 = a85 = \
-            a86 = a87 = a88 = a89 = a90 = a91 = a92 = a93 = a94 = a95 = a96 = a97 = \
-            a98 = a99 = a100 = a101 = a102 = a103 = a104 = a105 = a106 = a107 = \
-            a108 = a109 = a110 = a111 = a112 = a113 = a114 = a115 = a116 = a117 = \
-            a118 = a119 = a120 = a121 = a122 = a123 = a124 = a125 = a126 = \
-            a127 = a128 = a129 = a130 = a131 = a132 = a133 = a134 = a135 = a136 = \
-            a137 = a138 = a139 = a140 = a141 = a142 = a143 = a144 = a145 = \
-            a146 = a147 = a148 = a149 = a150 = a151 = a152 = a153 = a154 = a155 = \
-            a156 = a157 = a158 = a159 = a160 = a161 = None
+            a1 = a2 = a3 = a4 = a5 = a6 = a7 = a8 = a9 = a10 = \
+            a11 = a12 = a13 = a14 = a15 = a16 = a17 = a18 = a19 = a20 = \
+            a21 = a22 = a23 = a24 = a25 = a26 = a27 = a28 = a29 = a30 = \
+            a31 = a32 = a33 = a34 = a35 = a36 = a37 = a38 = a39 = a40 = \
+            a41 = a42 = a43 = a44 = a45 = a46 = a47 = a48 = a49 = a50 = \
+            a51 = a52 = a53 = a54 = a55 = a56 = a57 = a58 = a59 = a60 = \
+            a61 = a62 = a63 = a64 = a65 = a66 = a67 = a68 = a69 = a70 = \
+            a71 = a72 = a73 = a74 = a75 = a76 = a77 = a78 = a79 = a80 = \
+            a81 = a82 = a83 = a84 = a85 = a86 = a87 = a88 = a89 = a90 = \
+            a91 = a92 = a93 = a94 = a95 = a96 = a97 = a98 = a99 = a100 = \
+            a101 = a102 = a103 = a104 = a105 = a106 = a107 = a108 = a109 = a110 = \
+            a111 = a112 = a113 = a114 = a115 = a116 = a117 = a118 = a119 = a120 = \
+            a121 = a122 = a123 = a124 = a125 = a126 = a127 = a128 = a129 = a130 = \
+            a131 = a132 = a133 = a134 = a135 = a136 = a137 = a138 = a139 = a140 = \
+            a141 = a142 = a143 = a144 = a145 = a146 = a147 = a148 = a149 = a150 = \
+            a151 = a152 = a153 = a154 = a155 = a156 = a157 = a158 = a159 = a160 = \
+            a161 = a162 = a163 = a164 = a165 = a166 = a167 = a168 = a169 = a170 = \
+            a171 = a172 = a173 = a174 = a175 = a176 = a177 = a178 = a179 = a180 = \
+            a181 = a182 = a183 = a184 = a185 = a186 = a187 = a188 = a189 = a190 = \
+            a191 = a192 = a193 = a194 = a195 = a196 = a197 = a198 = a199 = a200 = \
+            a201 = a202 = a203 = a204 = a205 = a206 = a207 = a208 = a209 = a210 = \
+            a211 = a212 = a213 = a214 = a215 = a216 = a217 = a218 = a219 = a220 = \
+            a221 = a222 = a223 = a224 = a225 = a226 = a227 = a228 = a229 = a230 = \
+            a231 = a232 = a233 = a234 = a235 = a236 = a237 = a238 = a239 = a240 = \
+            a241 = a242 = a243 = a244 = a245 = a246 = a247 = a248 = a249 = a250 = \
+            a251 = a252 = a253 = a254 = a255 = a256 = a257 = a258 = a259 = a260 = \
+            a261 = a262 = a263 = a264 = a265 = a266 = a267 = a268 = a269 = a270 = \
+            a271 = a272 = a273 = a274 = a275 = a276 = a277 = a278 = a279 = a280 = \
+            a281 = a282 = a283 = a284 = a285 = a286 = a287 = a288 = a289 = a290 = \
+            a291 = a292 = a293 = a294 = a295 = a296 = a297 = a298 = a299 = a300 = \
+            a301 = a302 = a303 = a304 = a305 = a306 = a307 = a308 = a309 = a310 = \
+            a311 = a312 = a313 = a314 = a315 = a316 = a317 = a318 = a319 = a320 = \
+            a321 = a322 = a323 = a324 = a325 = a326 = a327 = a328 = a329 = a330 = \
+            a331 = a332 = a333 = a334 = a335 = a336 = a337 = a338 = a339 = a340 = \
+            a341 = a342 = a343 = a344 = a345 = a346 = a347 = a348 = a349 = a350 = \
+            a351 = a352 = a353 = a354 = a355 = a356 = a357 = a358 = a359 = a360 = \
+            a361 = a362 = a363 = a364 = a365 = a366 = a367 = a368 = a369 = a370 = \
+            a371 = a372 = a373 = a374 = a375 = a376 = a377 = a378 = a379 = a380 = \
+            a381 = a382 = a383 = a384 = a385 = a386 = a387 = a388 = a389 = a390 = \
+            a391 = a392 = a393 = a394 = a395 = a396 = a397 = a398 = a399 = a400 = \
+            a401 = a402 = a403 = a404 = a405 = a406 = a407 = a408 = a409 = a410 = \
+            a411 = a412 = a413 = a414 = a415 = a416 = a417 = a418 = a419 = a420 = \
+            a421 = a422 = a423 = a424 = a425 = a426 = a427 = a428 = a429 = a430 = \
+            a431 = a432 = a433 = a434 = a435 = a436 = a437 = a438 = a439 = a440 = \
+            a441 = a442 = a443 = a444 = a445 = a446 = a447 = a448 = a449 = a450 = \
+            a451 = a452 = a453 = a454 = a455 = a456 = a457 = a458 = a459 = a460 = \
+            a461 = a462 = a463 = a464 = a465 = a466 = a467 = a468 = a469 = a470 = \
+            a471 = a472 = a473 = a474 = a475 = a476 = a477 = a478 = a479 = a480 = \
+            a481 = a482 = a483 = a484 = a485 = a486 = a487 = a488 = a489 = a490 = \
+            a491 = a492 = a493 = a494 = a495 = a496 = a497 = a498 = a499 = a500 = \
+            a501 = a502 = a503 = a504 = a505 = a506 = a507 = a508 = a509 = a510 = \
+            a511 = a512 = a513 = a514 = a515 = a516 = a517 = a518 = a519 = a520 = \
+            a521 = a522 = a523 = a524 = a525 = a526 = a527 = a528 = a529 = a530 = \
+            a531 = a532 = a533 = a534 = a535 = a536 = a537 = a538 = a539 = a540 = \
+            a541 = a542 = a543 = a544 = a545 = a546 = a547 = a548 = a549 = a550 = \
+            a551 = a552 = a553 = a554 = a555 = a556 = a557 = a558 = a559 = a560 = \
+            a561 = a562 = a563 = a564 = a565 = a566 = a567 = a568 = a569 = a570 = \
+            a571 = a572 = a573 = a574 = a575 = a576 = a577 = a578 = a579 = a580 = \
+            a581 = a582 = a583 = a584 = a585 = a586 = a587 = a588 = a589 = a590 = \
+            a591 = a592 = a593 = a594 = a595 = a596 = a597 = a598 = a599 = a600 = \
+            a601 = a602 = a603 = a604 = a605 = a606 = a607 = a608 = a609 = a610 = \
+            a611 = a612 = a613 = a614 = a615 = a616 = a617 = a618 = a619 = a620 = \
+            a621 = a622 = a623 = a624 = a625 = a626 = a627 = a628 = a629 = a630 = \
+            a631 = a632 = a633 = a634 = a635 = a636 = a637 = a638 = a639 = a640 = \
+            a641 = a642 = a643 = a644 = a645 = a646 = a647 = a648 = a649 = a650 = \
+            a651 = a652 = a653 = a654 = a655 = a656 = a657 = a658 = a659 = a660 = \
+            a661 = a662 = a663 = a664 = a665 = a666 = a667 = a668 = a669 = a670 = \
+            a671 = a672 = a673 = a674 = a675 = a676 = a677 = a678 = a679 = a680 = \
+            a681 = a682 = a683 = a684 = a685 = a686 = a687 = a688 = a689 = a690 = \
+            a691 = a692 = a693 = a694 = a695 = a696 = a697 = a698 = a699 = a700 = \
+            a701 = a702 = a703 = a704 = a705 = a706 = a707 = a708 = a709 = a710 = \
+            a711 = a712 = a713 = a714 = a715 = a716 = a717 = a718 = a719 = a720 = \
+            a721 = a722 = a723 = a724 = a725 = a726 = a727 = a728 = a729 = a730 = \
+            a731 = a732 = a733 = a734 = a735 = a736 = a737 = a738 = a739 = a740 = \
+            a741 = a742 = a743 = a744 = a745 = a746 = a747 = a748 = a749 = a750 = \
+            a751 = a752 = a753 = a754 = a755 = a756 = a757 = a758 = a759 = a760 = \
+            a761 = a762 = a763 = a764 = a765 = a766 = a767 = a768 = a769 = a770 = \
+            a771 = a772 = a773 = a774 = a775 = a776 = a777 = a778 = a779 = a780 = \
+            a781 = a782 = a783 = a784 = a785 = a786 = a787 = a788 = a789 = a790 = \
+            a791 = a792 = a793 = a794 = a795 = a796 = a797 = a798 = a799 = a800 \
+                = None
             print(a0)
 
         try:
@@ -1778,7 +1844,7 @@ class A:
             blech = None
         # A class with a very big __dict__ will not be consider
         # for suggestions.
-        for index in range(160):
+        for index in range(2000):
             setattr(A, f"index_{index}", None)
 
         try:
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index ab6c5965d1661..d5616fd59c6e5 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -18,6 +18,7 @@
 #include "pycore_hashtable.h"    // _Py_hashtable_new()
 #include "pycore_initconfig.h"   // _Py_GetConfigsAsDict()
 #include "pycore_interp.h"       // _PyInterpreterState_GetConfigCopy()
+#include "pycore_pyerrors.h"      // _Py_UTF8_Edit_Cost()
 
 
 static PyObject *
@@ -279,6 +280,91 @@ test_atomic_funcs(PyObject *self, PyObject *Py_UNUSED(args))
 }
 
 
+static int
+check_edit_cost(const char *a, const char *b, Py_ssize_t expected)
+{
+    int ret = -1;
+    PyObject *a_obj = NULL;
+    PyObject *b_obj = NULL;
+
+    a_obj = PyUnicode_FromString(a);
+    if (a_obj == NULL) {
+        goto exit;
+    }
+    b_obj = PyUnicode_FromString(b);
+    if (a_obj == NULL) {
+        goto exit;
+    }
+    Py_ssize_t result = _Py_UTF8_Edit_Cost(a_obj, b_obj, -1);
+    if (result != expected) {
+        PyErr_Format(PyExc_AssertionError,
+                     "Edit cost from '%s' to '%s' returns %zd, expected %zd",
+                     a, b, result, expected);
+        goto exit;
+    }
+    // Check that smaller max_edits thresholds are exceeded.
+    Py_ssize_t max_edits = result;
+    while (max_edits > 0) {
+        max_edits /= 2;
+        Py_ssize_t result2 = _Py_UTF8_Edit_Cost(a_obj, b_obj, max_edits);
+        if (result2 <= max_edits) {
+            PyErr_Format(PyExc_AssertionError,
+                         "Edit cost from '%s' to '%s' (threshold %zd) "
+                         "returns %zd, expected greater than %zd",
+                         a, b, max_edits, result2, max_edits);
+            goto exit;
+        }
+    }
+    // Check that bigger max_edits thresholds don't change anything
+    Py_ssize_t result3 = _Py_UTF8_Edit_Cost(a_obj, b_obj, result * 2 + 1);
+    if (result3 != result) {
+        PyErr_Format(PyExc_AssertionError,
+                     "Edit cost from '%s' to '%s' (threshold %zd) "
+                     "returns %zd, expected %zd",
+                     a, b, result * 2, result3, result);
+        goto exit;
+    }
+    ret = 0;
+exit:
+    Py_XDECREF(a_obj);
+    Py_XDECREF(b_obj);
+    return ret;
+}
+
+static PyObject *
+test_edit_cost(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    #define CHECK(a, b, n) do {              \
+        if (check_edit_cost(a, b, n) < 0) {  \
+            return NULL;                     \
+        }                                    \
+    } while (0)                              \
+
+    CHECK("", "", 0);
+    CHECK("", "a", 2);
+    CHECK("a", "A", 1);
+    CHECK("Apple", "Aple", 2);
+    CHECK("Banana", "B at n@n@", 6);
+    CHECK("Cherry", "Cherry!", 2);
+    CHECK("---0---", "------", 2);
+    CHECK("abc", "y", 6);
+    CHECK("aa", "bb", 4);
+    CHECK("aaaaa", "AAAAA", 5);
+    CHECK("wxyz", "wXyZ", 2);
+    CHECK("wxyz", "wXyZ123", 8);
+    CHECK("Python", "Java", 12);
+    CHECK("Java", "C#", 8);
+    CHECK("AbstractFoobarManager", "abstract_foobar_manager", 3+2*2);
+    CHECK("CPython", "PyPy", 10);
+    CHECK("CPython", "pypy", 11);
+    CHECK("AttributeError", "AttributeErrop", 2);
+    CHECK("AttributeError", "AttributeErrorTests", 10);
+
+    #undef CHECK
+    Py_RETURN_NONE;
+}
+
+
 static PyMethodDef TestMethods[] = {
     {"get_configs", get_configs, METH_NOARGS},
     {"get_recursion_depth", get_recursion_depth, METH_NOARGS},
@@ -289,6 +375,7 @@ static PyMethodDef TestMethods[] = {
     {"get_config", test_get_config, METH_NOARGS},
     {"set_config", test_set_config, METH_O},
     {"test_atomic_funcs", test_atomic_funcs, METH_NOARGS},
+    {"test_edit_cost", test_edit_cost, METH_NOARGS},
     {NULL, NULL} /* sentinel */
 };
 
diff --git a/Python/suggestions.c b/Python/suggestions.c
index 2fd6714e84787..6fb01f10cd37c 100644
--- a/Python/suggestions.c
+++ b/Python/suggestions.c
@@ -3,78 +3,129 @@
 
 #include "pycore_pyerrors.h"
 
-#define MAX_DISTANCE 3
-#define MAX_CANDIDATE_ITEMS 160
-#define MAX_STRING_SIZE 25
+#define MAX_CANDIDATE_ITEMS 750
+#define MAX_STRING_SIZE 40
 
-/* Calculate the Levenshtein distance between string1 and string2 */
-static Py_ssize_t
-levenshtein_distance(const char *a, size_t a_size,
-                     const char *b, size_t b_size) {
+#define MOVE_COST 2
+#define CASE_COST 1
 
-    if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) {
+#define LEAST_FIVE_BITS(n) ((n) & 31)
+
+static inline int
+substitution_cost(char a, char b)
+{
+    if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) {
+        // Not the same, not a case flip.
+        return MOVE_COST;
+    }
+    if (a == b) {
         return 0;
     }
+    if ('A' <= a && a <= 'Z') {
+        a += ('a' - 'A');
+    }
+    if ('A' <= b && b <= 'Z') {
+        b += ('a' - 'A');
+    }
+    if (a == b) {
+        return CASE_COST;
+    }
+    return MOVE_COST;
+}
+
+/* Calculate the Levenshtein distance between string1 and string2 */
+static Py_ssize_t
+levenshtein_distance(const char *a, size_t a_size,
+                     const char *b, size_t b_size,
+                     size_t max_cost)
+{
+    static size_t buffer[MAX_STRING_SIZE];
 
     // Both strings are the same (by identity)
     if (a == b) {
         return 0;
     }
 
-    // The first string is empty
-    if (a_size == 0) {
-        return b_size;
+    // Trim away common affixes.
+    while (a_size && b_size && a[0] == b[0]) {
+        a++; a_size--;
+        b++; b_size--;
+    }
+    while (a_size && b_size && a[a_size-1] == b[b_size-1]) {
+        a_size--;
+        b_size--;
+    }
+    if (a_size == 0 || b_size == 0) {
+        return (a_size + b_size) * MOVE_COST;
+    }
+    if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) {
+        return max_cost + 1;
     }
 
-    // The second string is empty
-    if (b_size == 0) {
-        return a_size;
+    // Prefer shorter buffer
+    if (b_size < a_size) {
+        const char *t = a; a = b; b = t;
+        size_t t_size = a_size; a_size = b_size; b_size = t_size;
     }
 
-    size_t *buffer = PyMem_Calloc(a_size, sizeof(size_t));
-    if (buffer == NULL) {
-        return -1;
+    // quick fail when a match is impossible.
+    if ((b_size - a_size) * MOVE_COST > max_cost) {
+        return max_cost + 1;
     }
 
+    // Instead of producing the whole traditional len(a)-by-len(b)
+    // matrix, we can update just one row in place.
     // Initialize the buffer row
-    size_t index = 0;
-    while (index < a_size) {
-        buffer[index] = index + 1;
-        index++;
+    for (size_t i = 0; i < a_size; i++) {
+        // cost from b[:0] to a[:i+1]
+        buffer[i] = (i + 1) * MOVE_COST;
     }
 
-    size_t b_index = 0;
     size_t result = 0;
-    while (b_index < b_size) {
+    for (size_t b_index = 0; b_index < b_size; b_index++) {
         char code = b[b_index];
-        size_t distance = result = b_index++;
-        index = SIZE_MAX;
-        while (++index < a_size) {
-            size_t b_distance = code == a[index] ? distance : distance + 1;
+        // cost(b[:b_index], a[:0]) == b_index * MOVE_COST
+        size_t distance = result = b_index * MOVE_COST;
+        size_t minimum = SIZE_MAX;
+        for (size_t index = 0; index < a_size; index++) {
+
+            // cost(b[:b_index+1], a[:index+1]) = min(
+            //     // 1) substitute
+            //     cost(b[:b_index], a[:index])
+            //         + substitution_cost(b[b_index], a[index]),
+            //     // 2) delete from b
+            //     cost(b[:b_index], a[:index+1]) + MOVE_COST,
+            //     // 3) delete from a
+            //     cost(b[:b_index+1], a[index]) + MOVE_COST
+            // )
+
+            // 1) Previous distance in this row is cost(b[:b_index], a[:index])
+            size_t substitute = distance + substitution_cost(code, a[index]);
+            // 2) cost(b[:b_index], a[:index+1]) from previous row
             distance = buffer[index];
-            if (distance > result) {
-                if (b_distance > result) {
-                    result = result + 1;
-                } else {
-                    result = b_distance;
-                }
-            } else {
-                if (b_distance > distance) {
-                    result = distance + 1;
-                } else {
-                    result = b_distance;
-                }
-            }
+            // 3) existing result is cost(b[:b_index+1], a[index])
+
+            size_t insert_delete = Py_MIN(result, distance) + MOVE_COST;
+            result = Py_MIN(insert_delete, substitute);
+
+            // cost(b[:b_index+1], a[:index+1])
             buffer[index] = result;
+            if (result < minimum) {
+                minimum = result;
+            }
+        }
+        if (minimum > max_cost) {
+            // Everything in this row is too big, so bail early.
+            return max_cost + 1;
         }
     }
-    PyMem_Free(buffer);
     return result;
 }
 
 static inline PyObject *
 calculate_suggestions(PyObject *dir,
-                      PyObject *name) {
+                      PyObject *name)
+{
     assert(!PyErr_Occurred());
     assert(PyList_CheckExact(dir));
 
@@ -83,13 +134,14 @@ calculate_suggestions(PyObject *dir,
         return NULL;
     }
 
-    Py_ssize_t suggestion_distance = PyUnicode_GetLength(name);
+    Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX;
     PyObject *suggestion = NULL;
     Py_ssize_t name_size;
     const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size);
     if (name_str == NULL) {
         return NULL;
     }
+
     for (int i = 0; i < dir_size; ++i) {
         PyObject *item = PyList_GET_ITEM(dir, i);
         Py_ssize_t item_size;
@@ -97,15 +149,14 @@ calculate_suggestions(PyObject *dir,
         if (item_str == NULL) {
             return NULL;
         }
-        Py_ssize_t current_distance = levenshtein_distance(
-                name_str, name_size, item_str, item_size);
-        if (current_distance == -1) {
-            return NULL;
-        }
-        if (current_distance == 0 ||
-            current_distance > MAX_DISTANCE ||
-            current_distance * 2 > name_size)
-        {
+        // No more than 1/3 of the involved characters should need changed.
+        Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6;
+        // Don't take matches we've already beaten.
+        max_distance = Py_MIN(max_distance, suggestion_distance - 1);
+        Py_ssize_t current_distance =
+            levenshtein_distance(name_str, name_size,
+                                 item_str, item_size, max_distance);
+        if (current_distance > max_distance) {
             continue;
         }
         if (!suggestion || current_distance < suggestion_distance) {
@@ -113,15 +164,13 @@ calculate_suggestions(PyObject *dir,
             suggestion_distance = current_distance;
         }
     }
-    if (!suggestion) {
-        return NULL;
-    }
-    Py_INCREF(suggestion);
+    Py_XINCREF(suggestion);
     return suggestion;
 }
 
 static PyObject *
-offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc) {
+offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc)
+{
     PyObject *name = exc->name; // borrowed reference
     PyObject *obj = exc->obj; // borrowed reference
 
@@ -142,7 +191,8 @@ offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc) {
 
 
 static PyObject *
-offer_suggestions_for_name_error(PyNameErrorObject *exc) {
+offer_suggestions_for_name_error(PyNameErrorObject *exc)
+{
     PyObject *name = exc->name; // borrowed reference
     PyTracebackObject *traceback = (PyTracebackObject *) exc->traceback; // borrowed reference
     // Abort if we don't have a variable name or we have an invalid one
@@ -194,7 +244,9 @@ offer_suggestions_for_name_error(PyNameErrorObject *exc) {
 // Offer suggestions for a given exception. Returns a python string object containing the
 // suggestions. This function returns NULL if no suggestion was found or if an exception happened,
 // users must call PyErr_Occurred() to disambiguate.
-PyObject *_Py_Offer_Suggestions(PyObject *exception) {
+PyObject *
+_Py_Offer_Suggestions(PyObject *exception)
+{
     PyObject *result = NULL;
     assert(!PyErr_Occurred());
     if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_AttributeError)) {
@@ -205,3 +257,22 @@ PyObject *_Py_Offer_Suggestions(PyObject *exception) {
     return result;
 }
 
+Py_ssize_t
+_Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost)
+{
+    assert(PyUnicode_Check(a) && PyUnicode_Check(b));
+    Py_ssize_t size_a, size_b;
+    const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a);
+    if (utf8_a == NULL) {
+        return -1;
+    }
+    const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b);
+    if (utf8_b == NULL) {
+        return -1;
+    }
+    if (max_cost == -1) {
+        max_cost = MOVE_COST * Py_MAX(size_a, size_b);
+    }
+    return levenshtein_distance(utf8_a, size_a, utf8_b, size_b, max_cost);
+}
+



More information about the Python-checkins mailing list