Comparing strings from the back?

Peter Otten __peter__ at web.de
Wed Sep 5 05:48:38 EDT 2012


Chris Angelico wrote:

> On Wed, Sep 5, 2012 at 6:29 PM, Peter Otten <__peter__ at web.de> wrote:
>> comparing every pair in a sample of 1000 8-char words
>> taken from '/usr/share/dict/words'
>>
>> head
>> 1: 477222 ************************************************************
>> 2:  18870 **
>> ...

>> tail
>> 1: 386633 ************************************************************
>> 2:  74966 ***********

 
> Not understanding this. What are the statistics, 

I measured how many chars they have in common for all combinations of 1000 
words taken from /usr/share/dict/words.

477222 pairs have one char in common, 18870 pairs have two chars in common 
when compared from the *beginning* of the string.

386633 pairs have one char in common, 74966 pairs have two chars in common 
when compared from the *end* of the string.

> and what (if it's not obvious from the previous answer) do they prove?

They demonstrate that for two words from that particular corpus it is likely 
that a[::-1] == b[::-1] has to take more (1.34 versus 1.05 on average) 
characters into account than a == b, i. e. comparing from the back should be 
slower rather than faster.

If that doesn't help, here's the code ;)

import random
import itertools
from contextlib import contextmanager
from collections import defaultdict

@contextmanager
def open_corpus(args):
    """Context manager yielding an iterator over the stripped lines of
    the word-list file named by ``args.name``.

    The file is closed automatically when the ``with`` block exits.
    """
    with open(args.name) as stream:
        yield (raw_line.strip() for raw_line in stream)

def pick(items, count):
    """Return *count* distinct elements chosen at random from *items*.

    *items* may be any iterable; it is materialized once so that
    ``random.sample`` can draw from it.
    """
    pool = list(items)
    return random.sample(pool, count)

def count_common(a, b):
    """Return the number of character comparisons needed to decide a == b
    by scanning from the front.

    This is the 1-based position of the first mismatch, or
    ``min(len(a), len(b))`` when no mismatch is found — i.e. one more than
    the length of the common prefix, capped at the shorter length.
    Returns 0 when either string is empty.
    """
    compared = 0
    for x, y in zip(a, b):
        compared += 1
        if x != y:
            break
    return compared

def corpus(args):
    """Print a histogram of word lengths for the corpus file ``args.name``."""
    freq = defaultdict(int)
    with open_corpus(args) as words:
        for word in words:
            # words are already stripped; the extra strip is a harmless no-op
            freq[len(word.strip())] += 1
    show_histogram(freq)

def show_histogram(freq):
    """Print an ASCII-bar histogram of the {value: count} mapping *freq*.

    One line per value from min(freq) to max(freq); bars are scaled so the
    most frequent value gets 60 asterisks.  Missing intermediate values
    print an empty bar (callers pass a defaultdict).  Prints nothing for an
    empty mapping (previously this raised ValueError from max()).
    """
    if not freq:
        return
    peak = max(freq.values())
    freq_width = len(str(peak))
    max_freq = max(freq)
    len_width = len(str(max_freq))
    for i in range(min(freq), max_freq + 1):
        value = freq[i]
        print("{:{}}: {:{}} {}".format(
                i, len_width, value,
                freq_width, "*" * (60 * value // peak)))

def compare(args):
    """Sample ``args.sample_size`` words of length ``args.word_len`` from
    the corpus and, for every pair, count how many character comparisons a
    string equality check needs from the front (head) and from the back
    (tail).  Prints averages and one histogram per direction.
    """
    with open_corpus(args) as words:
        matching = (word for word in words if len(word) == args.word_len)
        sample = pick(matching, args.sample_size)

    pair_count = 0
    total_head = 0
    total_tail = 0
    head_common = defaultdict(int)
    tail_common = defaultdict(int)
    for pair_count, (a, b) in enumerate(
            itertools.combinations(sample, 2), 1):
        head = count_common(a, b)
        total_head += head
        head_common[head] += 1
        # reversing both strings simulates comparing from the back
        tail = count_common(a[::-1], b[::-1])
        total_tail += tail
        tail_common[tail] += 1

    print(pair_count, "combinations")
    print("average common chars (head) {:.3}".format(total_head/pair_count))
    print("average common chars (tail) {:.3}".format(total_tail/pair_count))

    print("comparing every pair in a "
          "sample of {sample} {len}-char words\n"
          "taken from {name!r}".format(
            sample=args.sample_size,
            len=args.word_len,
            name=args.name))
    print()
    print("head")
    show_histogram(head_common)
    print()
    print("tail")
    show_histogram(tail_common)

def main():
    """Command-line entry point: dispatch to the ``corpus`` or ``compare``
    subcommand, forwarding the parsed arguments to the chosen handler."""
    import argparse
    parser = argparse.ArgumentParser()
    parsers = parser.add_subparsers()

    # corpus NAME: histogram of word lengths in the file NAME
    pcorpus = parsers.add_parser("corpus")
    pcorpus.add_argument("name")
    pcorpus.set_defaults(func=corpus)

    # compare NAME [--sample-size N] [--word-len L]: head-vs-tail comparison
    pcompare = parsers.add_parser("compare")
    pcompare.add_argument("name")
    pcompare.add_argument("--sample-size", type=int, default=1000)
    pcompare.add_argument("--word-len", type=int, default=8)
    pcompare.set_defaults(func=compare)
    args = parser.parse_args()
    # In Python 3 subparsers are optional by default, so invoking the script
    # with no subcommand leaves args without a .func attribute and the call
    # below would crash with AttributeError; emit a proper usage error instead.
    if not hasattr(args, "func"):
        parser.error("a subcommand is required: corpus or compare")
    args.func(args)

# Run the command-line interface only when executed as a script (not on import).
if __name__ == "__main__":
    main()





More information about the Python-list mailing list