Comparing strings from the back?
Peter Otten
__peter__ at web.de
Wed Sep 5 05:48:38 EDT 2012
Chris Angelico wrote:
> On Wed, Sep 5, 2012 at 6:29 PM, Peter Otten <__peter__ at web.de> wrote:
>> comparing every pair in a sample of 1000 8-char words
>> taken from '/usr/share/dict/words'
>>
>> head
>> 1: 477222 ************************************************************
>> 2: 18870 **
>> ...
>> tail
>> 1: 386633 ************************************************************
>> 2: 74966 ***********
> Not understanding this. What are the statistics,
I measured how many chars they have in common for all combinations of 1000
words taken from /usr/share/dict/words.
477222 pairs have one char in common, 18870 pairs have two chars in common
when compared from the *beginning* of the string.
386633 pairs have one char in common, 74966 pairs have two chars in common
when compared from the *end* of the string.
> and what (if it's not obvious from the previous answer) do they prove?
They demonstrate that for two words from that particular corpus it is likely
that a[::-1] == b[::-1] has to take more (1.34 versus 1.05 on average)
characters into account than a == b, i. e. comparing from the back should be
slower rather than faster.
If that doesn't help, here's the code ;)
import random
import itertools
from contextlib import contextmanager
from collections import defaultdict
@contextmanager
def open_corpus(args):
    """Context manager yielding an iterator of stripped lines from args.name.

    The file is closed when the with-block exits, so the yielded generator
    must be consumed inside the block.
    """
    with open(args.name) as stream:
        stripped = (raw.strip() for raw in stream)
        # Optional filter (disabled): drop possessive forms like "cat's".
        #stripped = (w for w in stripped if not w.endswith("'s"))
        yield stripped
def pick(items, count):
    """Return *count* items chosen at random (without replacement).

    The iterable is materialized into a list first so that generators
    are accepted; raises ValueError if it holds fewer than *count* items.
    """
    pool = list(items)
    return random.sample(pool, count)
def count_common(a, b):
    """Return how many leading characters a comparison of *a* and *b* examines.

    This is the length of the common prefix plus one for the first
    mismatching character (i.e. the number of character pairs a naive
    equality check would look at), capped at the shorter length.
    Returns 0 when either string is empty.
    """
    examined = 0
    for x, y in zip(a, b):
        examined += 1
        if x != y:
            break
    return examined
def corpus(args):
    """Print a histogram of word lengths found in the corpus file args.name."""
    length_freq = defaultdict(int)
    with open_corpus(args) as words:
        # Consume inside the with-block: the generator reads from the open file.
        for word in words:
            length_freq[len(word.strip())] += 1
    show_histogram(length_freq)
def show_histogram(freq):
    """Print an ASCII histogram of an int -> count mapping.

    One row is printed for every value in the closed range
    [min(freq), max(freq)], including values with a zero count.
    The bar of the most frequent value is 60 stars wide; other bars
    are scaled proportionally (integer division, so tiny counts may
    render as no stars at all).

    BUG FIX: the original indexed `freq[i]`, which raises KeyError for a
    plain dict and, worse, silently *inserts* every missing key into a
    caller-supplied defaultdict. Using .get(i, 0) produces identical
    output while leaving the caller's mapping untouched.
    """
    peak = max(freq.values())
    freq_width = len(str(peak))
    max_key = max(freq)
    len_width = len(str(max_key))
    for i in range(min(freq), max_key + 1):
        value = freq.get(i, 0)  # don't mutate the caller's mapping
        print("{:{}}: {:{}} {}".format(
            i, len_width, value,
            freq_width, "*" * (60 * value // peak)))
def compare(args):
    """Compare every pair in a random sample of equal-length words.

    Draws args.sample_size words of length args.word_len from the corpus,
    then for every pair tallies how many characters an equality test must
    examine from the front (head) and from the back (tail), printing
    averages and histograms of both distributions.
    """
    with open_corpus(args) as words:
        same_length = (w for w in words if len(w) == args.word_len)
        sample = pick(same_length, args.sample_size)
    pairs = 0
    head_common = defaultdict(int)
    tail_common = defaultdict(int)
    tail_total = head_total = 0
    for pairs, (a, b) in enumerate(itertools.combinations(sample, 2), 1):
        prefix = count_common(a, b)
        head_total += prefix
        head_common[prefix] += 1
        suffix = count_common(a[::-1], b[::-1])
        tail_total += suffix
        tail_common[suffix] += 1
    print(pairs, "combinations")
    print("average common chars (head) {:.3}".format(head_total/pairs))
    print("average common chars (tail) {:.3}".format(tail_total/pairs))
    print("comparing every pair in a "
          "sample of {sample} {len}-char words\n"
          "taken from {name!r}".format(
              sample=args.sample_size,
              len=args.word_len,
              name=args.name))
    print()
    print("head")
    show_histogram(head_common)
    print()
    print("tail")
    show_histogram(tail_common)
def main():
    """Command-line entry point: parse arguments and dispatch to a subcommand.

    Subcommands:
      corpus NAME                          -- histogram of word lengths
      compare NAME [--sample-size] [--word-len] -- head vs. tail comparison stats
    """
    import argparse
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    pcorpus = subparsers.add_parser("corpus")
    pcorpus.add_argument("name")
    pcorpus.set_defaults(func=corpus)
    pcompare = subparsers.add_parser("compare")
    pcompare.add_argument("name")
    pcompare.add_argument("--sample-size", type=int, default=1000)
    pcompare.add_argument("--word-len", type=int, default=8)
    pcompare.set_defaults(func=compare)
    args = parser.parse_args()
    # BUG FIX: in Python 3 subcommands are optional by default, so invoking
    # the script with no subcommand left `args` without a `func` attribute
    # and crashed with AttributeError. Report a proper usage error instead.
    if not hasattr(args, "func"):
        parser.error("a subcommand is required (corpus or compare)")
    args.func(args)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
More information about the Python-list
mailing list