Optimizing Text Similarity Algorithm

Mon Aug 22 08:53:41 EDT 2011

Yaşar Arabacı wrote:

> I originally posted this question on stackoverflow, you can find it here:
> http://stackoverflow.com/q/7133350/886669
> 
> I just want people check what I am doing and express their opinion about
> the thing I am doing is acceptable, or are there some expects of it that
> could change.

You are using only word frequencies to calculate the similarity of the 
document pairs. You can calculate these frequencies before you enter the the 
expensive 

for documentA in ...:
    for documentB in ...:
        calculate_similarity(documentA, documentB)

loops and therefore bring that portion of your could from O(n*n) to O(n). 
Here's is a sample where I applied that technique blindly to your code. I 
also had to remove the django dependency, so I've changed it to operate on 
text files.

import sys
import math
import re

from collections import Counter
from itertools import combinations

def get_words(text):
    # FIXME
    regex = re.compile("\W+", flags=re.UNICODE)
    return re.split(regex, text)

def pairs(files):
    """Generate (title, wordlist) pairs.

    (filename is used as title)
    """
    for filename in files:
        with open(filename) as f:
            text = f.read()
        yield filename, get_words(text)

def process(pairs):
    triples = []
    total = Counter()
    for title, words in pairs:
        c = Counter(words)
        total.update(c.iterkeys())
        sigma = sum(math.log(freq, 1.6) for freq in c.itervalues())
        triples.append((title, c, sigma))

    for (title1, freq1, sum1), (title2, freq2, sum2) in combinations(
        triples, 2):
        upper_part = 0

        for word in freq1 & freq2:
            adjusted1 = math.log(freq1[word], 1.6)
            adjusted2 = math.log(freq2[word], 1.6)
            upper_part += (
                adjusted1 * adjusted2 * math.log(len(triples)/total[word]))

        lower_part = math.sqrt(sum1 * sum2)

        title1, title2 = sorted([title1, title2])
        yield title1, title2, upper_part/lower_part

def main():
    files = sys.argv[1:]

    results = process(pairs(files))
    results = sorted(results, key=lambda x: x[:2])
    results.sort(key=lambda x: x[2], reverse=True)
    print "\n".join("%s and %s => %f" % xcv for xcv in results)

if __name__ == "__main__":
    main()