Generating list of unique search sub-phrases

Peter Otten __peter__ at web.de
Fri May 29 19:07:04 EDT 2015


Nick Mellor wrote:

> Hi all,
> 
> My own solution works but I'm sure it could be simpler or read better. How
> would you do it?
> 
> Say you've got a list of companies:
> 
> Aerosonde Ltd
> Amcor
> ANCA
> Austal Ships
> Australia Post
> Australian Air Express
> Australian Defence Industries
> Australian Railroad Group
> Australian Submarine Corporation
> 
> and you need to extract phrases from the company names that uniquely
> identify that company. The results for the above list of companies should
> be:
> 
> Company: 'Aerosonde Ltd'
>  Aliases: Aerosonde,Ltd,Aerosonde Ltd
> 
> Company: 'Amcor'
>  Aliases: Amcor
> 
> Company: 'ANCA'
>  Aliases: ANCA
> 
> Company: 'Austal Ships'
>  Aliases: Austal,Ships,Austal Ships
> 
> Company: 'Australia Post'
>  Aliases: Post,Australia Post
> 
> Company: 'Australian Air Express'
>  Aliases: Air,Express,Australian Air,Air Express,Australian Air Express
> 
> Company: 'Australian Defence Industries'
>  Aliases: Defence,Industries,Australian Defence,Defence
>  Industries,Australian Defence Industries
> 
> Company: 'Australian Railroad Group'
>  Aliases: Railroad,Group,Australian Railroad,Railroad Group,Australian
>  Railroad Group
> 
> Company: 'Australian Submarine Corporation'
>  Aliases: Submarine,Corporation,Australian Submarine,Submarine
>  Corporation,Australian Submarine Corporation
> 
> Here's my solution:
> 
> from itertools import combinations, chain
> 
> companies = [
>     "Aerosonde Ltd",
>     "Amcor",
>     "ANCA",
>     "Austal Ships",
>     "Australia Post",
>     "Australian Air Express",
>     "Australian Defence Industries",
>     "Australian Railroad Group",
>     "Australian Submarine Corporation",
> ]
> 
> def flatten(i):
>     return list(chain.from_iterable(i))
> 
> companies_as_text_stream = ' '.join(companies)
> for company in companies:
>         word_combinations = [list(combinations(company.split(), r)) for r
>         in range(1, len(company))] phrases = [' '.join(phrase) for phrase
>         in flatten(word_combinations)] unique_phrases = [phrase for phrase
>         in phrases if companies_as_text_stream.count(phrase) == 1] aliases
>         = ','.join(unique_phrases) print("Company: '{0}'\n Aliases:
>         {1}\n".format(company, aliases))

Here's an alternative that operates on the words rather than the text:

import itertools
from collections import defaultdict


def adjacent_combinations(words):
    words = tuple(words)
    for i in range(1, len(words)):
        for k in range(len(words) + 1 - i):
            yield words[k:k+i]
    yield words


def unordered_combinations(words):
    for i in range(1, len(words) + 1):
        yield from map(frozenset, itertools.combinations(words, i))


def show_aliases(companies, combinations):
    combo_to_companies = defaultdict(list)
    for company in companies:
        for combo in combinations(company.split()):
            combo_to_companies[combo].append(company)

    company_to_uniquecombos = defaultdict(list)
    for combo, candidates in combo_to_companies.items():
        if len(candidates) == 1:
            company_to_uniquecombos[candidates[0]].append(combo)

    for company in companies:
        print("Company:", company)
        print(
            "Aliases:",
            ", ".join(map(" ".join, company_to_uniquecombos.get(company, 
()))))
        print()


if __name__ == "__main__":
    companies = [
        "Aerosonde Ltd",
        "Amcor",
        "ANCA",
        "Austal Ships",
        "Australia Post",
        "Australian Air Express",
        "Australian Defence Industries",
        "Australian Railroad Group",
        "Australian Submarine Corporation",
    ]
    show_aliases(companies, adjacent_combinations)
    print("=" * 40)
    show_aliases(companies, unordered_combinations)

Note that the result using adjacent_combinations() is similar but not 
identical to the one you asked for as "Australia" is sufficient to identify 
"Australia Post". 

The result based on unordered_combinations() accepts 
"Australian Corporation" to identify the "Australian Submarine Corporation" 
and would fail to find an alias if the list of companies contained items 
that differ only in the order of words.

So: different result and less concise -- what more can you wish for?

How about some nitpicking?

- you can avoid materialising some of the intermediate lists
- you can avoid cross-company matches (think "Express Australian") with an 
out-of-band character

Which gives:

def all_len_combinations(words):
    return chain.from_iterable(
        combinations(words, r) for r in range(1, len(words) + 1))

companies_as_text_stream = '\0'.join(companies)
for company in companies:
    word_combinations = all_len_combinations(company.split())
    phrases = (' '.join(phrase) for phrase in word_combinations)
    unique_phrases = (phrase for phrase in phrases
                      if companies_as_text_stream.count(phrase) == 1)
    aliases = ','.join(unique_phrases)
    print("Company: '{0}'\n Aliases: {1}\n".format(company, aliases))

You may also want to try adjacent_combinations() from above which omits 
impossible candidates like "Australian Corporation".




More information about the Python-list mailing list