table (ascii text) layout recognition

Wed Sep 13 08:33:32 EDT 2006

My version, not much tested. It probably doesn't work well for tables
with few rows. It finds the most frequent word beginnings, and then
splits the data according to them.

# Sample input: a space-aligned text table in which some cells are empty
# and some cells contain spaces themselves.
data = """\
44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope
"""

import re, pprint
# import collections # For Python 2.5

# RE to find the beginning of every space-delimited token: a non-space
# character that is either at the start of the line or preceded by a space.
# A lookbehind is used instead of r"\b" so that punctuation inside a token
# ("GFGFHHF-12") is not counted as a word beginning, and a cell starting
# with punctuation ("+ is less") is.
tpatt = re.compile(r"(?<![^ ])[^ ]")

# Remove empty lines. A real list is built (not a lazy filter object)
# because the lines are traversed more than once.
lines = [li for li in data.splitlines() if li]

# Find the positions of all word beginnings
# This finds:  treshs = [0, 11, 25, 35, 49, ...
# 44544      ipod          apple     black         102
# ^          ^             ^         ^             ^
treshs = [ob.start() for li in lines for ob in tpatt.finditer(li)]

# Count how many times each start position occurs
freqs = {}
for el in treshs:
    freqs[el] = freqs.get(el, 0) + 1

# Same count, alternative for Python 2.5+
# freqs = collections.defaultdict(int)
# for el in treshs:
#     freqs[el] += 1

# Frequency threshold: 60% of the highest frequency.
# With no data at all there is nothing to take the max of, so use 0.
if freqs:
    bigf = max(freqs.values()) * 0.6
else:
    bigf = 0

# The column beginnings are the start positions seen more often than the
# threshold, in left-to-right order
cols = sorted(k for k, v in freqs.items() if v > bigf)

def xpairs(alist):
    """Yield the overlapping length-2 slices of a sliceable sequence.

    xpairs([a, b, c, d]) ==> [a, b], [b, c], [c, d]

    Each item is alist[i:i+2], so it has the type of a slice of alist
    (a list for a list, a str for a str, ...). Nothing is yielded when
    alist has fewer than two items.
    """
    # range (not xrange) keeps this runnable on both Python 2 and Python 3.
    for i in range(len(alist) - 1):
        yield alist[i:i + 2]

# Cut every line at the detected column starts. The trailing None makes
# the last slice run to the end of the line.
result = [[li[x:y].strip() for x, y in xpairs(cols + [None])]
          for li in lines]

# print(...) with a single argument behaves the same on Python 2 and 3
print(data)
pprint.pprint(result)

"""
Output:

44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope

[['44544', 'ipod', 'apple', 'black', '102'],
 ['GFGFHHF-12', 'unknown thing', 'bizar', 'brick mortar', 'tbc'],
 ['45fjk', 'do not know', '+ is less', '', 'biac'],
 ['', 'disk', 'seagate', '250GB', '130'],
 ['5G_gff', '', 'tbd', 'tbd', ''],
 ['gjgh88hgg', 'media record', 'a and b', '', '12'],
 ['hjj', 'foo', 'bar', 'hop', 'zip'],
 ['hg uy oi', 'hj uuu ii a', 'qqq ccc v', 'ZZZ Ughj', ''],
 ['qdsd', 'zert', '', 'nope', 'nope']]
"""

Bye,
bearophile