table (ascii text) layout recognition
bearophileHUGS at lycos.com
bearophileHUGS at lycos.com
Wed Sep 13 08:33:32 EDT 2006
My version, not much tested. It probably doesn't work well for tables
with few rows. It finds the most frequent word beginnings, and then
splits the data according to them.
# Sample fixed-width table. The column alignment is significant: the fields
# start at offsets 0, 11, 25, 35 and 49, and some cells are empty.
data = """\
44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope
"""
import re, pprint
# import collections # For Python 2.5

# RE to find the beginning of words: a word boundary followed by a non-space.
tpatt = re.compile(r"\b[^ ]")

# Remove empty lines. A real list is built (rather than using filter()) so
# that `lines` can be iterated more than once on Python 3, where filter()
# returns a one-shot iterator.
lines = [li for li in data.splitlines() if li]

# Find the positions of all word beginnings.
# This finds: treshs = [0, 11, 25, 35, 49, ...
#   44544      ipod          apple     black         102
#   ^          ^             ^         ^             ^
treshs = [ob.start() for li in lines for ob in tpatt.finditer(li)]

# Count how often each position occurs (position -> number of occurrences).
freqs = {}
for el in treshs:
    freqs[el] = freqs.get(el, 0) + 1

# Same frequency count, alternative for Python 2.5:
# freqs = collections.defaultdict(int)
# for el in treshs:
#     freqs[el] += 1

# A position counts as a column start when it occurs in more than 60% as
# many lines as the most frequent position does.
bigf = max(freqs.values()) * 0.6

# Find the most common column beginnings, in left-to-right order.
cols = sorted(k for k, v in freqs.items() if v > bigf)
def xpairs(alist):
    """Yield every pair of adjacent items of a sequence as a 2-item slice.

    xpairs([0, 1, 2, 3]) ==> [0, 1], [1, 2], [2, 3]

    `alist` must support len() and slicing; each yielded value has the
    type that slicing `alist` produces (a list for a list, a str for a
    str).  A sequence with fewer than two items yields nothing.
    """
    # range() instead of xrange() so the helper runs on Python 2 and 3.
    for i in range(len(alist) - 1):
        yield alist[i:i + 2]
# Cut every line at the detected column starts and strip the padding.
# The trailing None makes the last slice run to the end of the line.
result = [[li[x:y].strip() for x, y in xpairs(cols + [None])]
          for li in lines]

# print(...) with a single argument behaves the same on Python 2 and 3.
print(data)
pprint.pprint(result)
"""
Output:
44544 ipod apple black 102
GFGFHHF-12 unknown thing bizar brick mortar tbc
45fjk do not know + is less biac
disk seagate 250GB 130
5G_gff tbd tbd
gjgh88hgg media record a and b 12
hjj foo bar hop zip
hg uy oi hj uuu ii a qqq ccc v ZZZ Ughj
qdsd zert nope nope
[['44544', 'ipod', 'apple', 'black', '102'],
['GFGFHHF-12', 'unknown thing', 'bizar', 'brick mortar', 'tbc'],
['45fjk', 'do not know', '+ is less', '', 'biac'],
['', 'disk', 'seagate', '250GB', '130'],
['5G_gff', '', 'tbd', 'tbd', ''],
['gjgh88hgg', 'media record', 'a and b', '', '12'],
['hjj', 'foo', 'bar', 'hop', 'zip'],
['hg uy oi', 'hj uuu ii a', 'qqq ccc v', 'ZZZ Ughj', ''],
['qdsd', 'zert', '', 'nope', 'nope']]
"""
Bye,
bearophile
More information about the Python-list
mailing list