Comments/Help on Index-module
thomas at cintra.no
thomas at cintra.no
Sun Apr 30 12:39:15 EDT 2000
Hi,
I`m trying to make a index-module. Words are mapped to a tuple
containg info using a dbhash-based dictionary. The tuple contains a
file-position and how many bytes to read. A second file contain a
serie of integers making a serie of ids. Three integers are used as a
id.
I`ve managed to write integers to a single file, but I need to merge
several files created like above into one huge one. The problem is
that somewhere something got screwed up, and the integers I`ve put
into the scripts are not the same as the ones coming out when I try
lookup words.
I put in:
>>> x.add('eggs',(33,2,42))
and this comes out :
>>> looking for eggs: [(536870912, 16777216, -1610612641)]
Other values seem to get thru ok, but it looks like they are located
first in the file with integers. I therefore suspect that the error is
located somewhere in the seeking in the binary-file.
If somebody understands what I`m trying to do can point to where I
screw up, or has another faster, better or perhaps easier way of doing
this, I`d be grateful
I`ve included my script below. Sorry if this is lots of text and take
up lots of bandwidth, but I`ve spent hours on this and don`t seem to
come any closer to a solution.
thanks.
Thomas
#!/usr/bin/python
import dbhash
import shelve
import struct
import os
StillOpen = "Indexer is still open. Close before searching."
Nothoing2Merge = "Nothin to merge. Add more entries."
def opendb(file):
return shelve.Shelf(dbhash.open(file, "crw"))
class Storage:
def __init__(self):
self.items = {}
self.indexed_items = 0
if not os.path.exists('main.dat'):
self.outputfile1 = 'main.idx'
self.outputfile2 = 'main.dat'
else:
self.outputfile1 = 'idx_cd.idx'
self.outputfile2 = 'idx_cd.dat'
self.dat = opendb(self.outputfile2)
self.out = open(self.outputfile1, "wb")
self.open = 1
def close(self):
self.dat.close()
self.out.close()
self.open = 0
def flush(self):
where_ = 0
fmt = 'lll'
sz = struct.calcsize(fmt)
for key in self.items.keys():
self.dat[key] = (where_, len(self.items[key]))
where_ = where_ + len(self.items[key])
for entry in self.items[key]:
self.out.write(struct.pack(fmt, entry[0], entry[1],
entry[2]))
self.items = {}
self.indexed_items = 0
self.dat.close()
self.out.close()
self.open = 0
def add(self, word, id):
if not self.items.has_key(word):
self.items[word] = []
self.items[word].append(id)
def merge(self):
if not os.path.exists('idx_cd.dat'):
raise Nothoing2Merge
else:
processed = []
fmt = 'lll'
sz = struct.calcsize(fmt)
os.rename('main.dat', '_main.dat')
os.rename('main.idx', '_main.idx')
dOld = opendb('_main.dat')
iOld = open('_main.idx','rb')
dNew = opendb('idx_cd.dat')
iNew = open('idx_cd.idx','rb')
d = opendb('main.dat')
i = open('main.idx','wb')
where = 0
for key in dOld.keys():
processed.append(key)
whereOld, noOld = dOld[key]
seek2 = whereOld
iOld.seek(seek2)
for j in range(0, noOld):
data = iOld.read(sz)
ds = struct.unpack(fmt, data)
i.write(struct.pack(fmt, ds[0], ds[1], ds[2]))
if dNew.has_key(key):
whereNew, noNew = dNew[key]
seek2 = whereNew
iNew.seek(seek2)
noOld = noOld + noNew
for j in range(0, noNew):
data = iNew.read(sz)
ds = struct.unpack(fmt, data)
i.write(struct.pack(fmt, ds[0], ds[1], ds[2]))
d[key] = (where, noOld)
where = i.tell()
for key in dNew.keys():
if not key in processed:
where_, no = dNew[key]
seek2 = where_
iNew.seek(seek2)
for j in range(0, no):
data = iNew.read(sz)
ds = struct.unpack(fmt, data)
i.write(struct.pack(fmt, ds[0], ds[1], ds[2]))
d[key] = (where, no)
where = i.tell()
d.close()
i.close()
dOld.close()
iOld.close()
dNew.close()
iNew.close()
if os.path.exists('_main.dat'):
os.remove('_main.dat')
os.remove('_main.idx')
if os.path.exists('idx_cd.dat'):
os.remove('idx_cd.dat')
os.remove('idx_cd.idx')
class Locator:
def __init__(self, words, intersection = 1):
self.hits = {}
self.result = []
f = open('main.idx','rb')
d = opendb('main.dat')
fmt = 'iii'
sz = struct.calcsize(fmt)
for word in words:
self.hits[word] = []
where, no = d[word]
seek2 = where
f.seek(seek2)
for j in range(0,no):
data = f.read(sz)
ds = struct.unpack(fmt, data)
self.hits[word].append(ds)
workOn = []
for key in self.hits.keys():
workOn.append(self.hits[key])
if intersection:
res = []
if workOn:
for x in workOn[0]:
for other in workOn[1:]:
if not x in other:break
else:
res.append(x)
for list in res:
self.result.append(list)
else: # union => match all words
res_ = {}
for list in workOn:
for item in list:
res_[item] = None
self.result = res_.keys()
if __name__ == '__main__':
x = Storage()
x.add('spam',(21,2,32))
x.add('spam',(24321,3232,3232))
x.add('spam',(321,32,432))
x.add('eggs',(33,2,42))
x.add('spam',(33,2,42))
x.add('python',(312,3232,44322))
x.add('spam',(3323,12,443222))
x.flush()
x = Storage()
x.add('python',(3323,12,142))
x.add('monty',(1133,22,4662))
x.add('cheese',(33213,121,1142))
x.add('biscuit',(3323,132,41322))
x.add('spam',(2333,32,432))
x.flush()
x.merge()
y = Locator(['eggs'])
print "looking for eggs:", y.result
More information about the Python-list
mailing list