Comments/Help on Index-module

thomas at cintra.no thomas at cintra.no
Sun Apr 30 12:39:15 EDT 2000


Hi,

I`m trying to make a index-module. Words are mapped to a tuple
containg info using a dbhash-based dictionary. The tuple contains a
file-position and how many bytes to read. A second file contain a
serie of integers making a serie of ids. Three integers are used as a
id. 

I`ve managed to write integers to a single file, but I need to merge
several files created like above into one huge one. The problem is
that somewhere something got screwed up, and the integers I`ve put
into the scripts are not the same as the ones coming out when I try
lookup words.

I put in:
>>> x.add('eggs',(33,2,42))
and this comes out :
>>> looking for eggs: [(536870912, 16777216, -1610612641)]

Other values seem to get thru ok, but it looks like they are located
first in the file with integers. I therefore suspect that the error is
located  somewhere in the seeking in the binary-file.

If somebody understands what I`m trying to do can point to where I
screw up, or has another faster, better or perhaps easier way of doing
this, I`d be grateful

I`ve included my script below. Sorry if this is lots of text and take
up lots of bandwidth, but I`ve spent hours on this and don`t seem to
come any closer to a solution.

thanks.

Thomas 

#!/usr/bin/python

import dbhash
import shelve
import struct
import os

StillOpen = "Indexer is still open. Close before searching."
Nothoing2Merge = "Nothin to merge. Add more entries."

def opendb(file):
    return shelve.Shelf(dbhash.open(file, "crw"))

class Storage:

    def __init__(self):

        self.items = {}
        self.indexed_items = 0
        if not os.path.exists('main.dat'):
            self.outputfile1 = 'main.idx'
            self.outputfile2 = 'main.dat'
        else:
            self.outputfile1 = 'idx_cd.idx'
            self.outputfile2 = 'idx_cd.dat'
        self.dat = opendb(self.outputfile2)
        self.out = open(self.outputfile1, "wb")
        self.open = 1

    def close(self):
        self.dat.close()
        self.out.close()
        self.open = 0
        
    def flush(self):

        where_ = 0
        fmt = 'lll'
        sz = struct.calcsize(fmt)

        for key in self.items.keys():

            self.dat[key] = (where_, len(self.items[key]))
            where_ = where_ + len(self.items[key])

            for entry in self.items[key]:
                self.out.write(struct.pack(fmt, entry[0], entry[1],
entry[2]))

        self.items = {}
        self.indexed_items = 0
        self.dat.close()
        self.out.close()
        self.open = 0
        
    def add(self, word, id):

        if not self.items.has_key(word):
            self.items[word] = []

        self.items[word].append(id)
            
    def merge(self):

        if not os.path.exists('idx_cd.dat'):
            raise Nothoing2Merge
        else:

            processed = []
            fmt = 'lll'
            sz = struct.calcsize(fmt)
            
            os.rename('main.dat', '_main.dat')
            os.rename('main.idx', '_main.idx')

            dOld = opendb('_main.dat')
            iOld = open('_main.idx','rb')
            dNew = opendb('idx_cd.dat')
            iNew = open('idx_cd.idx','rb')

            d = opendb('main.dat')
            i = open('main.idx','wb')
            where = 0
            
            for key in dOld.keys():

                processed.append(key)
                whereOld, noOld = dOld[key]
                seek2 = whereOld
                iOld.seek(seek2)

                for j in range(0, noOld):
                    data = iOld.read(sz)
                    ds = struct.unpack(fmt, data)
                    i.write(struct.pack(fmt, ds[0], ds[1], ds[2]))

                if dNew.has_key(key):

                    whereNew, noNew = dNew[key]
                    seek2 = whereNew
                    iNew.seek(seek2)
                    noOld =  noOld + noNew

                    for j in range(0, noNew):
                        data = iNew.read(sz)
                        ds = struct.unpack(fmt, data)
                        i.write(struct.pack(fmt, ds[0], ds[1], ds[2]))

                d[key] = (where, noOld)
                where = i.tell()

            for key in dNew.keys():

                if not key in processed:
                    where_, no = dNew[key]
                    seek2 = where_
                    iNew.seek(seek2)

                    for j in range(0, no):
                        data = iNew.read(sz)
                        ds = struct.unpack(fmt, data)
                        i.write(struct.pack(fmt, ds[0], ds[1], ds[2]))

                    d[key] = (where, no)
                    where = i.tell()
                           
            d.close()
            i.close()
            dOld.close()
            iOld.close()
            dNew.close()
            iNew.close()
            
            if os.path.exists('_main.dat'):
                os.remove('_main.dat')
                os.remove('_main.idx')
            if os.path.exists('idx_cd.dat'):
                os.remove('idx_cd.dat')
                os.remove('idx_cd.idx')

class Locator:

    def __init__(self, words, intersection = 1):

        self.hits = {}
        self.result = []
        
        f = open('main.idx','rb')        
        d = opendb('main.dat')

        fmt = 'iii'
        sz = struct.calcsize(fmt)

        for word in words:

            self.hits[word] = []
            
            where, no = d[word]
            seek2 = where
            f.seek(seek2)

            for j in range(0,no):
                data = f.read(sz)
                ds = struct.unpack(fmt, data)
                self.hits[word].append(ds)

            workOn = []
            for key in self.hits.keys():
                workOn.append(self.hits[key])
            
        if intersection: 
            res = []
            if workOn: 
                    for x in workOn[0]:
                            for other in workOn[1:]:
                                    if not x in other:break
                            else:
                                res.append(x)
            for list in res:
                self.result.append(list)
                    
        else: # union => match all words
            res_ = {}
            for list in workOn:
                for item in list:
                    res_[item] = None
            self.result = res_.keys()
    
if __name__ == '__main__':

    x = Storage()
    x.add('spam',(21,2,32))
    x.add('spam',(24321,3232,3232))
    x.add('spam',(321,32,432))
    x.add('eggs',(33,2,42))
    x.add('spam',(33,2,42))
    x.add('python',(312,3232,44322))
    x.add('spam',(3323,12,443222))
    x.flush()

    x = Storage()
    x.add('python',(3323,12,142))
    x.add('monty',(1133,22,4662))
    x.add('cheese',(33213,121,1142))
    x.add('biscuit',(3323,132,41322))
    x.add('spam',(2333,32,432))
    x.flush()
    x.merge()

    y = Locator(['eggs'])
    print "looking for eggs:", y.result




More information about the Python-list mailing list