Large Dictionaries

Claudio Grondi claudio.grondi at freenet.de
Wed May 17 15:30:02 EDT 2006


Chris Foote wrote:
> Klaas wrote:
> 
>>> 22.2s  20m25s[3]
>>
>>
>> 20m to insert 1m keys?  You are doing something wrong.
> 
> 
> Hi Mike.
> 
> I've put together some simplified test code, but the bsddb
> module gives 11m for 1M keys:
> 
I have run your code for the bsddb on my P4 2.8 GHz and have got:
Number generator test for 1000000 number ranges
         with a maximum of 3 wildcard digits.
Wed May 17 16:34:06 2006 dictionary population started
Wed May 17 16:34:14 2006 dictionary population stopped, duration 8.4s
Wed May 17 16:34:14 2006 StorageBerkeleyDB population started
Wed May 17 16:35:59 2006 StorageBerkeleyDB population stopped, duration 
104.3s
Surprising here, that the dictionary population gives the same time, but 
the BerkeleyDB inserts the records 6 times faster on my computer than on 
yours. I am running Python 2.4.2 on Windows XP SP2, and you?

> Number generator test for 1000000 number ranges
>         with a maximum of 3 wildcard digits.
> Wed May 17 22:18:17 2006 dictionary population started
> Wed May 17 22:18:26 2006 dictionary population stopped, duration 8.6s
> Wed May 17 22:18:27 2006 StorageBerkeleyDB population started
> Wed May 17 22:29:32 2006 StorageBerkeleyDB population stopped, duration 
> 665.6s
> Wed May 17 22:29:33 2006 StorageSQLite population started
> Wed May 17 22:30:38 2006 StorageSQLite population stopped, duration 65.5s
As I don't have SQLite installed, it is interesting to see if the factor 
10 in the speed difference between BerkeleyDB and SQLite can be 
confirmed by someone else.
Why is SQLite faster here? I suppose, that SQLite first adds all the 
records and builds the index afterwards with all the records there (with 
db.commit()).
Can the same be done in BerkeleyDB, or does BerkeleyDB not support 
inserting of records without building an index each single insert 
command? If yes, how?

Claudio
> 
> test code is attached.
> 
>> With bdb's it is crucial to insert keys in bytestring-sorted order.
> 
> 
> For the bsddb test, I'm using a plain string.  (The module docs list a
> string being the only datatype supported for both keys & values).
> 
>> Also, be sure to give it a decent amount of cache.
> 
> 
> The bsddb.hashopen() factory seems to have a bug in this regard; if you
> supply a cachesize argument, then it barfs:
> 
> ....
>   File "bsddb-test.py", line 67, in runtest
>     db = bsddb.hashopen(None, flag='c', cachesize=8192)
>   File "/usr/lib/python2.4/bsddb/__init__.py", line 288, in hashopen
>     if cachesize is not None: d.set_cachesize(0, cachesize)
> bsddb._db.DBInvalidArgError: (22, 'Invalid argument -- 
> DB->set_cachesize: method not permitted when environment specified')
> 
> 
> I'll file a bug report on this if it isn't already fixed.
> 
> Cheers,
> Chris
> 
> 
> ------------------------------------------------------------------------
> 
> import bsddb
> import random
> import time
> import sqlite
> 
> import psyco
> psyco.full()
> 
> class WallClockTimer(object):
>     '''Simple timer for measuring tests.'''
>     def __init__(self, msg):
>         self.start = time.time()
>         self.msg = msg
>         print time.ctime(self.start), self.msg, 'started'
>         
>     def stop(self):
>         end = time.time()
>         duration = end - self.start
>         print time.ctime(end), self.msg, 'stopped, duration %.1fs' % duration
> 
> class NumberGen(object):
>     '''Phone number generator for testing different methods of storage.'''
>     def __init__(self, range_start, range_end, n, max_wildcards):
> 
>         print '''Number generator test for %d number ranges
>         with a maximum of %d wildcard digits.''' % (n, max_wildcards)
>         
>         wildcards = range(0, max_wildcards + 1)
>         # generate n unique numbers and store as keys in number_hash
>         timer = WallClockTimer('dictionary population')
>         self.number_hash = {}
>         
>         for i in xrange(0, n):
>             unique = False
>             while not unique:
>                 wildcard_digits = self.gen_wildcard_digits(wildcards)
>                 num = self.gen_number(range_start, range_end)
>                 if wildcard_digits > 0:
>                     num /= (10 ** wildcard_digits)
>                 key = (num, wildcard_digits)
>                 if self.number_hash.has_key(key):
>                     unique = False
>                 else:
>                     unique = True
>                     self.number_hash[key] = None
>         timer.stop()
>         
>     def gen_wildcard_digits(self, wildcards):
>         return random.choice(wildcards)
>     
>     def gen_number(self, start, end):
>         return int(random.uniform(start, end))
> 
>     def storage_test(self, StorageTestClass):
>         test = StorageTestClass(self.number_hash)
> 
> class StorageTest(object):
>     '''base class for testing storage.  Derive a test
>     class and provide your own runtest() method.'''
>     def __init__(self, number_hash):
>         timer = WallClockTimer('%s population' % type(self).__name__)
>         self.runtest(number_hash)
>         timer.stop()
> 
> class StorageBerkeleyDB(StorageTest):
>     def runtest(self, number_hash):
>         db = bsddb.hashopen(None, flag='c', cachesize=8192)
>         for (num, wildcard_digits) in number_hash.keys():
>             key = '%d:%d' % (num, wildcard_digits)
>             db[key] = None
>         db.close()
> 
> class StorageSQLite(StorageTest):
>     def runtest(self, number_hash):
>         db = sqlite.connect(':memory:')
>         cursor = db.cursor()
>         cursor.execute('CREATE TABLE numbers (num INTEGER, wd INTEGER)')
>         q = """insert into numbers (num, wd) values (%d, %d)"""
>         for (num, wildcard_digits) in number_hash.keys():
>             cursor.execute(q, num, wildcard_digits)
>         db.commit()
>         db.close()
> 
> 
> if __name__ == '__main__':
>     
>     test_vals = {'range_start'   : 61880 * 10 ** 7,
>                  'range_end'     : 61891 * 10 ** 7,
>                  'n'             : 1 * 10 ** 4,
>                  'max_wildcards' : 3}
>     
>     ngen = NumberGen(test_vals['range_start'], test_vals['range_end'],
>                      test_vals['n'], test_vals['max_wildcards'])
> 
>     ngen.storage_test(StorageBerkeleyDB)
>     ngen.storage_test(StorageSQLite)



More information about the Python-list mailing list