Crude statistics on the standard library

F. Petitjean littlejohn.75NOSPAM at news.free.fr.invalid
Mon Jan 31 11:41:44 EST 2005


I have written a script to find the modules which export the largest
number of names. The gc.get_referrers(*objs) function also gives an idea
of the dependencies between the modules.

The code (statsmod.py) :

#!/usr/bin/env python
# -*- coding: latin-1 -*-

"""
statsmod.py   module rudimentaire de statistiques des noms exportés
par les modules de la bibliothèque standard
"""

import sys
import gc

from glob import glob
import os, os.path
from os.path import basename

def browse_stdlib():
    """Browse the standard library directory of the running interpreter.

    The ``*.py`` files are looked up in ``<sys.prefix>/lib/pythonX.Y`` or,
    when ``<sys.prefix>/Lib`` directly contains ``os.py`` / ``os.pyc``,
    in ``<sys.prefix>/Lib``.

    Returns the list of module names (file names without the ``.py``
    suffix), minus a fixed set of modules that must not be imported.
    """
    # Build "pythonX.Y" from version_info: slicing sys.version keeps only
    # three characters and would turn "3.10" into "3.1".
    pyver = 'python%d.%d' % tuple(sys.version_info[:2])
    pyglob = os.path.join(sys.prefix, 'lib', pyver, '*.py')
    # Flat layout: the modules sit directly in <prefix>/Lib.  The compiled
    # os.pyc may be absent, so the source file is accepted as a marker too.
    flat_lib = os.path.join(sys.prefix, 'Lib')
    if (os.path.exists(os.path.join(flat_lib, 'os.pyc'))
            or os.path.exists(os.path.join(flat_lib, 'os.py'))):
        pyglob = os.path.join(flat_lib, '*.py')
    # every match ends with '.py', so dropping three characters is safe
    names = [basename(path)[:-3] for path in glob(pyglob)]
    # remove some obsolete modules ('this' + DeprecationWarning)
    for dontparse in ("this", "tzparse", 'FCNTL', 'posixfile', 'pre', 'regsub',
        'statcache', 'TERMIOS', 'xmllib'):
        try:
            names.remove(dontparse)
        except ValueError:
            # not present in this installation: nothing to remove
            continue
    return names

def exports(names, with_modules=False):
    """Import every module listed in names and count what it exports.

    Progress is written to sys.stdout: each name as it is tried (without
    a newline) and a "cannot import module" line for every ImportError.

    Returns a 2-tuple:
    - list of tuples (NumberOfExternalNames, len(dir(module)), name), one
      per module imported successfully; NumberOfExternalNames is
      len(module.__all__) when the module defines __all__, otherwise
      len(dir(module))
    - list of the imported modules (left empty unless with_modules is true)
    """
    res = []
    modules = []
    # sys.stdout.write() and sys.exc_info() are used instead of the print
    # statement and "except ImportError, msg" so that the function is
    # accepted by Python 2 and Python 3 alike.
    write = sys.stdout.write
    # this simple minded method (__import__) doesn't include sys ?
    for name in names:
        # three spaces: the separation the print statement used to
        # produce between two consecutive names
        write("%s   " % (name,))
        try:
            # NOTE: with an empty fromlist, __import__ returns the top-level
            # package when name is dotted.
            module = __import__(name, globals(), locals(), [])
        except ImportError:
            msg = sys.exc_info()[1]
            write("cannot import module %s %s\n" % (name, msg))
            continue
        ldir = len(dir(module))
        if hasattr(module, '__all__'):
            nexports = len(module.__all__)
        else:
            nexports = ldir
        res.append((nexports, ldir, name))
        if with_modules:
            modules.append(module)
    return res, modules

def pm_histo(values, nbins=20):
    """a poor man histogram

    values must be a list of numbers sorted in increasing order.
    The value range is cut into bins of integer width
    int((values[-1] + 1 - values[0]) / nbins); because the width is
    truncated, the number of bins returned is about nbins, not exactly.

    Return a list of tuples (left, right) such that values[left:right]
    holds the items of one bin; the union of the consecutive
    ranges(left, right) is range(len(values)).  A bin may be empty
    (left == right).  An empty values list gives an empty result.
    """
    from bisect import bisect_right
    if not values:
        return []
    vlo, vhi = values[0], values[-1]+1
    nbins = min(nbins, vhi-vlo)
    deltax = int((vhi - vlo)/nbins)
    assert deltax > 0
    total = len(values)
    ranges = []
    add = ranges.append
    left = 0 # left index  first bin
    val = vlo + deltax  # upper limit (inclusive) of the current bin
    while val < vhi:
        # index of the first value > val, or total when there is none
        # (a plain scan stopped one short and dropped the last value)
        right = bisect_right(values, val, left)
        add((left, right))
        left = right
        val = val + deltax
    if left < total:
        # values above the last limit form the final bin
        add((left, total))
    return ranges

def basic_stat(seq):
    """basic statistics on the values in seq

    seq may be any iterable of numbers (it is traversed once and does not
    need a len, nor to be sorted).

    Returns the tuple
        NumberOfItems, MinValue, MaxValue, MidRange, MeanValue,
        StandardDeviation
    where MidRange is (MinValue + MaxValue) / 2 (the callers label it
    "median") and StandardDeviation is the population standard deviation.

    Raises ZeroDivisionError when seq is empty.
    """
    import math
    s0, s1, s2 = 0, 0, 0
    for item in seq:
        Xi = float(item)
        if s0 == 0:
            Xmin = Xmax = Xi
        elif Xi < Xmin:
            Xmin = Xi
        elif Xi > Xmax:
            Xmax = Xi
        s0 = s0 + 1 # seq may be an iterable without len
        s1 = s1 + Xi
        s2 = s2 + Xi*Xi
    Xm = s1/s0  # mean value
    midrange = (Xmin + Xmax)*0.5
    variance = (s2 - s0*Xm*Xm)/s0  # standard deviation ** 2
    # rounding can leave a tiny negative value when all items are (almost)
    # equal; math.sqrt would raise ValueError on it
    variance = max(variance, 0.0)
    stddev = math.sqrt(variance)
    return s0, Xmin, Xmax, midrange, Xm, stddev  # , variance

if __name__ == '__main__':
    # Script entry point: import the library modules, report which modules
    # are still referenced by others, then print statistics on the number
    # of names each module exports.
    # Three names are added by hand in front of those found by
    # browse_stdlib() (presumably because they have no .py file -- confirm).
    names = ['cStringIO', 'sys', 'gc' ]
    names.extend(browse_stdlib())
    freqs, modules = exports(names, True)
    print    #  exports() prints without new line
    print "%d imported modules and %d in sys.modules" % (
        len(freqs), len(sys.modules))

    print "number of unreachable objects", gc.collect()
    # names of the modules having at most two referrers
    simples = []
    while modules:
        module = modules.pop()
        # print module.__name__, sys.getrefcount(module)
        items = gc.get_referrers(module)
        litems = len(items)
        # NOTE(review): two referrers are treated as the baseline --
        # presumably sys.modules and this script's own namespace; confirm.
        if litems <= 2:
            simples.append(module.__name__)
            # drop the sys.modules entry and the local references
            del sys.modules[module.__name__], module, items
        else:
            print "referrers of %s" % (module.__name__,)
            # NOTE(review): assumes every referrer past the first two has
            # a .get() method (i.e. is a namespace dict) -- confirm.
            for item in items[2:]:
                name = item.get('__file__', 'unknown')
                # Shorten the file path: "package<sep>__init__" for a
                # package __init__ file, the bare file name otherwise,
                # without the .py / .pyc extension in both cases.
                if name.endswith('__init__.pyc'):
                    pslash = name.rfind(os.sep)
                    pslash = name[:pslash].rfind(os.sep)
                    name = name[pslash+1:][:-4] # strip .pyc
                elif name.endswith('__init__.py'):
                    pslash = name.rfind(os.sep)
                    pslash = name[:pslash].rfind(os.sep)
                    name = name[pslash+1:][:-3] # strip .py
                elif name.endswith('.pyc'):
                    pslash = name.rfind(os.sep)
                    name = name[pslash+1:][:-4] # strip .pyc
                elif name.endswith('.py'):
                    pslash = name.rfind(os.sep)
                    name = name[pslash+1:][:-3] # strip .py
                print name,
            del module, items
            print

    print "number of unreachable objects", gc.collect()
    print "new length of sys.modules %d" % (len(sys.modules),)
    print "%d simple modules" % (len(simples),)
    # freqs holds (nexports, len(dir(module)), name) tuples, so the sort
    # orders the modules by increasing number of exported names
    freqs.sort()
    values = [item[0] for item in freqs ]
    # print freqs[-2:]  # removed
    # del values[-2:]
    ranges = pm_histo(values)
    # NOTE(review): ranges2 (the non-empty bins) and limite are assigned
    # but not used in the rest of this script.
    ranges2 = [ item for item in ranges if item[1] > item[0]]
    limite = ranges[0][1] + 1  # first bin
    # histogram in 6 bins of the 95 smallest counts
    # NOTE(review): 95 looks tuned to the author's run -- confirm.
    rangesbas = pm_histo(values[:95], 6)
    print rangesbas
    # statistics on consecutive slices of lbin sorted values
    lbin = 11
    start = 0
    print "St Nb.  min   max  median  average stddev"
    fmt = "%3d%3d%6.1f%6.1f%8.3f%8.3f%8.3f"
    while start < len(values):
        # start index followed by the six values returned by basic_stat()
        res = (start,) + basic_stat(values[start:start+lbin])
        print fmt % res
        start = start + lbin

    print "modules with a lot of external names :"
    # entries from index 140 on in the sorted list, i.e. the largest counts
    # NOTE(review): 140 looks tuned to the author's run -- confirm.
    for item in freqs[140:]:
        print item

Parts of output of python -i statsmod.py  (python2.4 windows)
repr   rexec   rfc822   rlcompleter   cannot import module rlcompleter
No module named readline robotparser   sched
... etc ...
whrandom  C:\Python24\lib\whrandom.py:38: DeprecationWarning: the
whrandom module is deprecated; please use the random module
  DeprecationWarning)
 xdrlib   xmlrpclib   zipfile
... etc ...
_threading_local   __future__   __phello__.foo   Hello world...
cannot import module __phello__.foo No module named foo

cannot import module __phello__.foo No module named foo
number of unreachable objects 0
referrers of __future__
referrers of __future__
... etc ...
referrers of socket
asynchat asyncore BaseHTTPServer SocketServer urllib httplib ftplib
imaplib nntplib poplib smtpd smtplib Utils
... etc ...
referrers of cStringIO
logging\__init__ xmlrpclib
number of unreachable objects 564
new length of sys.modules 154
121 simple modules
[(0, 39), (39, 58), (58, 74), (74, 79), (79, 91), (91, 94)]
St Nb.  min   max  median  average stddev
  0 11   1.0   1.0   1.000   1.000   0.000
 11 11   1.0   2.0   1.500   1.545   0.498
 22 11   2.0   2.0   2.000   2.000   0.000
 33 11   2.0   3.0   2.500   2.455   0.498
 44 11   3.0   3.0   3.000   3.000   0.000
 55 11   3.0   4.0   3.500   3.727   0.445
 66 11   4.0   5.0   4.500   4.273   0.445
 77 11   5.0   6.0   5.500   5.818   0.386
 88 11   6.0   8.0   7.000   6.909   0.668
 99 11   9.0  10.0   9.500   9.545   0.498
110 11  10.0  12.0  11.000  11.182   0.716
121 11  12.0  16.0  14.000  13.818   1.113
132 11  16.0  21.0  18.500  17.636   1.367
143 11  21.0  29.0  25.000  24.818   2.367
154 11  31.0  51.0  41.000  38.364   7.413
165 11  55.0  92.0  73.500  70.636  10.764
176  5  97.0 136.0 116.500 111.400  13.865
modules with a lot of external names :
(18, 40, 'cgi')
(19, 19, 'cgitb')
... etc ...
(72, 72, 'pydoc')
(74, 74, 'cookielib')
(78, 78, 'urllib2')
(86, 86, 'symbol')
(92, 92, 'sre_constants')
(97, 97, 'xmlrpclib')
(101, 118, 'os')
(107, 107, 'sre_compile')
(116, 116, 'sre_parse')
(136, 151, 'socket')

With Python 2.3.3 on Linux the count for socket is larger, because the
OpenSSL library is wrapped there.
gc.collect() at the interactive prompt returns 0 (good).

Conclusion :
 sre_compile and sre_parse should define an __all__ attribute.
 The standard library contains a module, 'tzparse', which cannot be imported!
 Most library modules do not begin with a #!/usr/bin/env python line and a
coding cookie.

Regards



More information about the Python-list mailing list