Case sensitivity redux -- with measurements!

Wed May 24 22:00:55 EDT 2000

I really shouldn't be wading into this tar pit, but I can't help
myself. (Hey, giant sloths died because of tar pits, so I'm entitled!) 

Anyway, the repeated cries of doom got me curious how often people
actually write case-ambiguous variable names. So I wrote a script to
check it, and ran it on the Python 1.5.2 standard library. I'm sure
people will interpret these numbers to fit their prejudices^H^H^H^Hmake
an informed judgement. :)

Out of ~36K lines of Python code in 137 modules, there were 12,243
named identifiers. There were 35 ambiguous identifiers, and there were
never more than 2 versions of the same name.

When there were two names, one was invariably all-lowercase. The other
name was either in all-caps -- as in 'TERMIOS' and 'termios' -- or it
was capitalized at word breaks -- as with 'multifile' and 'MultiFile'.

-*-*- casechecker.py -*-*-

#! /usr/bin/env python

import token, tokenize, string, sys

class Insensitives:
    """Group identifier spellings that differ only by letter case.

    self.dict maps each lower-cased name to the list of distinct
    spellings added for it, in the order they were first added.
    """

    def __init__(self):
        self.dict = {}

    def add(self, str):
        """Record one spelling of a name.

        A spelling that has already been recorded is ignored, so the
        variant list for a name never shrinks and never holds
        duplicates.
        """
        normstr = str.lower()
        variants = self.dict.get(normstr)
        if variants is None:
            # First time any spelling of this name is seen.
            self.dict[normstr] = [str]
        elif str not in variants:
            variants.append(str)
        # Otherwise the spelling is a repeat: leave the list alone.
        # (Resetting it here would discard collisions already found.)

    def problem_names(self):
        """Return a list of (lower-cased name, spellings) pairs for
        every name that was added with more than one spelling."""
        return [(name, lst) for name, lst in self.dict.items()
                if len(lst) > 1]

    def __len__(self):
        """Return the number of names with more than one spelling."""
        return len(self.problem_names())

    def count_all_names(self):
        """Return the number of distinct lower-cased names recorded."""
        return len(self.dict)

class Eater:
    """Callable that collects the text of every NAME token it is given.

    An instance is called with five positional arguments per token:
    the token type, the token text, the start and end positions, and
    the source line.  Only the type and the text are used.
    """

    def __init__(self):
        self.set = Insensitives()

    def __call__(self, tok_type, tok_str, start, end, line):
        """Add tok_str to the collection when tok_type is token.NAME.

        start, end and line are accepted so the call shape matches the
        five-argument form, but they are not examined.
        """
        if tok_type == token.NAME:
            self.set.add(tok_str)

    def problem_names(self):
        """Return the collected names that have more than one spelling."""
        return self.set.problem_names()

    def count_all_names(self):
        """Return how many distinct lower-cased names were collected."""
        return self.set.count_all_names()

def checkfile(filename):
    """Report the case-ambiguous names found in one source file.

    Sloppy code -- has side effects and returns values!  When the file
    contains colliding names, prints "File <filename>", then one line
    per collision (the lower-cased name followed by its list of
    spellings), then a blank line.  Prints nothing otherwise.

    Returns a tuple: (number of colliding names, number of distinct
    lower-cased names seen).
    """
    eater = Eater()
    source = open(filename, 'r')
    try:
        tokenize.tokenize(source.readline, eater)
    finally:
        # Release the handle even when tokenizing raises.
        source.close()
    problems = eater.problem_names()
    if problems:
        print("File %s" % filename)
        for name, variants in problems:
            # Name padded to 20 columns plus one space: the same layout
            # as before for short names, but a long name can no longer
            # run straight into its variant list.
            print("    %-20s %s" % (name, variants))
        print("")
    return len(problems), eater.count_all_names()

if __name__ == '__main__':
    # Check every file named on the command line and keep running sums
    # of the two counts that checkfile() returns for each one.
    collision_sum = 0
    name_sum = 0
    for path in sys.argv[1:]:
        found, seen = checkfile(path)
        collision_sum += found
        name_sum += seen
    print("\nTotal collisions %d" % collision_sum)
    print("Total names %d" % name_sum)

Neel