whitespace , comment stripper, and EOL converter

M.E.Farmer mefjr75 at hotmail.com
Sun Apr 17 00:54:29 EDT 2005


Thanks Jean,
I have thought about adding docstrings several times, but I was stumped
at how to distinguish a docstring from a regular triple-quoted string ;)
I have been thinking hard about the problem and I think I have an idea.
If the line has nothing before the start of the string it must be a
docstring.
Sounds simple enough, but in Python there are 12 or so 'types' of
strings.
Here is my crack at it feel free to improve it ;)
I reversed the logic on the comments and docstrings so I could add a
special mode to docstring stripping ... pep8 mode.
Pep8 mode only strips double triple quotes from your source code,
leaving the offending single triple quotes behind. Probably just stupid,
but someone might find it useful.
 ######################################################################
# Python source stripper
######################################################################

import os
import sys
import token
import keyword
import StringIO
import tokenize
import traceback
__credits__ = '''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
# Revision dates, newest first (adjacent string literals concatenate).
# The original used trailing-backslash line continuations ending in a
# dangling backslash, which would silently absorb any statement added
# on the next line; parentheses are the safe equivalent.
__date__ = ('Apr 16, 2005,'
            'Jan 15 2005,'
            'Oct 24 2004')

######################################################################

class Stripper:
    """Python source stripper.

    Strips comments and/or docstrings, collapses runs of blank lines,
    expands tabs and converts line endings of Python source text.

    An instance is used as the token handler for the Python 2
    callback-style ``tokenize.tokenize`` API: ``format()`` drives the
    tokenizer, which invokes ``__call__`` once per token; surviving
    token text is buffered in ``self.temp`` and post-processed.
    """
    def __init__(self, raw):
        # raw -- the complete source text to process (one string).
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
                spaces=1, untabify=1, eol='unix'):
        """ strip comments,
            strip docstrings,
            strip extra whitespace and lines,
            convert tabs to spaces,
            convert EOL's in Python code.

            out        -- writable file-like object for the result
            comments   -- if true, comment tokens are dropped
            docstrings -- if true, docstrings are dropped; the special
                          values 'pep8' / '8' drop only docstrings
                          quoted with double triple quotes and keep
                          single-triple-quoted ones
            spaces     -- maximum run of blank lines to keep; -1 keeps
                          whitespace-only lines untouched
            untabify   -- if true, tabs are expanded to spaces first
            eol        -- 'unix' (LF), 'win' (CRLF) or 'mac' (CR)
        """
        # self.lines[row] will hold the offset in self.raw of the first
        # character of 1-based line `row` (index 0 is a dummy entry).
        # Store line offsets in self.lines
        self.lines = [0, 0]
        pos = 0
        # Blank-line run counter; starting at 1 strips a leading blank line.
        self.lasttoken = 1
        # Buffer that receives the filtered token stream.
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments
        self.docstrings = docstrings

        if untabify:
           self.raw = self.raw.expandtabs()
        # Trim trailing whitespace but guarantee one final character
        # after the last token for the tokenizer.
        self.raw = self.raw.rstrip()+' '
        self.out = out

        # Have you ever had a multiple line ending script?
        # They can be nasty so lets get them all the same.
        # Normalize every EOL to LF before tokenizing; the requested
        # EOL style is re-applied on output below.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather lines: record the start offset of every line.
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)

        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap text in a filelike object for tokenize's readline API.
        text = StringIO.StringIO(self.raw)

        # Parse the source.
        ## Tokenize calls the __call__
        ## method for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            # Unterminated string/bracket: report it and fall through,
            # keeping whatever was already written to the buffer.
            traceback.print_exc()

        # Ok now we write it to a file
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)

        # All this should be written into the
        # __call__ method just haven't yet...

        # Mac CR
        if eol == 'mac':
           self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
           self.lineend = '\r\n'
        # Unix LF
        else:
           self.lineend = '\n'

        for line in self.temp.readlines():
            if spaces == -1:
                # spaces == -1: keep every line, only convert the EOL.
                self.out.write(line.rstrip()+self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken=0
                    self.out.write(line.rstrip()+self.lineend)
                else:
                    # Count consecutive blank lines and emit at most
                    # self.spaces of them (none when spaces is 0).
                    self.lasttoken+=1
                    if self.lasttoken<=self.spaces and self.spaces:
                        self.out.write(self.lineend)

    def __call__(self, toktype, toktext,
                 (srow,scol), (erow,ecol), line):
        """ Token handler.

            Called by tokenize for every token (Python 2 tuple-parameter
            signature).  Filters comments and docstrings according to
            the flags set in format() and writes surviving text, plus
            the original inter-token whitespace, to self.temp.
        """
        # calculate old/new character offsets into self.raw
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # kill comments
        if self.comments:
            if toktype == tokenize.COMMENT:
                return

        # kill doc strings
        if self.docstrings:
            # Assume if there is nothing on the
            # left side it must be a docstring.
            # NOTE(review): lstrip(' rRuU') strips the string-prefix
            # letters as well as spaces, but not tabs -- a tab-indented
            # docstring would be missed, and a line that strips to ''
            # would raise IndexError; confirm inputs are space-indented.
            if toktype == tokenize.STRING and \
                line.lstrip(' rRuU')[0] in ["'",'"']:
                t = toktext.lstrip('rRuU')
                # pep8 mode: drop only """-quoted docstrings.
                if (t.startswith('"""') and
                    (self.docstrings == 'pep8' or
                     self.docstrings =='8')):
                     return
                elif t.startswith('"""') or t.startswith("'''"):
                     return

        # handle newlines (both logical NEWLINE and non-logical NL)
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # send the original whitespace that sat between tokens
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens; the whitespace written above already
        # covers the indentation text
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # send text to the temp file
        self.temp.write(toktext)
        return
######################################################################

def Main():
    """Command-line entry point.

    Reads the Python source file named by the first command-line
    argument and writes the stripped result (docstrings removed,
    comments kept, tabs expanded, Windows line endings) to stdout.
    Does nothing when no filename argument is supplied.
    """
    import sys
    # Guard against a missing argument: the original indexed
    # sys.argv[1] unconditionally and raised IndexError.
    if len(sys.argv) > 1:
        f = open(sys.argv[1])
        try:
            filein = f.read()
        finally:
            # Close explicitly instead of leaking the handle to the
            # garbage collector.
            f.close()
        Stripper(filein).format(out=sys.stdout,
                 comments=0, docstrings=1, untabify=1, eol='win')
######################################################################

# Run as a script: strip the file named on the command line.
if __name__ == '__main__':
    Main()




More information about the Python-list mailing list