whitespace, comment stripper, and EOL converter

MrJean1 MrJean1 at gmail.com
Tue Apr 19 14:47:14 EDT 2005


Attached is another version of the stripper.py file.  It contains my
changes, which seem to handle docstrings correctly (at least when run
on itself).


/Jean Brouwers

<pre>

######################################################################
# Python source stripper / cleaner ;)
######################################################################

import os
import sys
import token
import keyword
import StringIO
import tokenize
import traceback
__credits__ = \
'''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
__date__ =  'Apr 16, 2005,' \
            'Jan 15 2005,' \
            'Oct 24 2004' \

'''this docstring should be removed
'''

######################################################################

class Stripper:
    """Python source stripper / cleaner
    """
    def __init__(self, raw):
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
                spaces=1, untabify=1, eol='unix'):
        """ strip comments,
            strip docstrings,
            strip extra whitespace and lines,
            convert tabs to spaces,
            convert EOL's in Python code.
        """
        # Store line offsets in self.lines
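        # (two leading zeros so that tokenize's 1-based row
        # numbers index straight into the line-offset list)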
        self.lines = [0, 0]
        pos = 0
        self.temp = StringIO.StringIO()
        # Strips the first blank line if 1
        self.lasttoken = 1
        self.spaces = spaces
        # 0  = no change, 1 = strip 'em
        self.comments = comments # yep even these
        # 0 = no change, 1 = strip 'em,
        # 8 or 'pep8' = strip all but """'s
        self.docstrings = docstrings

        if untabify:
            self.raw = self.raw.expandtabs()
        self.raw = self.raw.rstrip()+' '
        self.out = out

        # Have you ever had a script with multiple line endings?
        # They can be nasty, so let's get them all the same.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather lines
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)

        self.lines.append(len(self.raw))
        self.pos = 0
        self.lastOP = ''

        # Wrap text in a filelike object
        text = StringIO.StringIO(self.raw)

        # Parse the source.
        ## Tokenize calls the __call__
        ## method for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            traceback.print_exc()

        # Ok now we write it to a file
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)

        # All this should be written into the
        # __call__ method just haven't yet...

        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'

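        # spaces == -1 keeps every line (just right-stripped);
        # otherwise at most self.spaces consecutive blank lines
        # survive (spaces == 0 drops blank lines entirely)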
        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip()+self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken=0
                    self.out.write(line.rstrip()+self.lineend)
                else:
                    self.lasttoken+=1
                    if self.lasttoken<=self.spaces and self.spaces:
                        self.out.write(self.lineend)

    def __call__(self, toktype, toktext, (srow, scol),
                 (erow, ecol), line):
        """ Token handler.
        """
        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

      ##print "*token: %s  text: %r  line: %r" % \
               (token.tok_name[toktype], toktext, line)

        # kill comments
        if self.comments:
            if toktype == tokenize.COMMENT:
                return

        # kill doc strings
        if self.docstrings:
            # a STRING must be a docstring
            # if the most recent OP was ':'
            if toktype == tokenize.STRING and self.lastOP == ':':
                # pep8 frowns on triple single quotes
                if (self.docstrings == 'pep8' or
                    self.docstrings == 8):
                    if not toktext.endswith('"""'):
                        return
                else:
                    return
            elif toktype == token.OP:
                # remember most recent OP
                self.lastOP = toktext
            elif self.lastOP == ':':
                # newline and indent are OK inside docstring
                if toktype not in [token.NEWLINE, token.INDENT]:
                   # otherwise the docstring ends
                   self.lastOP = ''
            elif toktype == token.NEWLINE:
                # consider any string starting
                # on a new line as a docstring
                self.lastOP = ':'

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # send text to the temp file
        self.temp.write(toktext)
        return
######################################################################

def Main():
    if len(sys.argv) > 1:
        filein = open(sys.argv[1]).read()
        Stripper(filein).format(out=sys.stdout,
                 comments=1, docstrings=1, untabify=1, eol='win')
######################################################################

if __name__ == '__main__':
    Main()

</pre>
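
For anyone who wants to try it: run from the command line, the script
reads the file named in sys.argv[1] and writes the cleaned source to
stdout, e.g. "python stripper.py example.py > example_clean.py".  It
can also be driven from Python; here is a minimal sketch, assuming the
module is saved as stripper.py and example.py is just a placeholder
input file:

<pre>
import sys
from stripper import Stripper

src = open('example.py').read()
# strip comments and docstrings, expand tabs, keep at most
# one blank line in a row, and emit Unix line endings
Stripper(src).format(out=sys.stdout, comments=1, docstrings=1,
                     spaces=1, untabify=1, eol='unix')
</pre>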



M.E.Farmer wrote:
> I found the bug and hope I have squashed it.
> Single and double quoted strings that were assignments and spanned
> multiple lines using \ were chopped after the first line.
> Example:
> __date__ =  'Apr 16, 2005,' \
>             'Jan 15 2005,' \
>             'Oct 24 2004'
> became:
> __date__ =  'Apr 16, 2005,' \
>
> Not good :(
>
> tokenizer sends this as:
> name
> operator
> string
> string
> string
> newline
>
> I added a test for string assignments that end in \.
> A flag is set and then all strings until a newline are ignored.
> Also rearranged the script a little.
> Maybe that will do it ...
> Updates available at
> > The script is located at:
> >  http://bellsouthpwp.net/m/e/mefjr75/python/stripper.py
> > 
> > M.E.Farmer
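
For reference, the token stream described above can be reproduced with
a few lines of Python 2, using the same old-style tokenize callback
API the script itself relies on (a sketch; the source literal below is
just the quoted example):

<pre>
import token
import StringIO
import tokenize

src = ("__date__ =  'Apr 16, 2005,' \\\n"
       "            'Jan 15 2005,' \\\n"
       "            'Oct 24 2004'\n")

def show(toktype, toktext, start, end, line):
    # print each token's type name and its text
    print token.tok_name[toktype], repr(toktext)

tokenize.tokenize(StringIO.StringIO(src).readline, show)
# prints: NAME, OP, STRING, STRING, STRING, NEWLINE, ENDMARKER
</pre>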



