whitespace, comment stripper, and EOL converter

M.E.Farmer mefjr75 at hotmail.com
Sun Apr 17 05:33:49 EDT 2005


MrJean1 wrote:
> There is an issue with both my and your code: it only works if doc
> strings are triple quoted and if there are no other triple quoted
> strings in the Python code.
I had not considered single quoted strings ;)
> A triple quoted string used in an assignment will be removed, for
> example this case
>
>   s  =  '''this string should not be removed'''
>
>
> It is still unclear how to distinguish doc strings from other
> strings.
> Also, I have not checked the precise Python syntax, but doc strings
> do not need to be enclosed by triple quotes.  A single quote may be
> allowed too.
>
> Maybe this rule will work: a doc string is any string preceded by a
> COLON token followed by zero, one or more INDENT or NEWLINE tokens.
> Untested!
Not needed; if you reread my post, I explain that I had already solved
that issue.
If you use the line argument that tokenize supplies, we can strip
whitespace and 'rRuU' from the start of the line and look for a single
quote or a double quote.
I have tested it and it works; there is a small illustration of the
check below.
I also reworked the 'pep8' option and fixed the bug you mentioned;
here are the changes.
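
Here is the check in isolation (the sample lines are made up, just to
show the rule at work):

for line in ["    '''A docstring.'''",
             "    r'''A raw docstring.'''",
             "    s = '''this string should not be removed'''"]:
    print line.lstrip(' rRuU')[0] in ("'", '"'), '<-', line.strip()

# Prints True for the two docstring lines, False for the assignment.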

> > ######################################################################
> > # Python source stripper
> >
> > ######################################################################
> >
> > import os
> > import sys
> > import token
> > import keyword
> > import StringIO
> > import tokenize
> > import traceback
> > __credits__ = '''
> > Jürgen Hermann
> > M.E.Farmer
> > Jean Brouwers
> > '''
> > __version__ = '.8'
> > __author__ = 'M.E.Farmer'
> > __date__ =  'Apr 16, 2005,' \
> >             'Jan 15 2005,' \
> >             'Oct 24 2004'
> >
> >
> >
> > ######################################################################
> >
> > class Stripper:
> >     """Python source stripper
> >     """
> >     def __init__(self, raw):
> >         self.raw = raw
> >
> >     def format(self, out=sys.stdout, comments=0, docstrings=0,
> >                 spaces=1, untabify=1, eol='unix'):
> >         """ strip comments,
> >             strip docstrings,
> >             strip extra whitespace and lines,
> >             convert tabs to spaces,
> >             convert EOL's in Python code.
> >         """
> >         # Store line offsets in self.lines
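> >         # (lines[row] will hold the offset where row starts;
> >         # tokenize rows are 1-based, so index 0 is padding)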
> >         self.lines = [0, 0]
> >         pos = 0
> >         # lasttoken counts consecutive blank lines;
> >         # starting it at 1 strips any leading blank lines
> >         self.lasttoken = 1
> >         self.temp = StringIO.StringIO()
> >         self.spaces = spaces
> >         self.comments = comments
> >         self.docstrings = docstrings
> >
> >         if untabify:
> >            self.raw = self.raw.expandtabs()
> >         self.raw = self.raw.rstrip()+' '
> >         self.out = out
> >
> >         # Have you ever had a multiple line ending script?
> >         # They can be nasty, so let's get them all the same.
> >         self.raw = self.raw.replace('\r\n', '\n')
> >         self.raw = self.raw.replace('\r', '\n')
> >         self.lineend = '\n'
> >
> >         # Gather lines
> >         while 1:
> >             pos = self.raw.find(self.lineend, pos) + 1
> >             if not pos: break
> >             self.lines.append(pos)
> >
> >         self.lines.append(len(self.raw))
> >         self.pos = 0
> >
> >         # Wrap text in a filelike object
> >         text = StringIO.StringIO(self.raw)
> >
> >         # Parse the source.
> >         ## Tokenize calls the __call__
> >         ## method for each token till done.
> >         try:
> >             tokenize.tokenize(text.readline, self)
> >         except tokenize.TokenError, ex:
> >             traceback.print_exc()
> >
> >         # Ok now we write it to a file
> >         # but we also need to clean the whitespace
> >         # between the lines and at the ends.
> >         self.temp.seek(0)
> >
> >         # All this should be written into the
> >         # __call__ method just haven't yet...
> >
> >         # Mac CR
> >         if eol == 'mac':
> >            self.lineend = '\r'
> >         # Windows CR LF
> >         elif eol == 'win':
> >            self.lineend = '\r\n'
> >         # Unix LF
> >         else:
> >            self.lineend = '\n'
> >
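> >         # Collapse runs of blank lines down to at most
> >         # self.spaces of them; spaces == -1 keeps every line.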
> >         for line in self.temp.readlines():
> >             if spaces == -1:
> >                 self.out.write(line.rstrip()+self.lineend)
> >             else:
> >                 if not line.isspace():
> >                     self.lasttoken=0
> >                     self.out.write(line.rstrip()+self.lineend)
> >                 else:
> >                     self.lasttoken+=1
> >                     if self.lasttoken<=self.spaces and self.spaces:
> >                         self.out.write(self.lineend)
> >
> >     def __call__(self, toktype, toktext,
> >                  (srow,scol), (erow,ecol), line):
> >         """ Token handler.
> >         """
> >         # calculate new positions
> >         oldpos = self.pos
> >         newpos = self.lines[srow] + scol
> >         self.pos = newpos + len(toktext)
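> >         # (self.pos now marks the end of this token, so next
> >         # time raw[oldpos:newpos] is the gap between tokens)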
> >
> >         # kill comments
> >         if self.comments:
> >             if toktype == tokenize.COMMENT:
> >                 return
> >
        # kill doc strings
        if self.docstrings:
            # Assume it is a docstring if there is nothing but
            # whitespace (and maybe an rRuU prefix) to its left.
            if toktype == tokenize.STRING and \
                    line.lstrip(' rRuU')[0] in ["'", '"']:
                t = toktext.lstrip('rRuU')
                # pep8 frowns on triple single quotes, so 'pep8'
                # mode strips only docstrings not written with """
                if self.docstrings in ('pep8', 8):
                    if not t.startswith('"""'):
                        return
                else:
                    return

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # send text to the temp file
        self.temp.write(toktext)
        return
######################################################################

def Main():
    if len(sys.argv) > 1:
        filein = open(sys.argv[1]).read()
        Stripper(filein).format(out=sys.stdout,
                 comments=0, docstrings='pep8', untabify=1, eol='win')
######################################################################

if __name__ == '__main__':
    Main()
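
For a quick test without writing a file, something like this works at
the interactive prompt once the class is defined (the sample source
is made up):

import StringIO
src = '\n'.join([
    "def f():",
    "    '''A docstring that should be stripped.'''",
    "    s = '''this string should not be removed'''",
    "    return s  # a comment",
    ""])
out = StringIO.StringIO()
Stripper(src).format(out=out, comments=1, docstrings=1)
print out.getvalue()

The docstring and the comment are gone, but the assignment string
survives.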


That should work like a charm for all types of docstrings without
disturbing other strings.

M.E.Farmer



