whitespace, comment stripper, and EOL converter
MrJean1
MrJean1 at gmail.com
Sun Apr 17 04:30:06 EDT 2005
There is an issue with both my and your code: it only works if doc
strings are triple quoted and if there are no other triple quoted
strings in the Python code.
A triple quoted string used in an assignment will be removed, for
example this case
s = '''this string should not be removed'''
It is still unclear how to distinguish doc strings from other strings.
Also, I have not checked the precise Python syntax, but doc strings do
not need to be enclosed by triple quotes. A single quote may be
allowed too.
Maybe this rule will work: a doc string is any string preceded by a
COLON token followed by zero, one or more INDENT or NEWLINE tokens.
Untested!
/Jean Brouwers
M.E.Farmer wrote:
> Thanks Jean,
> I have thought about adding docstrings several times, but I was
stumped
> at how to determine a docstring from a regular triple-quoted string ;)
> I have been thinking hard about the problem and I think I have an
idea.
> If the line has nothing before the start of the string it must be a
> docstring.
> Sounds simple enough but in Python there are 12 or so 'types' of
> strings.
> Here is my crack at it feel free to improve it ;)
> I reversed the logic on the comments and docstrings so I could add a
> special mode to docstring stripping ...pep8 mode .
> Pep8 mode only strips double triple quotes from your source code
> leaving the offending single triple quotes behind. Probably just
stupid
> but someone might find it useful.
>
######################################################################
> # Python source stripper
>
######################################################################
>
> import os
> import sys
> import token
> import keyword
> import StringIO
> import tokenize
> import traceback
# Module metadata.
# "Jürgen" was mojibake (UTF-8 bytes decoded as Latin-1) for "Jürgen".
__credits__ = '''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
# Implicit string-literal concatenation joins the three dates into one
# string.  The original left a dangling backslash after the last part,
# which silently consumed the following blank line -- dropped here.
__date__ = 'Apr 16, 2005,' \
           'Jan 15 2005,' \
           'Oct 24 2004'
>
>
>
######################################################################
>
class Stripper:
    """Python source stripper

    Feed the raw source text to the constructor, then call format() to
    write a cleaned-up copy (comments/docstrings optionally removed,
    whitespace normalized, line endings converted) to a file-like object.

    NOTE(review): this is Python 2 code -- it uses the old callback-style
    tokenize API, `except E, ex` syntax, and tuple parameter unpacking in
    __call__ (removed in Python 3 by PEP 3113).
    """
    def __init__(self, raw):
        # Raw source text to process; normalized in-place by format().
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
               spaces=1, untabify=1, eol='unix'):
        """ strip comments,
            strip docstrings,
            strip extra whitespace and lines,
            convert tabs to spaces,
            convert EOL's in Python code.

            out        -- writable file-like object receiving the result
            comments   -- if true, drop COMMENT tokens
            docstrings -- if true, drop docstrings; the special values
                          'pep8' or '8' drop only triple-double-quoted ones
            spaces     -- max run of blank lines kept (-1 keeps all lines)
            untabify   -- if true, expand tabs before tokenizing
            eol        -- output line ending: 'unix', 'win' or 'mac'
        """
        # Store line offsets in self.lines
        # (two leading zeros because tokenize row numbers are 1-based).
        self.lines = [0, 0]
        pos = 0
        # Strips the first blank line if 1
        self.lasttoken = 1
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments
        self.docstrings = docstrings

        if untabify:
            self.raw = self.raw.expandtabs()
        # Trailing space presumably keeps the tokenizer happy at EOF
        # -- TODO confirm.
        self.raw = self.raw.rstrip()+' '
        self.out = out

        # Have you ever had a multiple line ending script?
        # They can be nasty so lets get them all the same.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather lines: self.lines[row] becomes the offset of line `row`.
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)

        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap text in a filelike object
        text = StringIO.StringIO(self.raw)

        # Parse the source.
        ## Tokenize calls the __call__
        ## method for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            # Best effort: report the tokenizer error but still emit
            # whatever was collected so far.
            traceback.print_exc()

        # Ok now we write it to a file
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)

        # All this should be written into the
        # __call__ method just haven't yet...

        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'

        # Second pass over the buffered text: trim trailing whitespace
        # and collapse runs of blank lines down to at most self.spaces
        # (spaces == -1 keeps every line).
        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip()+self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken=0
                    self.out.write(line.rstrip()+self.lineend)
                else:
                    self.lasttoken+=1
                    if self.lasttoken<=self.spaces and self.spaces:
                        self.out.write(self.lineend)

    def __call__(self, toktype, toktext,
                 (srow,scol), (erow,ecol), line):
        """ Token handler.

        Invoked by tokenize.tokenize() once per token; writes the kept
        tokens plus the original inter-token whitespace to self.temp.
        """
        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # kill comments
        if self.comments:
            if toktype == tokenize.COMMENT:
                return

        # kill doc strings
        if self.docstrings:
            # Assume if there is nothing on the
            # left side it must be a docstring
            # NOTE(review): heuristic -- as the thread above acknowledges,
            # it cannot reliably distinguish a docstring from any other
            # string that starts a line, and the [0] below would raise
            # IndexError if lstrip() empties the line -- TODO verify.
            if toktype == tokenize.STRING and \
                    line.lstrip(' rRuU')[0] in ["'",'"']:
                t = toktext.lstrip('rRuU')
                if (t.startswith('"""') and
                        (self.docstrings == 'pep8' or
                         self.docstrings =='8')):
                    return
                elif t.startswith('"""') or t.startswith("'''"):
                    return

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # send text to the temp file
        self.temp.write(toktext)
        return
>
######################################################################
>
def Main():
    """Strip the Python source named in sys.argv[1] and write to stdout.

    Mirrors the original defaults: comments kept, docstrings stripped,
    tabs expanded, Windows line endings emitted.  Does nothing when no
    filename argument is supplied.
    """
    import sys
    # The original tested sys.argv[1] directly, which raises IndexError
    # when the script is run without a filename; check the length instead.
    if len(sys.argv) > 1:
        f = open(sys.argv[1])
        try:
            filein = f.read()
        finally:
            # Close the handle; the original leaked it.
            f.close()
        Stripper(filein).format(out=sys.stdout,
                                comments=0, docstrings=1,
                                untabify=1, eol='win')
>
######################################################################
>
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    Main()
More information about the Python-list
mailing list