whitespace, comment stripper, and EOL converter
MrJean1
MrJean1 at gmail.com
Tue Apr 19 14:47:14 EDT 2005
Attached is another version of the stripper.py file. It contains my
changes, which seem to handle docstrings correctly (at least on itself).
/Jean Brouwers
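A quick way to try it, assuming the listing below is saved as stripper.py
(the file names here are only examples, and the redirection is just one way
to capture stdout; note that Main() hard-codes eol='win'):
<pre>
python stripper.py some_module.py > cleaned.py
</pre>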
<pre>
######################################################################
# Python source stripper / cleaner ;)
######################################################################
import os
import sys
import token
import keyword
import StringIO
import tokenize
import traceback
__credits__ = \
'''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
__date__ = 'Apr 16, 2005,' \
           'Jan 15 2005,' \
           'Oct 24 2004'
'''this docstring should be removed
'''
######################################################################
class Stripper:
    """Python source stripper / cleaner
    """
    def __init__(self, raw):
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
                     spaces=1, untabify=1, eol='unix'):
        """ strip comments,
            strip docstrings,
            strip extra whitespace and lines,
            convert tabs to spaces,
            convert EOL's in Python code.
        """
        # Store line offsets in self.lines
        self.lines = [0, 0]
        pos = 0
        self.temp = StringIO.StringIO()
        # Strips the first blank line if 1
        self.lasttoken = 1
        self.spaces = spaces
        # 0 = no change, 1 = strip 'em
        self.comments = comments  # yep, even these
        # 0 = no change, 1 = strip 'em,
        # 8 or 'pep8' = strip all but """'s
        self.docstrings = docstrings
        if untabify:
            self.raw = self.raw.expandtabs()
        self.raw = self.raw.rstrip() + ' '
        self.out = out
        # Have you ever had a script with multiple line endings?
        # They can be nasty, so let's get them all the same.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'
        # Gather lines
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)
        self.lines.append(len(self.raw))
        self.pos = 0
        self.lastOP = ''
        # Wrap text in a file-like object
        text = StringIO.StringIO(self.raw)
        # Parse the source.
        ## Tokenize calls the __call__
        ## method for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            traceback.print_exc()
        # OK, now we write it to a file,
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)
        # All this should be written into the
        # __call__ method, just haven't yet...
        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'
        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip() + self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken = 0
                    self.out.write(line.rstrip() + self.lineend)
                else:
                    self.lasttoken += 1
                    if self.lasttoken <= self.spaces and self.spaces:
                        self.out.write(self.lineend)
    def __call__(self, toktype, toktext, (srow, scol), (erow, ecol),
                       line):
        """ Token handler.
        """
        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)
        ##print "*token: %s text: %r line: %r" % \
        ##    (token.tok_name[toktype], toktext, line)
        # kill comments
        if self.comments:
            if toktype == tokenize.COMMENT:
                return
        # kill doc strings
        if self.docstrings:
            # a STRING must be a docstring
            # if the most recent OP was ':'
            if toktype == tokenize.STRING and self.lastOP == ':':
                # pep8 frowns on triple single quotes
                if (self.docstrings == 'pep8' or
                    self.docstrings == 8):
                    if not toktext.endswith('"""'):
                        return
                else:
                    return
            elif toktype == token.OP:
                # remember most recent OP
                self.lastOP = toktext
            elif self.lastOP == ':':
                # NEWLINE and INDENT are OK inside a docstring
                if toktype not in [token.NEWLINE, token.INDENT]:
                    # otherwise the docstring ends
                    self.lastOP = ''
            elif toktype == token.NEWLINE:
                # consider any string starting
                # on a new line as a docstring
                self.lastOP = ':'
        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return
        # send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])
        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return
        # send text to the temp file
        self.temp.write(toktext)
        return
######################################################################
def Main():
    if len(sys.argv) > 1:
        filein = open(sys.argv[1]).read()
        Stripper(filein).format(out=sys.stdout,
                comments=1, docstrings=1, untabify=1, eol='win')
######################################################################
if __name__ == '__main__':
    Main()
</pre>
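As a minimal usage sketch, the class can also be driven directly, assuming
the listing above is saved as stripper.py; the file name example.py and the
option values are only illustrative, following the comments in format():
<pre>
import sys
from stripper import Stripper   # assumes the listing was saved as stripper.py

source = open('example.py').read()      # any Python 2 source file
# Strip comments, strip all docstrings except """-quoted ones ('pep8'),
# keep at most one blank line, untabify, and write Unix line endings.
Stripper(source).format(out=sys.stdout, comments=1, docstrings='pep8',
                        spaces=1, untabify=1, eol='unix')
</pre>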
M.E.Farmer wrote:
> I found the bug and hope I have squashed it.
> Single and double quoted strings that were assignments and spanned
> multiple lines using \ were chopped after the first line.
> example:
> __date__ = 'Apr 16, 2005,' \
>            'Jan 15 2005,' \
>            'Oct 24 2004'
> became:
> __date__ = 'Apr 16, 2005,' \
>
> Not good :(
>
> tokenizer sends this as:
> name
> operator
> string
> string
> string
> newline
>
> I added a test for string assignments that end in \.
> A flag is set and then all strings up to a newline are ignored.
> Also rearranged the script a little.
> Maybe that will do it ...
> Updates available at
> > The script is located at:
> > http://bellsouthpwp.net/m/e/mefjr75/python/stripper.py
> >
> > M.E.Farmer
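For reference, the token stream described above can be reproduced in a few
lines of Python 2. This snippet is added here as an illustration, not part
of the thread; generate_tokens yields one 5-tuple per token, and an
ENDMARKER follows the final NEWLINE:
<pre>
import token
import tokenize
import StringIO

# The backslash-continued assignment from the bug report.
src = "__date__ = 'Apr 16, 2005,' \\\n" \
      "'Jan 15 2005,' \\\n" \
      "'Oct 24 2004'\n"

# Prints: NAME, OP, STRING, STRING, STRING, NEWLINE, ENDMARKER
for tok in tokenize.generate_tokens(StringIO.StringIO(src).readline):
    print token.tok_name[tok[0]], repr(tok[1])
</pre>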