whitespace, comment stripper, and EOL converter
M.E.Farmer
mefjr75 at hotmail.com
Sun Apr 17 05:33:49 EDT 2005
MrJean1 wrote:
> There is an issue with both my and your code: it only works if doc
> strings are triple quoted and if there are no other triple quoted
> strings in the Python code.
I had not considered single quoted strings ;)
> A triple quoted string used in an assignment will be removed, for
> example this case
>
> s = '''this string should not be removed'''
>
>
> It is still unclear how to distinguish doc strings from other
> strings.
> Also, I have not checked the precise Python syntax, but doc strings
> do not need to be enclosed by triple quotes. A single quote may be
> allowed too.
>
> Maybe this rule will work: a doc string is any string preceded by a
> COLON token followed by zero, one or more INDENT or NEWLINE tokens.
> Untested!
Not needed; if you reread my post, I explain that I had already solved
that issue.
If you use the line argument that tokenize supplies, we can strip
whitespace and 'rRuU' from the start of the line and look for a single
quote or a double quote.
I have tested it and it works.
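For example (the sample lines and names here are mine, just to show
the test):

# A docstring line exposes a quote once spaces and string prefix
# characters are stripped; an assignment exposes its target name.
doc    = '    r"""raw docstring"""'
assign = "s = '''this string should not be removed'''"
print doc.lstrip(' rRuU')[0] in ["'", '"']     # True, treat as docstring
print assign.lstrip(' rRuU')[0] in ["'", '"']  # False, leave it alone

So your s = '''...''' case is left alone.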
I reworked the 'pep8' option and fixed the bug you mentioned; here is
the updated code.
######################################################################
# Python source stripper
######################################################################

import os
import sys
import token
import keyword
import StringIO
import tokenize
import traceback

__credits__ = '''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
__date__ = 'Apr 16, 2005,' \
           'Jan 15 2005,' \
           'Oct 24 2004'

######################################################################

class Stripper:
    """Python source stripper
    """
    def __init__(self, raw):
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
               spaces=1, untabify=1, eol='unix'):
        """ Strip comments,
            strip docstrings,
            strip extra whitespace and lines,
            convert tabs to spaces,
            convert EOLs in Python code.
        """
        # Store line offsets in self.lines
        self.lines = [0, 0]
        pos = 0
        # Strips the first blank line if 1
        self.lasttoken = 1
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments
        self.docstrings = docstrings

        if untabify:
            self.raw = self.raw.expandtabs()
        self.raw = self.raw.rstrip() + ' '
        self.out = out

        # Have you ever had a script with mixed line endings?
        # They can be nasty, so let's get them all the same.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather line offsets
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos:
                break
            self.lines.append(pos)
        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap text in a file-like object
        text = StringIO.StringIO(self.raw)

        # Parse the source.
        # tokenize calls the __call__ method for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            traceback.print_exc()

        # OK, now we write it out, but we also need to clean
        # the whitespace between the lines and at the ends.
        self.temp.seek(0)

        # All this should be written into the
        # __call__ method; just haven't yet...

        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'

        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip() + self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken = 0
                    self.out.write(line.rstrip() + self.lineend)
                else:
                    self.lasttoken += 1
                    if self.lasttoken <= self.spaces and self.spaces:
                        self.out.write(self.lineend)

    def __call__(self, toktype, toktext,
                 (srow, scol), (erow, ecol), line):
        """ Token handler.
        """
        # Calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # Kill comments
        if self.comments:
            if toktype == tokenize.COMMENT:
                return

        # Kill doc strings
        if self.docstrings:
            # Assume that if there is nothing on the left
            # side of the string, it must be a docstring.
            if toktype == tokenize.STRING and \
               line.lstrip(' rRuU')[0] in ["'", '"']:
                t = toktext.lstrip('rRuU')
                # pep8 frowns on triple single quotes, so 'pep8'
                # mode strips everything except """ docstrings.
                if (self.docstrings == 'pep8' or
                        self.docstrings == 8):
                    if not t.startswith('"""'):
                        return
                else:
                    return

        # Handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # Send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # Skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # Send text to the temp file
        self.temp.write(toktext)
        return

######################################################################

def Main():
    # Guard against a missing filename instead of raising IndexError
    if len(sys.argv) > 1:
        filein = open(sys.argv[1]).read()
        Stripper(filein).format(out=sys.stdout, comments=0,
                                docstrings='pep8', untabify=1,
                                eol='win')

######################################################################

if __name__ == '__main__':
    Main()
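Something like this should exercise it without touching a file (the
sample source and the buffer are mine, just for illustration):

import StringIO

sample = ('def f():\n'
          '    """a docstring"""\n'
          "    s = '''keep me'''\n"
          '    return s\n')

buf = StringIO.StringIO()
# Strip comments and all docstrings, normalize to Unix line ends.
Stripper(sample).format(out=buf, comments=1, docstrings=1,
                        spaces=1, untabify=1, eol='unix')
print buf.getvalue()

The docstring drops out (spaces=1 keeps the blank line it leaves
behind), while the triple quoted string assigned to s comes through
untouched.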
That should work like a charm for all types of docstrings without
disturbing other strings.
M.E.Farmer