whitespace, comment stripper, and EOL converter
MrJean1
MrJean1 at gmail.com
Sun Apr 17 04:30:06 EDT 2005
There is an issue with both my and your code: it only works if doc
strings are triple quoted and if there are no other triple quoted
strings in the Python code.
A triple quoted string used in an assignment will be removed, for
example this case
s = '''this string should not be removed'''
It is still unclear how to distinguish doc strings from other strings.
Also, I have not checked the precise Python syntax, but doc strings do
not need to be enclosed by triple quotes. A single quote may be
allowed too.
Maybe this rule will work: a doc string is any string preceded by a
COLON token followed by zero, one or more INDENT or NEWLINE tokens.
Untested!
/Jean Brouwers
M.E.Farmer wrote:
> Thanks Jean,
> I have thought about adding docstrings several times, but I was
stumped
> at how to determine a docstring from a regular triple-quoted string ;)
> I have been thinking hard about the problem and I think I have an
idea.
> If the line has nothing before the start of the string it must be a
> docstring.
> Sounds simple enough but in Python there are 12 or so 'types' of
> strings.
> Here is my crack at it feel free to improve it ;)
> I reversed the logic on the comments and docstrings so I could add a
> special mode to docstring stripping ...pep8 mode .
> Pep8 mode only strips double triple quotes from your source code
> leaving the offending single triple quotes behind. Probably just
stupid
> but someone might find it useful.
>
######################################################################
> # Python source stripper
>
######################################################################
>
> import os
> import sys
> import token
> import keyword
> import StringIO
> import tokenize
> import traceback
# Module metadata.
# "Jürgen" was mojibake (UTF-8 bytes decoded as Latin-1) for "Jürgen".
__credits__ = '''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
# Implicit string-literal concatenation joins the three dates into one
# string.  The original left a dangling backslash after the last part,
# which silently consumed the following blank line -- dropped here.
__date__ = 'Apr 16, 2005,' \
           'Jan 15 2005,' \
           'Oct 24 2004'
>
>
>
######################################################################
>
class Stripper:
    """Python source stripper

    Feed the raw source text to the constructor, then call format() to
    write a cleaned-up copy (comments/docstrings optionally removed,
    whitespace normalized, line endings converted) to a file-like object.

    NOTE(review): this is Python 2 code -- it uses the old callback-style
    tokenize API, `except E, ex` syntax, and tuple parameter unpacking in
    __call__ (removed in Python 3 by PEP 3113).
    """
    def __init__(self, raw):
        # Raw source text to process; normalized in-place by format().
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
               spaces=1, untabify=1, eol='unix'):
        """ strip comments,
            strip docstrings,
            strip extra whitespace and lines,
            convert tabs to spaces,
            convert EOL's in Python code.

            out        -- writable file-like object receiving the result
            comments   -- if true, drop COMMENT tokens
            docstrings -- if true, drop docstrings; the special values
                          'pep8' or '8' drop only triple-double-quoted ones
            spaces     -- max run of blank lines kept (-1 keeps all lines)
            untabify   -- if true, expand tabs before tokenizing
            eol        -- output line ending: 'unix', 'win' or 'mac'
        """
        # Store line offsets in self.lines
        # (two leading zeros because tokenize row numbers are 1-based).
        self.lines = [0, 0]
        pos = 0
        # Strips the first blank line if 1
        self.lasttoken = 1
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments
        self.docstrings = docstrings

        if untabify:
            self.raw = self.raw.expandtabs()
        # Trailing space presumably keeps the tokenizer happy at EOF
        # -- TODO confirm.
        self.raw = self.raw.rstrip()+' '
        self.out = out

        # Have you ever had a multiple line ending script?
        # They can be nasty so lets get them all the same.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather lines: self.lines[row] becomes the offset of line `row`.
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)

        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap text in a filelike object
        text = StringIO.StringIO(self.raw)

        # Parse the source.
        ## Tokenize calls the __call__
        ## method for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            # Best effort: report the tokenizer error but still emit
            # whatever was collected so far.
            traceback.print_exc()

        # Ok now we write it to a file
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)

        # All this should be written into the
        # __call__ method just haven't yet...

        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'

        # Second pass over the buffered text: trim trailing whitespace
        # and collapse runs of blank lines down to at most self.spaces
        # (spaces == -1 keeps every line).
        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip()+self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken=0
                    self.out.write(line.rstrip()+self.lineend)
                else:
                    self.lasttoken+=1
                    if self.lasttoken<=self.spaces and self.spaces:
                        self.out.write(self.lineend)

    def __call__(self, toktype, toktext,
                 (srow,scol), (erow,ecol), line):
        """ Token handler.

        Invoked by tokenize.tokenize() once per token; writes the kept
        tokens plus the original inter-token whitespace to self.temp.
        """
        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # kill comments
        if self.comments:
            if toktype == tokenize.COMMENT:
                return

        # kill doc strings
        if self.docstrings:
            # Assume if there is nothing on the
            # left side it must be a docstring
            # NOTE(review): heuristic -- as the thread above acknowledges,
            # it cannot reliably distinguish a docstring from any other
            # string that starts a line, and the [0] below would raise
            # IndexError if lstrip() empties the line -- TODO verify.
            if toktype == tokenize.STRING and \
                    line.lstrip(' rRuU')[0] in ["'",'"']:
                t = toktext.lstrip('rRuU')
                if (t.startswith('"""') and
                        (self.docstrings == 'pep8' or
                         self.docstrings =='8')):
                    return
                elif t.startswith('"""') or t.startswith("'''"):
                    return

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # send text to the temp file
        self.temp.write(toktext)
        return
>
######################################################################
>
def Main():
    """Strip the Python source named in sys.argv[1] and write to stdout.

    Mirrors the original defaults: comments kept, docstrings stripped,
    tabs expanded, Windows line endings emitted.  Does nothing when no
    filename argument is supplied.
    """
    import sys
    # The original tested sys.argv[1] directly, which raises IndexError
    # when the script is run without a filename; check the length instead.
    if len(sys.argv) > 1:
        f = open(sys.argv[1])
        try:
            filein = f.read()
        finally:
            # Close the handle; the original leaked it.
            f.close()
        Stripper(filein).format(out=sys.stdout,
                                comments=0, docstrings=1,
                                untabify=1, eol='win')
>
######################################################################
>
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    Main()
More information about the Python-list
mailing list