whitespace, comment stripper, and EOL converter
MrJean1
MrJean1 at gmail.com
Sat Apr 16 15:49:33 EDT 2005
Great tool, indeed! But doc strings stay in the source text.
If you do need to remove doc strings as well, add the following into
the __call__ method.
... # kill doc strings
... if not self.docstrings:
... if toktype == tokenize.STRING and len(toktext) >= 6:
... t = toktext.lstrip('rRuU')
... if ((t.startswith("'''") and t.endswith("'''")) or
... (t.startswith('"""') and t.endswith('"""'))):
... return
as shown in the original post below. Also, set self.docstrings in the
format method, similar to self.comments as shown below in lines
starting with '...'.
/Jean Brouwers
M.E.Farmer wrote:
> qwweeeit wrote:
> > Thanks! If you answer to my posts one more time I could consider
you
> as
> > my tutor...
> >
> > It was strange to have found a bug...! In any case I will not go
> deeper
> > into the matter, because for me your explanation is enough.
> > I corrected the problem by hand removing the tokens spanning
multiple
> lines
> > (there were only 8 cases...).
> >
> > Instead I haven't understood your hint about comments...
> > I succeeded in writing a Python script which removes comments.
> >
> > Here it is (in all its cumbersome and cryptic appearance!...):
> >
> > # removeCommentsTok.py
> > import tokenize
> > Input = "pippo1"
> > Output = "pippo2"
> > f = open(Input)
> > fOut=open(Output,"w")
> >
> > nLastLine=0
> > for i in tokenize.generate_tokens(f.readline):
> > . if i[0]==52 and nLastLine != (i[2])[0]:
> > . . fOut.write((i[4].replace(i[1],'')).rstrip()+'\n')
> > . . nLastLine=(i[2])[0]
> > . elif i[0]==4 and nLastLine != (i[2])[0]:
> > . . fOut.write((i[4]))
> > . . nLastLine=(i[2])[0]
> > f.close()
> > fOut.close()
> >
> > Some explanations for the guys like me...:
> > - 52 and 4 are the arbitrary codes for comments and NEWLINE
> respectively
> > - the comment removing is obtained by clearing the comment (i[1])
in
> the
> > input line (i[4])
> > - I also right-trimmed the line to get rid of the remaining
blanks.
> Tokenizer sends multiline strings and comments as a single token.
>
>
######################################################################
> # python comment and whitespace stripper :)
>
######################################################################
>
> import keyword, os, sys, traceback
> import StringIO
> import token, tokenize
> __credits__ = 'just another tool that I needed'
> __version__ = '.7'
> __author__ = 'M.E.Farmer'
> __date__ = 'Jan 15 2005, Oct 24 2004'
>
>
######################################################################
>
> class Stripper:
> """python comment and whitespace stripper :)
> """
> def __init__(self, raw):
> self.raw = raw
>
... def format(self, out=sys.stdout, comments=0, docstrings=0,
spaces=1,
> untabify=1, eol='unix'):
> ''' strip comments, strip extra whitespace,
> convert EOL's from Python code.
> '''
> # Store line offsets in self.lines
> self.lines = [0, 0]
> pos = 0
> # Strips the first blank line if 1
> self.lasttoken = 1
> self.temp = StringIO.StringIO()
> self.spaces = spaces
> self.comments = comments
... self.docstrings = docstrings
>
> if untabify:
> self.raw = self.raw.expandtabs()
> self.raw = self.raw.rstrip()+' '
> self.out = out
>
> self.raw = self.raw.replace('\r\n', '\n')
> self.raw = self.raw.replace('\r', '\n')
> self.lineend = '\n'
>
> # Gather lines
> while 1:
> pos = self.raw.find(self.lineend, pos) + 1
> if not pos: break
> self.lines.append(pos)
>
> self.lines.append(len(self.raw))
> # Wrap text in a filelike object
> self.pos = 0
>
> text = StringIO.StringIO(self.raw)
>
> # Parse the source.
> ## Tokenize calls the __call__
> ## function for each token till done.
> try:
> tokenize.tokenize(text.readline, self)
> except tokenize.TokenError, ex:
> traceback.print_exc()
>
> # Ok now we write it to a file
> # but we also need to clean the whitespace
> # between the lines and at the ends.
> self.temp.seek(0)
>
> # Mac CR
> if eol == 'mac':
> self.lineend = '\r'
> # Windows CR LF
> elif eol == 'win':
> self.lineend = '\r\n'
> # Unix LF
> else:
> self.lineend = '\n'
>
> for line in self.temp.readlines():
> if spaces == -1:
> self.out.write(line.rstrip()+self.lineend)
> else:
> if not line.isspace():
> self.lasttoken=0
> self.out.write(line.rstrip()+self.lineend)
> else:
> self.lasttoken+=1
> if self.lasttoken<=self.spaces and self.spaces:
> self.out.write(self.lineend)
>
>
> def __call__(self, toktype, toktext,
> (srow,scol), (erow,ecol), line):
> ''' Token handler.
> '''
> # calculate new positions
> oldpos = self.pos
> newpos = self.lines[srow] + scol
> self.pos = newpos + len(toktext)
>
> #kill the comments
> if not self.comments:
> # Kill the comments ?
> if toktype == tokenize.COMMENT:
> return
>
... # kill doc strings
... if not self.docstrings:
... if toktype == tokenize.STRING and len(toktext) >= 6:
... t = toktext.lstrip('rRuU')
... if ((t.startswith("'''") and t.endswith("'''")) or
... (t.startswith('"""') and t.endswith('"""'))):
... return
> # handle newlines
> if toktype in [token.NEWLINE, tokenize.NL]:
> self.temp.write(self.lineend)
> return
>
> # send the original whitespace, if needed
> if newpos > oldpos:
> self.temp.write(self.raw[oldpos:newpos])
>
> # skip indenting tokens
> if toktype in [token.INDENT, token.DEDENT]:
> self.pos = newpos
> return
>
> # send text to the temp file
> self.temp.write(toktext)
> return
>
######################################################################
>
> def Main():
> import sys
> if sys.argv[1]:
> filein = open(sys.argv[1]).read()
> Stripper(filein).format(out=sys.stdout, comments=1,
untabify=1,
> eol='win')
>
>
######################################################################
>
> if __name__ == '__main__':
> Main()
>
> M.E.Farmer
More information about the Python-list
mailing list