whitespace, comment stripper, and EOL converter
M.E.Farmer
mefjr75 at hotmail.com
Wed Apr 13 12:34:54 EDT 2005
qwweeeit wrote:
> Thanks! If you answer to my posts one more time I could consider you
as
> my tutor...
>
> It was strange to have found a bug...! In any case I will not go
deeper
> into the matter, because your explanation is enough for me.
> I corrected the problem by hand removing the tokens spanning multiple
lines
> (there were only 8 cases...).
>
> Instead I haven't understood your hint about comments...
> I succeeded in writing a Python script which removes comments.
>
> Here it is (in all its cumbersome and cryptic appearance!...):
>
> # removeCommentsTok.py
> import tokenize
> Input = "pippo1"
> Output = "pippo2"
> f = open(Input)
> fOut=open(Output,"w")
>
> nLastLine=0
> for i in tokenize.generate_tokens(f.readline):
> . if i[0]==52 and nLastLine != (i[2])[0]:
> . . fOut.write((i[4].replace(i[1],'')).rstrip()+'\n')
> . . nLastLine=(i[2])[0]
> . elif i[0]==4 and nLastLine != (i[2])[0]:
> . . fOut.write((i[4]))
> . . nLastLine=(i[2])[0]
> f.close()
> fOut.close()
>
> Some explanations for the guys like me...:
> - 52 and 4 are the arbitrary codes for comments and NEWLINE
respectively
> - the comment removing is obtained by clearing the comment (i[1]) in
the
> input line (i[4])
> - I also right-trimmed the line to get rid of the remaining blanks.
Tokenizer sends multiline strings and comments as a single token.
######################################################################
# python comment and whitespace stripper :)
######################################################################
import keyword, os, sys, traceback
import StringIO
import token, tokenize
__credits__ = 'just another tool that I needed'
__version__ = '.7'
__author__ = 'M.E.Farmer'
__date__ = 'Jan 15 2005, Oct 24 2004'
######################################################################
class Stripper:
    """Python comment and whitespace stripper.

    Wraps a source string; format() emits a copy with comments
    stripped, runs of blank lines squeezed, tabs optionally expanded,
    and line endings converted.  The tokenize module does the parsing,
    so a '#' inside a string literal is never mistaken for a comment.
    """
    def __init__(self, raw):
        # raw: the complete Python source text to process.
        self.raw = raw
    def format(self, out=sys.stdout, comments=0, spaces=1,
               untabify=1, eol='unix'):
        ''' strip comments, strip extra whitespace,
            convert EOL's from Python code.

            out      -- writable file-like object receiving the result
            comments -- 0 strips comments, nonzero keeps them
                        (NOTE(review): flag name reads inverted --
                        confirm against callers)
            spaces   -- max consecutive blank lines kept; -1 disables
                        blank-line squeezing entirely
            untabify -- expand tabs to spaces first when true
            eol      -- 'unix', 'win' or 'mac' output line endings
        '''
        # Store line offsets in self.lines: self.lines[row] is the
        # absolute character offset of the start of 1-based line `row`
        # (two leading zeros so index 1 maps to offset 0).
        self.lines = [0, 0]
        pos = 0
        # Counts the current run of blank lines; starting at 1 makes a
        # leading blank line count as already seen, so it is dropped.
        self.lasttoken = 1
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments
        if untabify:
            self.raw = self.raw.expandtabs()
        # Trailing space keeps tokenize happy when the source does not
        # end in a newline.
        self.raw = self.raw.rstrip()+' '
        self.out = out
        # Normalize every input line ending to LF before scanning.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'
        # Gather the offset of each line start.
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)
        self.lines.append(len(self.raw))
        # Wrap text in a file-like object for tokenize.
        self.pos = 0
        text = StringIO.StringIO(self.raw)
        # Parse the source.
        ## Tokenize calls the __call__
        ## function for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            traceback.print_exc()
        # Now copy the buffered result to `out`, cleaning whitespace
        # between the lines and at the line ends.
        self.temp.seek(0)
        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'
        for line in self.temp.readlines():
            if spaces == -1:
                # spaces == -1: keep every line, only trim the right side.
                self.out.write(line.rstrip()+self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken=0
                    self.out.write(line.rstrip()+self.lineend)
                else:
                    # Blank line: emit only while the run of blanks is
                    # no longer than the allowed `spaces` count.
                    self.lasttoken+=1
                    if self.lasttoken<=self.spaces and self.spaces:
                        self.out.write(self.lineend)
    def __call__(self, toktype, toktext,
                 (srow,scol), (erow,ecol), line):
        ''' Token handler invoked by tokenize for every token.

            toktype      -- token type constant from token/tokenize
            toktext      -- exact source text of the token
            (srow, scol) -- start position: 1-based row, 0-based column
            (erow, ecol) -- end position (unused here)
            line         -- full source line of the token (unused here)
        '''
        # Map the (row, col) start position to an absolute offset and
        # remember where this token ends.
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)
        # Drop comment tokens entirely when comment stripping is on.
        if not self.comments:
            if toktype == tokenize.COMMENT:
                return
        # Newline tokens become the canonical line ending.
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return
        # Re-emit any original whitespace skipped between tokens.
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])
        # Skip INDENT/DEDENT bookkeeping tokens; resetting pos makes
        # the indentation whitespace re-emit with the next token.
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return
        # Ordinary token: copy its text through unchanged.
        self.temp.write(toktext)
        return
######################################################################
def Main():
    ''' Command-line entry point.

        Reads the source file named in sys.argv[1] and writes it to
        stdout with tabs expanded and Windows line endings.
        NOTE(review): comments=1 appears to KEEP comments given how
        Stripper.format tests the flag -- confirm intent.
    '''
    # Guard against a missing argument instead of letting
    # sys.argv[1] raise IndexError.  (The module already imports sys
    # at the top, so no local import is needed.)
    if len(sys.argv) > 1:
        filein = open(sys.argv[1]).read()
        Stripper(filein).format(out=sys.stdout, comments=1, untabify=1,
                                eol='win')
    else:
        sys.stderr.write('usage: python stripper.py <sourcefile>\n')
######################################################################
# Run Main() only when executed as a script, not when imported.
if __name__ == '__main__':
    Main()
M.E.Farmer
More information about the Python-list
mailing list