whitespace, comment stripper, and EOL converter

M.E.Farmer mefjr75 at hotmail.com
Wed Apr 13 12:34:54 EDT 2005


qwweeeit wrote:
> Thanks! If you answer my posts one more time I might consider you
> my tutor...
>
> It was strange to have found a bug...! In any case I will not go
> deeper into the matter, because your explanation is enough for me.
> I corrected the problem by hand, removing the tokens spanning
> multiple lines (there were only 8 cases...).
>
> Instead, I haven't understood your hint about comments...
> I succeeded in writing a Python script which removes comments.
>
> Here it is (in all its cumbersome and cryptic appearance!...):
>
> # removeCommentsTok.py
> import tokenize
> Input = "pippo1"
> Output = "pippo2"
> f = open(Input)
> fOut = open(Output, "w")
>
> nLastLine = 0
> for i in tokenize.generate_tokens(f.readline):
>     if i[0] == 52 and nLastLine != i[2][0]:
>         fOut.write((i[4].replace(i[1], '')).rstrip() + '\n')
>         nLastLine = i[2][0]
>     elif i[0] == 4 and nLastLine != i[2][0]:
>         fOut.write(i[4])
>         nLastLine = i[2][0]
> f.close()
> fOut.close()
>
> Some explanations for the guys like me...:
> - 52 and 4 are the numeric token codes for COMMENT and NEWLINE
>   respectively
> - the comment removal is done by clearing the comment text (i[1])
>   from the input line (i[4])
> - I also right-trimmed the line to get rid of the remaining blanks.
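
For reference, here is roughly what that replace trick is doing, seen
from the interactive prompt (a quick illustrative sketch, not part of
your script):

    >>> import tokenize, StringIO
    >>> src = 'x = 1 # a comment\n'
    >>> toks = list(tokenize.generate_tokens(StringIO.StringIO(src).readline))
    >>> tok = toks[3]
    >>> print tokenize.tok_name[tok[0]]
    COMMENT
    >>> tok[4].replace(tok[1], '').rstrip()
    'x = 1'

tokenize.tok_name maps a code back to its name, and tokenize.COMMENT
and token.NEWLINE give you the codes directly, so you can avoid the
magic numbers 52 and 4.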
The tokenizer sends multiline strings and comments as a single token.
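
You can see that from the interpreter as well (another small sketch):

    >>> src = 's = """one\ntwo\nthree"""\n'
    >>> for tok in tokenize.generate_tokens(StringIO.StringIO(src).readline):
    ...     print tokenize.tok_name[tok[0]], repr(tok[1])
    NAME 's'
    OP '='
    STRING '"""one\ntwo\nthree"""'
    NEWLINE '\n'
    ENDMARKER ''

The whole triple-quoted string arrives as one STRING token, which is
why a line-by-line approach misses these cases.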

######################################################################
# python comment and whitespace stripper :)
######################################################################

import sys, traceback
import StringIO
import token, tokenize
__credits__ = 'just another tool that I needed'
__version__ = '.7'
__author__ = 'M.E.Farmer'
__date__ = 'Jan 15 2005, Oct 24 2004'

######################################################################

class Stripper:
    """python comment and whitespace stripper :)
    """
    def __init__(self, raw):
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, spaces=1,
                untabify=1, eol='unix'):
        ''' Strip comments and extra whitespace from Python
            code, and convert EOLs.
        '''
        # Store the offset of the start of each line in self.lines
        self.lines = [0, 0]
        pos = 0
        # Starting lasttoken at 1 strips any leading blank line
        self.lasttoken = 1
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments

        if untabify:
            self.raw = self.raw.expandtabs()
        self.raw = self.raw.rstrip()+' '
        self.out = out

        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'
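        # (All EOLs are normalized to '\n' here; the requested EOL
        # style is applied when the buffer is written back out below.)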

        # Gather the offset of the start of each line, so a
        # (row, col) token position can be converted to an
        # absolute index into self.raw (see __call__)
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos: break
            self.lines.append(pos)

        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap the text in a file-like object
        text = StringIO.StringIO(self.raw)

        # Parse the source.
        ## Tokenize calls the __call__
        ## function for each token till done.
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError:
            traceback.print_exc()

        # Ok now we write it to a file
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)

        # Mac CR
        if eol == 'mac':
            self.lineend = '\r'
        # Windows CR LF
        elif eol == 'win':
            self.lineend = '\r\n'
        # Unix LF
        else:
            self.lineend = '\n'

        # spaces == -1 keeps blank lines untouched; otherwise runs of
        # blank lines are collapsed to at most `spaces` blank lines
        # (spaces == 0 removes them entirely)
        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip()+self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken = 0
                    self.out.write(line.rstrip()+self.lineend)
                else:
                    self.lasttoken += 1
                    if self.lasttoken <= self.spaces and self.spaces:
                        self.out.write(self.lineend)


    def __call__(self, toktype, toktext,
            (srow,scol), (erow,ecol), line):
        ''' Token handler.
        '''
        # Convert the token's (row, col) start position into an
        # absolute offset into self.raw
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # Kill the comments?
        if not self.comments:
            if toktype == tokenize.COMMENT:
                return

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.temp.write(self.lineend)
            return

        # send the original whitespace, if needed
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # send the token text to the temp file
        self.temp.write(toktext)
        return
######################################################################

def Main():
    if len(sys.argv) > 1:
        filein = open(sys.argv[1]).read()
        Stripper(filein).format(out=sys.stdout, comments=1,
                                untabify=1, eol='win')

######################################################################

if __name__ == '__main__':
    Main()
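
To try it, save the script (say, as stripper.py; the name is just an
example) and run it over a file, redirecting stdout:

    python stripper.py myscript.py > stripped.py

Or drive the class directly, e.g. to actually strip comments
(comments=0) and emit Unix line ends:

    raw = open('myscript.py').read()
    Stripper(raw).format(out=sys.stdout, comments=0, spaces=1,
                         untabify=1, eol='unix')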

M.E.Farmer



