whitespace, comment stripper, and EOL converter

MrJean1 MrJean1 at gmail.com
Sat Apr 16 15:49:33 EDT 2005


Great tool, indeed!  But doc strings stay in the source text.

If you do need to remove doc strings as well, add the following to the
__call__ method.

...      # kill doc strings
...      if not self.docstrings:
...          if toktype == tokenize.STRING and len(toktext) >= 6:
...              t = toktext.lstrip('rRuU')
...              if ((t.startswith("'''") and t.endswith("'''")) or
...                  (t.startswith('"""') and t.endswith('"""'))):
...                  return

as shown in the original post below.  Also, set self.docstrings in the
format method, similar to self.comments; the changed lines below start
with '...'.
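
For completeness, here is a rough, untested sketch of how the patched
class might be driven once both changes are in; it assumes the class is
in the same file, and the file names 'pippo1'/'pippo2' are just
placeholders borrowed from the post below:

src = open('pippo1').read()
dst = open('pippo2', 'w')
# comments=0 strips comments, docstrings=0 strips doc strings
Stripper(src).format(out=dst, comments=0, docstrings=0,
                     spaces=1, untabify=1, eol='unix')
dst.close()

Note that the added check matches any triple-quoted string literal, not
only doc strings, so a triple-quoted string assigned to a variable will
be removed as well.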


/Jean Brouwers



M.E.Farmer wrote:
> qwweeeit wrote:
> > Thanks! If you answer my posts one more time I could consider you
> > as my tutor...
> >
> > It was strange to have found a bug...! In any case I will not go
> > deeper into the matter, because your explanation is enough for me.
> > I corrected the problem by hand, removing the tokens spanning
> > multiple lines (there were only 8 cases...).
> >
> > However, I haven't understood your hint about comments...
> > I succeeded in writing a Python script which removes comments.
> >
> > Here it is (in all its cumbersome and cryptic appearance!...):
> >
> > # removeCommentsTok.py
> > import tokenize
> > Input = "pippo1"
> > Output = "pippo2"
> > f = open(Input)
> > fOut = open(Output, "w")
> >
> > nLastLine = 0
> > for i in tokenize.generate_tokens(f.readline):
> >     if i[0] == 52 and nLastLine != (i[2])[0]:
> >         fOut.write((i[4].replace(i[1], '')).rstrip() + '\n')
> >         nLastLine = (i[2])[0]
> >     elif i[0] == 4 and nLastLine != (i[2])[0]:
> >         fOut.write(i[4])
> >         nLastLine = (i[2])[0]
> > f.close()
> > fOut.close()
> >
> > Some explanations for guys like me...:
> > - 52 and 4 are the token type codes for comments and NEWLINE
> >   respectively
> > - the comment removal is done by clearing the comment text (i[1]) in
> >   the input line (i[4])
> > - I also right-trimmed the line to get rid of the remaining blanks.
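
As an aside, the magic numbers can be avoided: tokenize exports named
constants for the token types, and the numeric values (52 and 4 here)
are an implementation detail that can shift between Python releases, so
the names are safer.  A rough, untested sketch of the same loop, file
names again just placeholders:

import tokenize

fIn = open('pippo1')
fOut = open('pippo2', 'w')
nLastLine = 0
for toktype, toktext, start, end, line in tokenize.generate_tokens(fIn.readline):
    srow = start[0]                     # line where the token starts
    if toktype == tokenize.COMMENT and nLastLine != srow:
        # clear the comment text from the source line, then trim it
        fOut.write(line.replace(toktext, '').rstrip() + '\n')
        nLastLine = srow
    elif toktype == tokenize.NEWLINE and nLastLine != srow:
        fOut.write(line)
        nLastLine = srow
fIn.close()
fOut.close()
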
> Tokenizer sends multiline strings and comments as a single token.
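
A quick way to see what that means (Python 2 session, output omitted):

>>> import tokenize, token, StringIO
>>> src = 's = """line one\nline two"""\n'
>>> for tok in tokenize.generate_tokens(StringIO.StringIO(src).readline):
...     print token.tok_name[tok[0]], tok[2], tok[3], repr(tok[1])

The triple-quoted literal comes back as one STRING token whose start
and end rows differ, which is exactly the case that tripped up the
line-by-line approach above.
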
>
>
> ######################################################################
> # python comment and whitespace stripper :)
>
> ######################################################################
>
> import keyword, os, sys, traceback
> import StringIO
> import token, tokenize
> __credits__ = 'just another tool that I needed'
> __version__ = '.7'
> __author__ = 'M.E.Farmer'
> __date__ =  'Jan 15 2005, Oct 24 2004'
>
>
> ######################################################################
>
> class Stripper:
>     """python comment and whitespace stripper :)
>     """
>     def __init__(self, raw):
>         self.raw = raw
>
...   def format(self, out=sys.stdout, comments=0, docstrings=0, spaces=1,
>                 untabify=1, eol='unix'):
>         ''' strip comments, strip extra whitespace,
>             convert EOL's from Python code.
>         '''
>         # Store line offsets in self.lines
>         self.lines = [0, 0]
>         pos = 0
>         # Strips the first blank line if 1
>         self.lasttoken = 1
>         self.temp = StringIO.StringIO()
>         self.spaces = spaces
>         self.comments = comments
...       self.docstrings = docstrings
>
>         if untabify:
>            self.raw = self.raw.expandtabs()
>         self.raw = self.raw.rstrip()+' '
>         self.out = out
>
>         self.raw = self.raw.replace('\r\n', '\n')
>         self.raw = self.raw.replace('\r', '\n')
>         self.lineend = '\n'
>
>         # Gather lines
>         while 1:
>             pos = self.raw.find(self.lineend, pos) + 1
>             if not pos: break
>             self.lines.append(pos)
>
>         self.lines.append(len(self.raw))
>         # Wrap text in a filelike object
>         self.pos = 0
>
>         text = StringIO.StringIO(self.raw)
>
>         # Parse the source.
>         ## Tokenize calls the __call__
>         ## function for each token till done.
>         try:
>             tokenize.tokenize(text.readline, self)
>         except tokenize.TokenError, ex:
>             traceback.print_exc()
>
>         # Ok now we write it to a file
>         # but we also need to clean the whitespace
>         # between the lines and at the ends.
>         self.temp.seek(0)
>
>         # Mac CR
>         if eol == 'mac':
>            self.lineend = '\r'
>         # Windows CR LF
>         elif eol == 'win':
>            self.lineend = '\r\n'
>         # Unix LF
>         else:
>            self.lineend = '\n'
>
>         for line in self.temp.readlines():
>             if spaces == -1:
>                 self.out.write(line.rstrip()+self.lineend)
>             else:
>                 if not line.isspace():
>                     self.lasttoken=0
>                     self.out.write(line.rstrip()+self.lineend)
>                 else:
>                     self.lasttoken+=1
>                     if self.lasttoken<=self.spaces and self.spaces:
>                         self.out.write(self.lineend)
>
>
>     def __call__(self, toktype, toktext,
>             (srow,scol), (erow,ecol), line):
>         ''' Token handler.
>         '''
>         # calculate new positions
>         oldpos = self.pos
>         newpos = self.lines[srow] + scol
>         self.pos = newpos + len(toktext)
>
>         #kill the comments
>         if not self.comments:
>             # Kill the comments ?
>             if toktype == tokenize.COMMENT:
>                 return
>
...      # kill doc strings
...      if not self.docstrings:
...          if toktype == tokenize.STRING and len(toktext) >= 6:
...              t = toktext.lstrip('rRuU')
...              if ((t.startswith("'''") and t.endswith("'''")) or
...                  (t.startswith('"""') and t.endswith('"""'))):
...                  return

>         # handle newlines
>         if toktype in [token.NEWLINE, tokenize.NL]:
>             self.temp.write(self.lineend)
>             return
>
>         # send the original whitespace, if needed
>         if newpos > oldpos:
>             self.temp.write(self.raw[oldpos:newpos])
>
>         # skip indenting tokens
>         if toktype in [token.INDENT, token.DEDENT]:
>             self.pos = newpos
>             return
>
>        # send text to the temp file
>         self.temp.write(toktext)
>         return
>
> ######################################################################
>
> def Main():
>     import sys
>     if len(sys.argv) > 1:
>         filein = open(sys.argv[1]).read()
>         Stripper(filein).format(out=sys.stdout, comments=1,
>                                  untabify=1, eol='win')
>
>
> ######################################################################
> 
> if __name__ == '__main__':
>     Main()
> 
> M.E.Farmer



