Problem with tokenize module and indents

Wed Aug 23 17:46:10 EDT 2006

Tim wrote:
> I ran into a problem with a script I was playing with to check code
> indents and need some direction.  It seems to occur when the editor's
> tab size is set to 4 and space and tab indents are mixed on consecutive
> lines.  It works fine when the editor's tab size is 8, regardless of
> whether the indents are mixed.
>
> Below is how the three test files are laid out, along with the sample
> code and the output I get.
> Any help on how to detect this correctly would be appreciated.
>
>
> # nano -T4 tabspacing_4.py
> class Test:
>     """triple quote"""              #indent is 1 tab
>     def __init__(self, msg):    #indent is 4 spaces               <<
> this gets reported as a dedent when there is no change in indent level
>         self.msg = msg            #indent is 2 tabs
>
> #nano -T8 tabspacing_8A.py
> class Test:
>         """triple quote"""              #indent is 1 tab
>         def __init__(self, msg):    #indent is 8 spaces            << no
> indent change reported
>             self.msg = msg            #indent is 1 tab + 4 spaces
>
> #nano -T8 tabspacing_8B.py
> class Test:
>         """triple quote"""              #indent is 1 tab
>         def __init__(self, msg):    #indent is 1 tab                  <<
> no indent change reported
>             self.msg = msg            #indent is 1 tab + 4 spaces
>
>
>
> My script
>
> #!/usr/bin/env python
>
> import tokenize
> from sys import argv
>
> indent_lvl = 0
> line_number = 0
> lines = file(argv[1]).readlines()
> done = False
>
> def parse():
>
>     def feed():
>
>         global line_number, lines
>
>         if line_number < len(lines):
>             txt = lines[line_number]
>             line_number += 1
>         else:
>             txt = ''
>
>         return txt
>
>     def indents(type, token, start, end, line):
>
>         global indent_lvl, done
>
>         if type == tokenize.DEDENT:
>             indent_lvl -= 1
>         elif type == tokenize.INDENT:
>             indent_lvl += 1
>         elif type == tokenize.ENDMARKER:
>             done = True
>             return
>         else:
>             return
>
>         print "token=%s, line_number=%i, indent_lvl=%i" %
> (tokenize.tok_name[type], start[0], indent_lvl), line.strip()
>
>     while not done:
>         tokenize.tokenize(feed, indents)
>
> parse()
>
>
> $ ./sample.py tabspacing_4.py
> token=INDENT, line_number=3, indent_lvl=1 """triple quote"""
> #indent is 1 tab
> token=DEDENT, line_number=4, indent_lvl=0 def __init__(self, msg):
> #indent is 4 spaces      <-- PROBLEM HERE
> token=INDENT, line_number=5, indent_lvl=1 self.msg = msg
> #indent is 2 tabs
> token=DEDENT, line_number=8, indent_lvl=0
>
> $ ./sample.py tabspacing_8A.py
> token=INDENT, line_number=3, indent_lvl=1 """triple quote"""
> #indent is 1 tab
> token=INDENT, line_number=5, indent_lvl=2 self.msg = msg
> #indent is 1 tab + 4 spaces
> token=DEDENT, line_number=8, indent_lvl=1
> token=DEDENT, line_number=8, indent_lvl=0
>
> $ ./sample.py tabspacing_8B.py
> token=INDENT, line_number=3, indent_lvl=1 """triple quote"""
> #indent is 1 tab
> token=INDENT, line_number=5, indent_lvl=2 self.msg = msg
> #indent is 1 tab + 4 spaces
> token=DEDENT, line_number=8, indent_lvl=1
> token=DEDENT, line_number=8, indent_lvl=0

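Well, the simple answer is "Don't mix tabs and spaces."  But if that's
unhelpful ;-), check out the tabnanny script (now in the standard
library) and also the expandtabs() method of strings.  (The tokenize
module, like the interpreter itself, treats a tab as advancing to the
next multiple of 8 columns no matter what tab size your editor displays.
So a line indented with 4 spaces that follows a line indented with 1 tab
really is a DEDENT as far as Python is concerned.)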

http://docs.python.org/lib/module-tabnanny.html

Peace,
~Simon