Python Documentation generation

Thu Jun 27 17:05:40 EDT 2002

On Thu, 20 Jun 2002 14:49:14 -0400, "Mahrt, Dallas"
<dallasm at aiinet.com> wrote:

>I am looking for an application which can generate HTML documentation from
>Python modules.
>(Note: some examples based on JavaDoc syntax)

You can use doxygen (http://www.doxygen.org) to create such
documentation. Basically, it's a tool like JavaDoc but for the C/C++
language. It has no built-in support for Python, but it allows files
to be passed through filters before they are actually processed.
I've written a filter that converts Python code into something that
looks like a C++ header file (see below). That way doxygen can also
be used to document Python code.

Every comment starting with ## and every docstring is literally
translated into a doxygen comment block. 

>I have a few desires from such a program:
>	- Ability to exclude elements based on naming conventions or
>keywords 
>		(ex. Exclude names starting with a single underscore '_foo'
>or containing the keyword @private)

My filter understands the special comments:

## public: 
## protected:
## private:

which will insert the corresponding keyword in the C++ code. Thus, you
can mark methods as protected or private and tell doxygen to exclude
them from the documentation. (However, this won't work for functions).

>	- Ability to have inter module linking. (If module A uses module B,
>then there should be an '@see B' like link [JavaDoc-ish])

You can add such commands into the doxygen comments (just like
JavaDoc). Here you should know that my filter puts everything into a
namespace with the same name as the file name. This means, if you
manually link to other parts of the documentation you have to include
the namespace.

>	- The program should also allow for either:
>		- generating module documentation from C extension modules
>(Should be tricky)

Well, that's what doxygen was actually written for. :)

>		- allow for other documentation sections to be linked
>automatically by registration of module name

Uhm, I'm not sure if I know what you mean here. Maybe it's this: If
you mention a class name in your documentation this class name will
automatically appear as a link to the documentation of the class
(however, the "namespace feature" often prevents the automatic linking
from working properly).

- Matthias -

---------------------------------------------------------------------------------------------------
Here's the filter (pythfilter.py), I hope the word wrapping didn't
mangle the script....

#! /usr/bin/python

# Doxygen filter which can be used to document Python source code.
# Classes (incl. methods) and functions can be documented.
# Every comment that begins with ## is literally turned into a
# Doxygen comment. Consecutive comment lines are turned into
# comment blocks (-> /** ... */).
# All the stuff is put inside a namespace with the same name as
# the source file.

# Conversions:
# ============
# ##-blocks                  ->  /** ... */
# "class name(base): ..."    ->  "class name : public base {...}"
# "def name(params): ..."    ->  "name(params) {...}"

# Written by Matthias Baas (baas at ira.uka.de)

# Changelog:
# 18.06.2002: There is now also a "## public:" marker
# 21.01.2002: "from ... import" is translated into "using namespace ...;"
#             TODO: "from ... import *" vs "from ... import names"
#             TODO: For a plain import: name.name -> name::name
# 20.01.2002: #includes are placed BEFORE the namespace

######################################################################

# The program is written as a state machine with the following states:
#
# - OUTSIDE               The current position is outside any comment,
#                         class definition or function.
#
# - BUILD_COMMENT         Begins with first "##".
#                         Ends with the first token that is no "##"
#                         at the same column as before.
#
# - BUILD_CLASS_DECL      Begins with "class".
#                         Ends with ":"
# - BUILD_CLASS_BODY      Begins just after BUILD_CLASS_DECL.
#                         The first following token (which is no comment)
#                         determines indentation depth.
#                         Ends with a token that has a smaller indentation.
#
# - BUILD_DEF_DECL        Begins with "def".
#                         Ends with ":".
# - BUILD_DEF_BODY        Begins just after BUILD_DEF_DECL.
#                         The first following token (which is no comment)
#                         determines indentation depth.
#                         Ends with a token that has a smaller indentation.

import sys,os.path,string,token,tokenize

# Parser states; the entry on top of stateStack is the active one.
(OUTSIDE,
 BUILD_COMMENT,
 BUILD_CLASS_DECL,
 BUILD_CLASS_BODY,
 BUILD_DEF_DECL,
 BUILD_DEF_BODY,
 IMPORT,
 IMPORT_OP,
 IMPORT_APPEND) = range(9)

# Buffered output strings plus the row/column reached by output() so far.
outbuffer = []
out_row = out_col = 0

# Results and internal state of rec_name_n_param().
name = param = doc_string = ""
record_state = 0

# Start positions, as (row, column) tuples, of the most recent
# "class", "def" and "import"/"from" keywords.
class_spos = def_spos = import_spos = (0, 0)

# Keyword that opened the current import statement ("import" or "from").
import_token = ""

# Lines of the comment block being collected, and a flag telling
# whether that block has been ended by a non-comment token.
comment_block = []
comment_finished = 0

# Module names collected from the current import statement.
modules = []

# Stack of parser states; starts outside of any construct.
stateStack = [OUTSIDE]

######################################################################
# Output string s. '\n' may only be at the end of the string (not
# somewhere in the middle).
#
# In: s    - String
#     spos - Startpos
######################################################################
def output(s,spos, immediate=0):
  """Write ``s`` so that it starts at column ``spos[1]`` of the output.

  The string is left-padded with blanks from the current output column
  (``out_col``) up to the requested column.  If the requested column is
  not to the right of the current one, no padding is added.

  In: s         - String to write.  A newline may only appear at the
                  very end of the string.
      spos      - Start position as a (row, column) tuple; only the
                  column is used.
      immediate - If true, the text goes straight to sys.stdout instead
                  of being appended to ``outbuffer``.

  ``out_row`` and ``out_col`` are updated in both cases.
  """
  global out_row, out_col

  # str.rjust replaces the deprecated string.rjust(); the result is the
  # same, and the local no longer hides the imported "os" package.
  padded = s.rjust(spos[1] - out_col + len(s))
  if immediate:
    sys.stdout.write(padded)
  else:
    outbuffer.append(padded)

  if s.endswith("\n"):
    # A finished line: move to the start of the next row.
    out_row = out_row + 1
    out_col = 0
  else:
    out_col = spos[1] + len(s)

######################################################################
# Records a name and parameters. The name is either a class name or
# a function name. Then the parameter is either the base class or
# the function parameters.
# The name is stored in the global variable "name", the parameters
# in "param".
# The variable "record_state" holds the current state of this internal
# state machine.
# The recording is started by calling start_recording().
#
# In: type, tok 
######################################################################
def rec_name_n_param(type, tok):
  """Feed one token to the name/parameter recorder.

  The recorder is a small state machine driven by ``record_state``:

    0 - idle, the token is ignored
    1 - the token is stored as ``name``
    2 - wait for "(" (go to 3) or ":" (go to 4)
    3 - append tokens to ``param`` until ")" is seen (go to 4)
    4 - look for a doc string; the first token that is not skipped
        ends the recording (back to 0)

  In: type - Token type.
      tok  - Token string.

  Results are left in the globals ``name``, ``param`` and
  ``doc_string``.
  """
  global record_state,name,param,doc_string
  s = record_state
  # State 0: Do nothing.
  if s==0:
    return
  # State 1: Remember name.
  elif s==1:
    name = tok
    record_state = 2
  # State 2: Wait for opening bracket or colon
  elif s==2:
    if tok=='(': record_state=3
    if tok==':': record_state=4
  # State 3: Store parameter (or base class) and wait for an ending bracket
  elif s==3:
    if tok==')':
      record_state=4
    else:
      param=param+tok
  # State 4: Look for doc string
  elif s==4:
    # NOTE(review): token.SLASHEQUAL is kept from the original; it is
    # presumably a stand-in for some other token id - confirm which
    # token was meant before changing this list.
    if type in (token.NEWLINE, token.INDENT, token.SLASHEQUAL):
      return
    elif tok==":":
      return
    elif type==token.STRING:
      # Remove the surrounding quote characters.  The closing character
      # tells which kind of quote was used, so single-quoted doc
      # strings are handled as well as double-quoted ones.
      quote = tok[-1:]
      if quote in ('"', "'"):
        tok = tok.strip(quote)
      doc_string=tok
    record_state=0

######################################################################
# Starts the recording of a name & param part.
# The function rec_name_n_param() has to be fed with tokens. After
# the necessary tokens are fed the name and parameters can be found
# in the global variables "name" and "param".
######################################################################
def start_recording():
  """Arm the recorder so that the next token fed to it becomes the name.

  Clears ``name``, ``param`` and ``doc_string`` and puts the recorder
  into state 1.
  """
  global record_state,param,name, doc_string
  name = param = doc_string = ""
  record_state = 1

######################################################################
# Test if recording is finished
######################################################################
def is_recording_finished():
  """Return true when the recorder is idle, i.e. ``record_state`` is 0."""
  finished = (record_state == 0)
  return finished

######################################################################
## Gather comment block
######################################################################
def gather_comment(type,tok,spos):
  """Collect the text of "##" comments into ``comment_block``.

  In: type - Token type.
      tok  - Token string.
      spos - Start position of the token.

  A token that is not a comment only marks the current block as
  finished.  A comment token that arrives after the block was marked
  finished first makes print_comment() emit the old block.  Comments
  starting with exactly two "#" are stored without that prefix;
  comments starting with "###" or a single "#" are not stored.
  """
  global comment_finished
  if type != tokenize.COMMENT:
    comment_finished = 1
    return

  # A new run of comments begins: get rid of the previous block first.
  if comment_finished:
    print_comment(spos)
    comment_finished = 0

  if tok.startswith("##") and not tok.startswith("###"):
    comment_block.append(tok[2:])

######################################################################
## Output comment block and empty buffer.
######################################################################
def print_comment(spos):
  """Emit the collected comment block and start a fresh one.

  In: spos - Position passed on to output() for every piece written.

  A non-empty block is written as "/**", the stored lines, then "*/".
  Afterwards ``comment_block`` is an empty list and
  ``comment_finished`` is 0, whether or not anything was written.
  """
  global comment_block,comment_finished
  if comment_block:
    for piece in ["/**\n"] + comment_block + ["*/\n"]:
      output(piece, spos)
  comment_block, comment_finished = [], 0

######################################################################
def set_state(s):
  """Replace the state on top of ``stateStack`` with ``s``."""
  stateStack[-1] = s

######################################################################
def get_state():
  """Return the state on top of ``stateStack`` without removing it."""
  return stateStack[-1]

######################################################################
def push_state(s):
  """Make ``s`` the new top entry of ``stateStack``."""
  global stateStack
  stateStack += [s]

######################################################################
def pop_state():
  """Discard the top entry of ``stateStack``."""
  del stateStack[-1]

######################################################################
def tok_eater(type, tok, spos, epos, line):
  """Token callback: advance the filter's state machine by one token.

  In: type - Token type.
      tok  - Token string.
      spos - Start position of the token as (row, column).
      epos - End position of the token (unused).
      line - Source line containing the token (unused).

  Every token is first fed to rec_name_n_param().  A "## private:",
  "## protected:" or "## public:" marker (blanks are ignored) is
  written out as the corresponding access label; every other token is
  passed to gather_comment().  The token is then handled according to
  the state on top of the state stack.

  A token that completes a class declaration, or that terminates a
  class or function body, is handed on to the state that becomes
  current.  Without that hand-over such a token was swallowed, so the
  first "def" of a class without doc string, or a "def"/"class" that
  directly follows a one-line body, was never seen.
  """
  global stateStack,name,param,class_spos,def_spos,import_spos
  global doc_string, modules, import_token

  # Token types that can never terminate a class or function body.
  # NOTE(review): the bare 40 is kept from the original; it looks like
  # a numeric token id of some tokenize release - confirm before
  # removing it.
  body_neutral = (token.INDENT, token.NEWLINE, 40, tokenize.NL)

  rec_name_n_param(type,tok)

  # Normalise once, so "##private:" and "## private :" are the same.
  marker = tok.strip().replace(" ","")
  if marker in ("##private:", "##protected:", "##public:"):
    output(marker[2:]+"\n",spos)
  else:
    gather_comment(type,tok,spos)

  while 1:
    state = get_state()
    # Set when this token must also be seen by the state that has just
    # become current.
    redispatch = 0

    # OUTSIDE
    if state==OUTSIDE:
      if tok=="class":
        start_recording()
        class_spos = spos
        push_state(BUILD_CLASS_DECL)
      elif tok=="def":
        start_recording()
        def_spos = spos
        push_state(BUILD_DEF_DECL)
      elif tok=="import" or tok=="from":
        import_token = tok
        import_spos = spos
        modules     = []
        push_state(IMPORT)
    # IMPORT
    elif state==IMPORT:
      if type==token.NAME:
        modules.append(tok)
        set_state(IMPORT_OP)
    # IMPORT_OP
    elif state==IMPORT_OP:
      if tok==".":
        set_state(IMPORT_APPEND)
      elif tok==",":
        set_state(IMPORT)
      else:
        # Any other token ends the module list.  It is deliberately
        # not handed on: the "import" of "from x import y" would
        # otherwise start a second import statement.
        for m in modules:
          output('#include "'+m+'.py"\n', import_spos, immediate=1)
          if import_token=="from":
            output('using namespace '+m+';\n', import_spos)
        pop_state()
    # IMPORT_APPEND
    elif state==IMPORT_APPEND:
      if type==token.NAME:
        modules[-1] += "."+tok
        set_state(IMPORT_OP)
    # BUILD_CLASS_DECL
    elif state==BUILD_CLASS_DECL:
      if is_recording_finished():
        s = "class "+name
        if param!="": s = s+" : public "+param
        if doc_string!="": comment_block.append(doc_string)
        print_comment(class_spos)
        output(s+"\n",class_spos)
        output("{\n",(class_spos[0]+1,class_spos[1]))
        output("  public:\n",(class_spos[0]+2,class_spos[1]))
        set_state(BUILD_CLASS_BODY)
        # The token that ended the recording may itself be a "def".
        redispatch = 1
    # BUILD_CLASS_BODY
    elif state==BUILD_CLASS_BODY:
      if (type not in body_neutral and type!=tokenize.COMMENT
          and spos[1]<=class_spos[1]):
        output("}; // end of class\n",(out_row+1,class_spos[1]))
        pop_state()
        redispatch = 1
      elif tok=="def":
        start_recording()
        def_spos = spos
        push_state(BUILD_DEF_DECL)
    # BUILD_DEF_DECL
    elif state==BUILD_DEF_DECL:
      if is_recording_finished():
        # Only the declaration is written; the body is not reproduced.
        s = name+"("+param+");\n"
        if doc_string!="": comment_block.append(doc_string)
        print_comment(def_spos)
        output(s,def_spos)
        set_state(BUILD_DEF_BODY)
    # BUILD_DEF_BODY
    elif state==BUILD_DEF_BODY:
      if type not in body_neutral and spos[1]<=def_spos[1]:
        pop_state()
        redispatch = 1

    if not redispatch:
      break

def dump(filename):
  """Copy the file ``filename`` unchanged to standard output.

  In: filename - Path of the file to copy.

  Raises IOError if the file cannot be opened or read.
  """
  f = open(filename)
  try:
    for line in f:
      sys.stdout.write(line)
  finally:
    # Close the file even when reading or writing fails.
    f.close()

def filter(filename):
  """Convert the Python file ``filename`` and write the result to stdout.

  In: filename - Path of the Python source file.

  The converted text is wrapped in a namespace named after the file
  (directory and extension removed).  All tokens of the file are sent
  through tok_eater(); a comment block that is still pending afterwards
  is flushed, the namespace is closed and the buffered output is
  written to standard output.  A progress message goes to stderr.

  Raises IOError if the file cannot be opened or read.
  """
  root = os.path.splitext(os.path.basename(filename))[0]

  output("namespace "+root+" {\n",(0,0))

  sys.stderr.write('Filtering "'+filename+'"...')
  f = open(filename)
  try:
    tokenize.tokenize(f.readline, tok_eater)
  finally:
    # Do not leak the file handle when tokenizing fails.
    f.close()
  print_comment((0,0))

  output("\n",(0,0))
  output("}  // end of namespace\n",(0,0))

  for s in outbuffer:
    sys.stdout.write(s)

######################################################################
######################################################################
######################################################################

# Script entry: all command line arguments together form the file name
# (joined with single blanks).  Files ending in ".py" are converted,
# everything else is passed through unchanged.
try:
  filename = " ".join(sys.argv[1:])
  root,ext  = os.path.splitext(filename)

  if ext==".py":
    filter(filename)
  else:
    dump(filename)

  sys.stderr.write("OK\n")
except IOError:
  # Fetching the exception through sys.exc_info() avoids the
  # "except IOError,e" spelling, which newer interpreters reject.
  e = sys.exc_info()[1]
  # strerror is unset when the exception carries a single argument;
  # fall back to the exception text instead of failing on e[1].
  sys.stderr.write((e.strerror or str(e))+"\n")