[I18n-sig] pygettext.py extraction of docstrings

Thu, 26 Oct 2000 15:48:33 -0400 (EDT)

Hi all,

I have a tentative patch for Tools/i18n/pygettext.py which adds
optional extraction of module, class, method, and function
docstrings.  Francois Pinard's po-utils does something similar I
believe, and it makes a lot of sense to add this.

If you provide the -D/--docstrings flag, then it'll extract these
docstrings without requiring them to be wrapped in _() markers.  You'd,
of course, still need to send the strings through a translation step,
but that's okay because you'll probably want deferred translation of
them anyway.

I've only done some minimal testing so I don't know how easy it is to
confuse the TokenEater.

One question: should docstring extraction be turned on by default?

Attached is a patch against Python 2.0's pygettext.py.

-Barry

-------------------- snip snip --------------------
Index: pygettext.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Tools/i18n/pygettext.py,v
retrieving revision 1.9
diff -u -r1.9 pygettext.py

--- pygettext.py	2000/05/02 19:28:30	1.9
+++ pygettext.py	2000/10/26 19:43:18
@@ -4,16 +4,8 @@
 # minimally patched to make it even more xgettext compatible 
 # by Peter Funk <pf@artcom-gmbh.de>
 
-# for selftesting
-try:
-    import fintl
-    _ = fintl.gettext
-except ImportError:
-    def _(s): return s
+"""pygettext -- Python equivalent of xgettext(1)
 
-
-__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
-
 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
 internationalization of C programs.  Most of these tools are independent of
 the programming language and can be used from within Python programs.  Martin
@@ -65,7 +57,12 @@
 
     -E
     --escape
-        replace non-ASCII characters with octal escape sequences.
+        Replace non-ASCII characters with octal escape sequences.
+
+    -D
+    --docstrings
+        Extract module, class, method, and function docstrings.  This requires
+        an import of the specified module, so beware of import side effects.
 
     -h
     --help
@@ -132,15 +129,22 @@
 
 If `inputfile' is -, standard input is read.
 
-""")
+"""
 
 import os
 import sys
 import time
 import getopt
 import tokenize
+
+# for selftesting
+try:
+    import fintl
+    _ = fintl.gettext
+except ImportError:
+    def _(s): return s
 
-__version__ = '1.1'
+__version__ = '1.2'
 
 default_keywords = ['_']
 DEFAULTKEYWORDS = ', '.join(default_keywords)
@@ -171,9 +175,9 @@
 
 
 def usage(code, msg=''):
-    print __doc__ % globals()
+    print >> sys.stderr, _(__doc__) % globals()
     if msg:
-        print msg
+        print >> sys.stderr, msg
     sys.exit(code)
 
 
@@ -239,15 +243,48 @@
         self.__state = self.__waiting
         self.__data = []
         self.__lineno = -1
+        self.__freshmodule = 1
 
     def __call__(self, ttype, tstring, stup, etup, line):
         # dispatch
+##        import token
+##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
+##              'tstring:', tstring
         self.__state(ttype, tstring, stup[0])
 
     def __waiting(self, ttype, tstring, lineno):
+        # Do docstring extractions, if enabled
+        if self.__options.docstrings:
+            # module docstring?
+            if self.__freshmodule:
+                if ttype == tokenize.STRING:
+                    self.__addentry(safe_eval(tstring), lineno)
+                    self.__freshmodule = 0
+                elif ttype not in (tokenize.COMMENT, tokenize.NL):
+                    self.__freshmodule = 0
+                return
+            # class docstring?
+            if ttype == tokenize.NAME and tstring in ('class', 'def'):
+                self.__state = self.__suiteseen
+                return
         if ttype == tokenize.NAME and tstring in self.__options.keywords:
             self.__state = self.__keywordseen
 
+    def __suiteseen(self, ttype, tstring, lineno):
+        # ignore anything until we see the colon
+        if ttype == tokenize.OP and tstring == ':':
+            self.__state = self.__suitedocstring
+
+    def __suitedocstring(self, ttype, tstring, lineno):
+        # ignore any intervening noise
+        if ttype == tokenize.STRING:
+            self.__addentry(safe_eval(tstring), lineno)
+            self.__state = self.__waiting
+        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
+                           tokenize.COMMENT):
+            # there was no class docstring
+            self.__state = self.__waiting
+
     def __keywordseen(self, ttype, tstring, lineno):
         if ttype == tokenize.OP and tstring == '(':
             self.__data = []
@@ -263,58 +300,54 @@
             # of messages seen.  Reset state for the next batch.  If there
             # were no strings inside _(), then just ignore this entry.
             if self.__data:
-                msg = EMPTYSTRING.join(self.__data)
-                if not msg in self.__options.toexclude:
-                    entry = (self.__curfile, self.__lineno)
-                    linenos = self.__messages.get(msg)
-                    if linenos is None:
-                        self.__messages[msg] = [entry]
-                    else:
-                        linenos.append(entry)
+                self.__addentry(EMPTYSTRING.join(self.__data))
             self.__state = self.__waiting
         elif ttype == tokenize.STRING:
             self.__data.append(safe_eval(tstring))
         # TBD: should we warn if we seen anything else?
 
+    def __addentry(self, msg, lineno=None):
+        if lineno is None:
+            lineno = self.__lineno
+        if not msg in self.__options.toexclude:
+            entry = (self.__curfile, lineno)
+            self.__messages.setdefault(msg, []).append(entry)
+
     def set_filename(self, filename):
         self.__curfile = filename
 
     def write(self, fp):
         options = self.__options
         timestamp = time.ctime(time.time())
-        # common header
-        try:
-            sys.stdout = fp
-            # The time stamp in the header doesn't have the same format
-            # as that generated by xgettext...
-            print pot_header % {'time': timestamp, 'version': __version__}
-            for k, v in self.__messages.items():
-                if not options.writelocations:
-                    pass
-                # location comments are different b/w Solaris and GNU:
-                elif options.locationstyle == options.SOLARIS:
-                    for filename, lineno in v:
-                        d = {'filename': filename, 'lineno': lineno}
-                        print _('# File: %(filename)s, line: %(lineno)d') % d
-                elif options.locationstyle == options.GNU:
-                    # fit as many locations on one line, as long as the
-                    # resulting line length doesn't exceeds 'options.width'
-                    locline = '#:'
-                    for filename, lineno in v:
-                        d = {'filename': filename, 'lineno': lineno}
-                        s = _(' %(filename)s:%(lineno)d') % d
-                        if len(locline) + len(s) <= options.width:
-                            locline = locline + s
-                        else:
-                            print locline
-                            locline = "#:" + s
-                    if len(locline) > 2:
-                        print locline
-                # TBD: sorting, normalizing
-                print 'msgid', normalize(k)
-                print 'msgstr ""\n'
-        finally:
-            sys.stdout = sys.__stdout__
+        # The time stamp in the header doesn't have the same format as that
+        # generated by xgettext...
+        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
+        for k, v in self.__messages.items():
+            if not options.writelocations:
+                pass
+            # location comments are different b/w Solaris and GNU:
+            elif options.locationstyle == options.SOLARIS:
+                for filename, lineno in v:
+                    d = {'filename': filename, 'lineno': lineno}
+                    print >> fp, _('# File: %(filename)s, line: %(lineno)d') \
+                          % d
+            elif options.locationstyle == options.GNU:
+                # fit as many locations on one line, as long as the
+                # resulting line length doesn't exceeds 'options.width'
+                locline = '#:'
+                for filename, lineno in v:
+                    d = {'filename': filename, 'lineno': lineno}
+                    s = _(' %(filename)s:%(lineno)d') % d
+                    if len(locline) + len(s) <= options.width:
+                        locline = locline + s
+                    else:
+                        print >> fp, locline
+                        locline = "#:" + s
+                if len(locline) > 2:
+                    print >> fp, locline
+            # TBD: sorting, normalizing
+            print >> fp, 'msgid', normalize(k)
+            print >> fp, 'msgstr ""\n'
 
 
 def main():
@@ -322,11 +355,12 @@
     try:
         opts, args = getopt.getopt(
             sys.argv[1:],
-            'ad:Ehk:Kno:p:S:Vvw:x:',
+            'ad:DEhk:Kno:p:S:Vvw:x:',
             ['extract-all', 'default-domain', 'escape', 'help',
              'keyword=', 'no-default-keywords',
              'add-location', 'no-location', 'output=', 'output-dir=',
              'style=', 'verbose', 'version', 'width=', 'exclude-file=',
+             'docstrings',
              ])
     except getopt.error, msg:
         usage(1, msg)
@@ -347,6 +381,7 @@
         verbose = 0
         width = 78
         excludefilename = ''
+        docstrings = 0
 
     options = Options()
     locations = {'gnu' : options.GNU,
@@ -363,6 +398,8 @@
             options.outfile = arg + '.pot'
         elif opt in ('-E', '--escape'):
             options.escape = 1
+        elif opt in ('-D', '--docstrings'):
+            options.docstrings = 1
         elif opt in ('-k', '--keyword'):
             options.keywords.append(arg)
         elif opt in ('-K', '--no-default-keywords'):