[Mailman-i18n] translation checker

Fri, 24 May 2002 23:23:44 +0200

--jI8keyz6grp/JLjh
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi everyone,

During the Italian translation of Mailman I made many typos
which, in the worst case, caused Mailman to misbehave.  I think the
real problem is that I couldn't find any editor or tool which does
the following checks:

 - when translating a text template, if a given %s or %(var)s is 
   in the original file it probably should appear the same number
   of times in the translated text.

 - in the translated text you shouldn't have a %(var)s if that
   %(var)s wasn't in the original text.

 - when translating an html template, the same concept should 
   apply for <MM-*> tags.

 - when translating the .po file, the same concept should apply
   for each msgid/msgstr pair.

So, to ease the process, I've written a small script to check my
translation whenever I make an update.  This script has been
generalized and now you can use it for your language too.

Just to give an overview of how effective the tool can be, this
simple bash script:
-----------------------------------------------
for i in big5 cs de es fi fr hu it ja ko no ru; 
do 
	echo -ne "$i:\t"; 
	transcheck -q $i; 
done
-----------------------------------------------
applied to the last Mailman-CVS has reported:

big5:   33 warnings in 9 files
cs:     192 warnings in 3 files
de:     136 warnings in 6 files
es:     115 warnings in 12 files
fi:     312 warnings in 6 files
fr:     58 warnings in 4 files
hu:     105 warnings in 6 files
it:     1 warnings in 1 files
ja:     182 warnings in 3 files
ko:     231 warnings in 8 files
no:     30 warnings in 3 files
ru:     341 warnings in 13 files

Before using my script, the Italian translation counted about 70
warnings.  The one still counted is not actually an error: a
%(var)s appears twice in the original English text, but in
Italian we intentionally have only one.

Using the script without the -q switch, you can see a detailed 
report.

The script isn't perfect and can be improved in many ways:
 - better regexp to search for Python %(var)s
 - better .po parser
 - better exception handling for strange situations
 - generalization for use in other Python projects
but it's already good (at least for Italian).

Feel free to use it and to report any feedback.

Cheers,
Simone

-- 
Simone Piunno, FerraraLUG - http://members.ferrara.linux.it/pioppo

--jI8keyz6grp/JLjh
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=transcheck

#!/usr/bin/python
#
# transcheck - (c) 2002 by Simone Piunno <pioppo@ferrara.linux.it>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the version 2.0 of the GNU General Public License as
# published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
# 
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc., 
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

"""
Check a given Mailman translation, making sure that variables and 
tags referenced in translation are the same variables and tags in 
the original templates and catalog.

Usage:

cd $MAILMAN_DIR
%(program)s [-q] <lang>

Where <lang> is your country code (e.g. 'it' for Italy) and -q is 
to ask for a brief summary.
"""

import sys
import re
import os
import getopt

class TransChecker:
	"""Check a translation by comparing its placeholders with the original.

	Every match of the regular expression found in the original text is
	"checked in"; every match found in the translated text is "checked
	out".  After both texts have been scanned, computeErrors() reports the
	placeholders whose counts do not balance.
	"""
	def __init__(self, regexp):
		"regexp is the pattern whose matches (via findall) are counted"
		# match -> occurrences checked in minus occurrences checked out
		self.dict = {}
		# human-readable descriptions of the problems found so far
		self.errs = []
		self.regexp = re.compile(regexp)

	def checkin(self, string):
		"scan a string from the original file"
		for key in self.regexp.findall(string):
			# dict.get() instead of has_key(), which no longer
			# exists in Python 3
			self.dict[key] = self.dict.get(key, 0) + 1

	def checkout(self, string):
		"scan a translated string"
		for key in self.regexp.findall(string):
			try:
				self.dict[key] -= 1
			except KeyError:
				# the translation uses something the original
				# never mentioned
				self.errs.append(
					"%(key)s was not found" %
					{ 'key' : key }
				)

	def computeErrors(self):
		"""check for differences between checked in and checked out

		Appends one message per unbalanced key to the error list and
		returns that list (which also holds the errors recorded by
		checkout).  Call it once per reset(): calling it again appends
		the same messages a second time.
		"""
		for key, balance in self.dict.items():
			if balance < 0:
				self.errs.append(
					"Too much %(key)s" %
					{ 'key'  : key }
				)
			if balance > 0:
				self.errs.append(
					"Too few %(key)s" %
					{ 'key'  : key }
				)
		return self.errs

	def status(self):
		"return 'FAILED' if any error has been recorded, 'OK' otherwise"
		if self.errs:
			return "FAILED"
		else:
			return "OK"

	def errorsAsString(self):
		"return all recorded errors as one string, each prefixed by ' - '"
		return "".join(
			[" - %(err)s" % { 'err': err } for err in self.errs]
		)

	def reset(self):
		"forget all counters and errors, ready for the next pair of texts"
		self.dict = {}
		self.errs = []

class POParser:
	"""Parse a .po file extracting msgids and msgstrs.

	Each successful call to parse() leaves one catalog entry in the
	attributes msgid, msgstr and files (the words found on the "#:"
	lines preceding the entry).  The attribute line holds the parser's
	running line count.
	"""
	def __init__(self, filename=""):
		"create the parser, opening filename right away if one is given"
		# current state of the finite-states-machine, see parse()
		self.status = 0
		self.files = []
		self.msgid = ""
		self.msgstr = ""
		self.line = 1
		self.f = None
		# backslash escapes that translate to a control character;
		# any other escaped char stands for itself
		self.esc = { "n": "\n", "r": "\r", "t": "\t" }
		if filename:
			self.f = open(filename)

	def open(self, filename):
		"open the .po file to be parsed"
		self.f = open(filename)

	def close(self):
		"close the .po file, if one was opened"
		if self.f is not None:
			self.f.close()

	def _read_string(self):
		"""Read a quoted string whose opening '"' was already consumed.

		Returns a (text, complete) pair: text is the unescaped content
		read so far, complete is 1 if the closing quote was found and 0
		if the file ended first.
		"""
		chars = []
		while 1:
			c = self.f.read(1)
			if not c:
				# EOF inside the string
				return "".join(chars), 0
			if c == "\\":
				# a quoted char...
				c = self.f.read(1)
				chars.append(self.esc.get(c, c))
				continue
			if c == "\"":
				# end of string found
				return "".join(chars), 1
			# a normal char, add it
			chars.append(c)

	def parse(self):
		"""Read the next msgid/msgstr pair from the file.

		Returns 1 when an entry was read (see msgid, msgstr, files) and
		0 when there is nothing more to read.  Text that does not look
		like a .po entry trips an assertion.

		States table for the finite-states-machine parser:
			0  idle
			1  filename-or-comment
			2  msgid
			3  msgstr
			4  end
		"""
		# each time we can safely re-initialize those vars
		self.files = []
		self.msgid = ""
		self.msgstr = ""

		# can't continue if status == 4, this is a dead status
		if self.status == 4:
			return 0

		while 1:
			# continue scanning, char-by-char
			c = self.f.read(1)
			if not c:
				# EOF -> maybe we have a msgstr to save?
				self.status = 4
				if self.msgstr:
					return 1
				else:
					return 0

			# keep the line count up-to-date
			if c == "\n":
				self.line += 1

			# a pound was detected the previous char...
			if self.status == 1:
				if c == ":":
					# was a line of filenames
					row = self.f.readline()
					self.files += row.split()
					self.line += 1
				elif c == "\n":
					# was a single pound on the line
					pass
				else:
					# was a comment... discard
					self.f.readline()
					self.line += 1
				# in every case, we switch to idle status
				self.status = 0
				continue

			# in idle status we search for a '#' or for a 'm'
			if self.status == 0:
				if c == "#":
					# this could be a comment or a filename
					self.status = 1
					continue
				elif c == "m":
					# this should be a msgid start...
					s = self.f.read(4)
					assert s == "sgid", \
						"expected 'msgid' near line %d" % self.line
					# so now we search for a '"'
					self.status = 2
					continue
				# in idle only those other chars are possible
				assert c in [ "\n", " ", "\t" ], \
					"unexpected %r near line %d" % (c, self.line)

			# searching for the msgid string
			if self.status == 2:
				if c == "\n":
					# a double LF is not possible here
					c = self.f.read(1)
					assert c != "\n", \
						"blank line inside msgid near line %d" % self.line
				if c == "\"":
					# ok, this is the start of the string,
					# now search for the end
					text, complete = self._read_string()
					self.msgid += text
					if not complete:
						# EOF, bailout
						self.status = 4
						return 0
				if c == "m":
					# this should be a msgstr identifier
					s = self.f.read(5)
					assert s == "sgstr", \
						"expected 'msgstr' near line %d" % self.line
					# ok, now search for the msgstr string
					self.status = 3

			# searching for the msgstr string
			if self.status == 3:
				if c == "\n":
					# a double LF is the end of the msgstr!
					c = self.f.read(1)
					if c == "\n":
						# ok, time to go idle and return
						self.status = 0
						self.line += 1
						return 1
				if c == "\"":
					# start of string found; its content,
					# escaped chars included, belongs to
					# the msgstr and not to the msgid
					text, complete = self._read_string()
					self.msgstr += text
					if not complete:
						# EOF, bail out
						self.status = 4
						return 1

def check_file(translatedFile, originalFile, html=0, quiet=0):
	"""Check a translated template against the original one.

	Every %(var)s / %(var)d reference is compared line by line; when html
	is not zero, <MM-*> tags are compared as well.

	Returns the number of problems found.  A file that cannot be opened
	counts as one problem.  Unless quiet is set, each problem is printed.
	"""

	# raw strings, so that the backslashes reach the regexp engine as-is
	if html:
		c = TransChecker(r"(%\([^)]+\)[0-9]*[sd]|</?MM-[^>]+>)")
	else:
		c = TransChecker(r"(%\([^)]+\)[0-9]*[sd])")

	try:
		f = open(originalFile)
	except IOError:
		if not quiet:
			print(" - Can't open original file " + originalFile)
		return 1

	# try/finally makes sure the file is closed even if scanning fails
	try:
		for line in f.readlines():
			c.checkin(line)
	finally:
		f.close()

	try:
		f = open(translatedFile)
	except IOError:
		if not quiet:
			print(" - Can't open translated file " + translatedFile)
		return 1

	try:
		for line in f.readlines():
			c.checkout(line)
	finally:
		f.close()

	n = 0
	for desc in c.computeErrors():
		n += 1
		if not quiet:
			# single-argument print(...) behaves the same whether
			# print is a statement or a function
			print(" - %(desc)s" % { 'desc': desc })
	return n

def check_po(file, quiet=0):
	"""Scan the po file comparing msgids with msgstrs.

	For every msgid/msgstr pair, the %(var)s-style and plain %s-style
	references of the two strings are compared.

	Returns the number of problems found.  A catalog that cannot be
	opened counts as one problem, as check_file does for templates.
	Unless quiet is set, each problem is printed together with the
	parser's current line number and the entry's source references.
	"""
	try:
		p = POParser(file)
	except IOError:
		if not quiet:
			print(" - Can't open catalog file " + file)
		return 1

	# raw string, so that the backslashes reach the regexp engine as-is
	c = TransChecker(r"(%\([^)]+\)[0-9]*[sdu]|%[0-9]*[sdu])")
	n = 0
	# try/finally makes sure the catalog is closed even if parsing fails
	try:
		while p.parse():
			c.reset()
			c.checkin(p.msgid)
			c.checkout(p.msgstr)
			for desc in c.computeErrors():
				n += 1
				if not quiet:
					print(" - near line %(line)d %(file)s: %(desc)s" % {
						'line': p.line,
						'file': p.files,
						'desc': desc
					})
	finally:
		p.close()
	return n

def __main__():
	"""Command line entry point: transcheck [-q] <lang>

	Must be run from the Mailman directory.  Every .html and .txt file in
	templates/<lang>/ is checked against its namesake in templates/en/,
	then messages/<lang>/LC_MESSAGES/mailman.po is checked.  With -q only
	a one-line summary is printed, otherwise each problem is reported.

	Exits with status 1 on a wrong command line or when the template
	directory of the language cannot be read.
	"""
	usage = "Usage: %s [-q] <lang>" % sys.argv[0]
	try:
		optlist, args = getopt.getopt(sys.argv[1:], "q")
	except getopt.error:
		# sys.exc_info() rather than binding the exception in the
		# except clause, whose syntax differs between Python versions
		print(sys.exc_info()[1])
		print(usage)
		sys.exit(1)
	if not args:
		print(usage)
		sys.exit(1)
	lang = args[0]

	# getopt has already rejected everything but a bare -q
	quiet = 0
	for o, a in optlist:
		if o == "-q":
			quiet = 1

	templateDir = "templates/" + lang + "/"
	try:
		names = os.listdir(templateDir)
	except OSError:
		# without the file list there is nothing left to check
		print("can't open templates/%s/" % lang)
		sys.exit(1)
	# os.listdir() returns names in arbitrary order
	names.sort()

	numerrors = 0
	numfiles = 0
	for name in names:
		if name.endswith(".html"):
			label = "HTML"
			html = 1
		elif name.endswith(".txt"):
			label = "TXT "
			html = 0
		else:
			# neither an html nor a text template
			continue
		original = "templates/en/" + name
		translated = templateDir + name
		if not quiet:
			print(label + " checking " + translated + "... ")
		n = check_file(translated, original, html=html, quiet=quiet)
		if n:
			numerrors += n
			numfiles += 1

	catalog = "messages/" + lang + "/LC_MESSAGES/mailman.po"
	if not quiet:
		print("PO   checking " + catalog + "... ")
	n = check_po(catalog, quiet=quiet)
	if n:
		numerrors += n
		numfiles += 1

	if quiet:
		print("%(errs)u warnings in %(files)u files" % {
			'errs':  numerrors,
			'files': numfiles
		})

# run the checks only when executed as a script, not when imported
if __name__ == '__main__':
	__main__()

--jI8keyz6grp/JLjh--