From barry@wooz.org Thu Oct 26 20:48:33 2000 From: barry@wooz.org (Barry A. Warsaw) Date: Thu, 26 Oct 2000 15:48:33 -0400 (EDT) Subject: [I18n-sig] pygettext.py extraction of docstrings Message-ID: <14840.35473.307059.990479@anthem.concentric.net> Hi all, I have a tentative patch for Tools/i18n/pygettext.py which adds optional extraction of module, class, method, and function docstrings. Francois Pinard's po-utils does something similar I believe, and it makes a lot of sense to add this. If you provide the -D/--docstrings flag, then it'll extract these docstrings without requiring them to be wrapped in _() markers. You'd of course, still need to send the strings through a translation step but that's okay because you'll probably want deferred translation of them anyway. I've only done some minimal testing so I don't know how easy it is to confuse the TokenEater. One question: should docstring extraction be turned on my default? Attached is a patch against Python 2.0's pygettext.py. -Barry -------------------- snip snip -------------------- Index: pygettext.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Tools/i18n/pygettext.py,v retrieving revision 1.9 diff -u -r1.9 pygettext.py --- pygettext.py 2000/05/02 19:28:30 1.9 +++ pygettext.py 2000/10/26 19:43:18 @@ -4,16 +4,8 @@ # minimally patched to make it even more xgettext compatible # by Peter Funk -# for selftesting -try: - import fintl - _ = fintl.gettext -except ImportError: - def _(s): return s +"""pygettext -- Python equivalent of xgettext(1) - -__doc__ = _("""pygettext -- Python equivalent of xgettext(1) - Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the internationalization of C programs. Most of these tools are independent of the programming language and can be used from within Python programs. Martin @@ -65,7 +57,12 @@ -E --escape - replace non-ASCII characters with octal escape sequences. + Replace non-ASCII characters with octal escape sequences. + + -D + --docstrings + Extract module, class, method, and function docstrings. This requires + an import of the specified module, so beware of import side effects. -h --help @@ -132,15 +129,22 @@ If `inputfile' is -, standard input is read. -""") +""" import os import sys import time import getopt import tokenize + +# for selftesting +try: + import fintl + _ = fintl.gettext +except ImportError: + def _(s): return s -__version__ = '1.1' +__version__ = '1.2' default_keywords = ['_'] DEFAULTKEYWORDS = ', '.join(default_keywords) @@ -171,9 +175,9 @@ def usage(code, msg=''): - print __doc__ % globals() + print >> sys.stderr, _(__doc__) % globals() if msg: - print msg + print >> sys.stderr, msg sys.exit(code) @@ -239,15 +243,48 @@ self.__state = self.__waiting self.__data = [] self.__lineno = -1 + self.__freshmodule = 1 def __call__(self, ttype, tstring, stup, etup, line): # dispatch +## import token +## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \ +## 'tstring:', tstring self.__state(ttype, tstring, stup[0]) def __waiting(self, ttype, tstring, lineno): + # Do docstring extractions, if enabled + if self.__options.docstrings: + # module docstring? + if self.__freshmodule: + if ttype == tokenize.STRING: + self.__addentry(safe_eval(tstring), lineno) + self.__freshmodule = 0 + elif ttype not in (tokenize.COMMENT, tokenize.NL): + self.__freshmodule = 0 + return + # class docstring? 
+ if ttype == tokenize.NAME and tstring in ('class', 'def'): + self.__state = self.__suiteseen + return if ttype == tokenize.NAME and tstring in self.__options.keywords: self.__state = self.__keywordseen + def __suiteseen(self, ttype, tstring, lineno): + # ignore anything until we see the colon + if ttype == tokenize.OP and tstring == ':': + self.__state = self.__suitedocstring + + def __suitedocstring(self, ttype, tstring, lineno): + # ignore any intervening noise + if ttype == tokenize.STRING: + self.__addentry(safe_eval(tstring), lineno) + self.__state = self.__waiting + elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, + tokenize.COMMENT): + # there was no class docstring + self.__state = self.__waiting + def __keywordseen(self, ttype, tstring, lineno): if ttype == tokenize.OP and tstring == '(': self.__data = [] @@ -263,58 +300,54 @@ # of messages seen. Reset state for the next batch. If there # were no strings inside _(), then just ignore this entry. if self.__data: - msg = EMPTYSTRING.join(self.__data) - if not msg in self.__options.toexclude: - entry = (self.__curfile, self.__lineno) - linenos = self.__messages.get(msg) - if linenos is None: - self.__messages[msg] = [entry] - else: - linenos.append(entry) + self.__addentry(EMPTYSTRING.join(self.__data)) self.__state = self.__waiting elif ttype == tokenize.STRING: self.__data.append(safe_eval(tstring)) # TBD: should we warn if we seen anything else? + def __addentry(self, msg, lineno=None): + if lineno is None: + lineno = self.__lineno + if not msg in self.__options.toexclude: + entry = (self.__curfile, lineno) + self.__messages.setdefault(msg, []).append(entry) + def set_filename(self, filename): self.__curfile = filename def write(self, fp): options = self.__options timestamp = time.ctime(time.time()) - # common header - try: - sys.stdout = fp - # The time stamp in the header doesn't have the same format - # as that generated by xgettext... - print pot_header % {'time': timestamp, 'version': __version__} - for k, v in self.__messages.items(): - if not options.writelocations: - pass - # location comments are different b/w Solaris and GNU: - elif options.locationstyle == options.SOLARIS: - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - print _('# File: %(filename)s, line: %(lineno)d') % d - elif options.locationstyle == options.GNU: - # fit as many locations on one line, as long as the - # resulting line length doesn't exceeds 'options.width' - locline = '#:' - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - s = _(' %(filename)s:%(lineno)d') % d - if len(locline) + len(s) <= options.width: - locline = locline + s - else: - print locline - locline = "#:" + s - if len(locline) > 2: - print locline - # TBD: sorting, normalizing - print 'msgid', normalize(k) - print 'msgstr ""\n' - finally: - sys.stdout = sys.__stdout__ + # The time stamp in the header doesn't have the same format as that + # generated by xgettext... 
+ print >> fp, pot_header % {'time': timestamp, 'version': __version__} + for k, v in self.__messages.items(): + if not options.writelocations: + pass + # location comments are different b/w Solaris and GNU: + elif options.locationstyle == options.SOLARIS: + for filename, lineno in v: + d = {'filename': filename, 'lineno': lineno} + print >> fp, _('# File: %(filename)s, line: %(lineno)d') \ + % d + elif options.locationstyle == options.GNU: + # fit as many locations on one line, as long as the + # resulting line length doesn't exceeds 'options.width' + locline = '#:' + for filename, lineno in v: + d = {'filename': filename, 'lineno': lineno} + s = _(' %(filename)s:%(lineno)d') % d + if len(locline) + len(s) <= options.width: + locline = locline + s + else: + print >> fp, locline + locline = "#:" + s + if len(locline) > 2: + print >> fp, locline + # TBD: sorting, normalizing + print >> fp, 'msgid', normalize(k) + print >> fp, 'msgstr ""\n' def main(): @@ -322,11 +355,12 @@ try: opts, args = getopt.getopt( sys.argv[1:], - 'ad:Ehk:Kno:p:S:Vvw:x:', + 'ad:DEhk:Kno:p:S:Vvw:x:', ['extract-all', 'default-domain', 'escape', 'help', 'keyword=', 'no-default-keywords', 'add-location', 'no-location', 'output=', 'output-dir=', 'style=', 'verbose', 'version', 'width=', 'exclude-file=', + 'docstrings', ]) except getopt.error, msg: usage(1, msg) @@ -347,6 +381,7 @@ verbose = 0 width = 78 excludefilename = '' + docstrings = 0 options = Options() locations = {'gnu' : options.GNU, @@ -363,6 +398,8 @@ options.outfile = arg + '.pot' elif opt in ('-E', '--escape'): options.escape = 1 + elif opt in ('-D', '--docstrings'): + options.docstrings = 1 elif opt in ('-k', '--keyword'): options.keywords.append(arg) elif opt in ('-K', '--no-default-keywords'): From tree@basistech.com Thu Oct 26 21:32:23 2000 From: tree@basistech.com (Tom Emerson) Date: Thu, 26 Oct 2000 16:32:23 -0400 (EDT) Subject: [I18n-sig] Codecs for Big Five and GB 2312 Message-ID: <14840.38103.767963.696009@cymru.basistech.com> I need codecs for transcoding to and from Big Five and GB 2312: has anyone written these yet? If not, I'll do it, but I would rather not duplicate the work. Thanks. -tree -- Tom Emerson Basis Technology Corp. Zenkaku Language Hacker http://www.basistech.com "Beware the lollipop of mediocrity: lick it once and you suck forever" From kajiyama@grad.sccs.chukyo-u.ac.jp Fri Oct 27 02:10:10 2000 From: kajiyama@grad.sccs.chukyo-u.ac.jp (Tamito KAJIYAMA) Date: Fri, 27 Oct 2000 10:10:10 +0900 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <14840.38103.767963.696009@cymru.basistech.com> (message from Tom Emerson on Thu, 26 Oct 2000 16:32:23 -0400 (EDT)) References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> Message-ID: <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> Tom Emerson writes: | I need codecs for transcoding to and from Big Five and GB 2312: has | anyone written these yet? If not, I'll do it, but I would rather not | duplicate the work. I've maintained a codecs package named JapaneseCodecs which contains two Japanese encodings EUC-JP and Shift JIS. The two encodings and Big5 are all 8-bit encodings, so you may use my codecs as a starting point for implementing a Big5 codec. The JapaneseCodecs package is available at: http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/ For personal use I also wrote a preliminary codec for a subset of ISO 2022 (or exactly speaking, a subset of the Emacs/MULE internal encoding, which in turn an extension of ISO 2022). 
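ISO 2022 texts switch among character sets with designation escape sequences, so a decoder is essentially a small state machine plus one mapping table per character set. A simplified, hypothetical sketch of that mechanism -- not the actual codec, and covering only a subset of the registered designations:

    # Designation sequences select the active character set, e.g.
    # ESC $ A -> GB 2312-80, ESC $ B -> JIS X 0208, ESC ( B -> ASCII.
    DESIGNATIONS = {'\x1b$A': 'gb2312', '\x1b$B': 'jisx0208',
                    '\x1b(B': 'ascii'}

    def decode_iso2022(data, tables):
        # `tables` maps a charset name to a dict of two-byte code
        # points -> Unicode characters (the "character mapping").
        out = []
        charset = 'ascii'
        i = 0
        while i < len(data):
            if data[i] == '\x1b':              # designation sequence
                charset = DESIGNATIONS[data[i:i+3]]
                i = i + 3
            elif charset == 'ascii':
                out.append(unicode(data[i]))
                i = i + 1
            else:                              # two-byte character
                out.append(tables[charset][data[i:i+2]])
                i = i + 2
        return u''.join(out)

In a design like this, adding GB 2312 support really is just one more DESIGNATIONS entry plus its mapping table.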
Currently the codec can handle a text that contains Japanese, Thai, and Vietnamese characters. The codec is written without efficiency consideration, but it works. Since GB 2312 is an encoding based on ISO 2022, the codec may be a starting point, too. The only things that need to be done for handling GB 2312 is to add a character mapping and escape sequences for designating character sets. If you are interested, the codec is available at: http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/iso_2022_7bit.py.gz Regards, -- KAJIYAMA, Tamito From mal@lemburg.com Fri Oct 27 09:02:11 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Fri, 27 Oct 2000 10:02:11 +0200 Subject: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> Message-ID: <39F93683.9DB59F8B@lemburg.com> Tamito KAJIYAMA wrote: > > Tom Emerson writes: > | I need codecs for transcoding to and from Big Five and GB 2312: has > | anyone written these yet? If not, I'll do it, but I would rather not > | duplicate the work. > > I've maintained a codecs package named JapaneseCodecs which > contains two Japanese encodings EUC-JP and Shift JIS. The two > encodings and Big5 are all 8-bit encodings, so you may use my > codecs as a starting point for implementing a Big5 codec. The > JapaneseCodecs package is available at: > > http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/ > > For personal use I also wrote a preliminary codec for a subset > of ISO 2022 (or exactly speaking, a subset of the Emacs/MULE > internal encoding, which in turn an extension of ISO 2022). > Currently the codec can handle a text that contains Japanese, > Thai, and Vietnamese characters. The codec is written without > efficiency consideration, but it works. Since GB 2312 is an > encoding based on ISO 2022, the codec may be a starting point, > too. The only things that need to be done for handling GB 2312 > is to add a character mapping and escape sequences for > designating character sets. If you are interested, the codec is > available at: > > http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/iso_2022_7bit.py.gz Andy, I think you ought to put these links on the i18n-sig web page... if someone finds some time, I think it would be worth- while starting a topic guide for Unicode which also includes all these valuable resources. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From guido@python.org Fri Oct 27 12:42:20 2000 From: guido@python.org (Guido van Rossum) Date: Fri, 27 Oct 2000 06:42:20 -0500 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: Your message of "Fri, 27 Oct 2000 10:02:11 +0200." <39F93683.9DB59F8B@lemburg.com> References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <39F93683.9DB59F8B@lemburg.com> Message-ID: <200010271142.GAA03982@cj20424-a.reston1.va.home.com> > > http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/iso_2022_7bit.py.gz > > Andy, I think you ought to put these links on the i18n-sig web > page... Done. --Guido van Rossum (home page: http://www.python.org/~guido/) From martin@loewis.home.cs.tu-berlin.de Fri Oct 27 22:24:56 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. 
Loewis) Date: Fri, 27 Oct 2000 23:24:56 +0200 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: (message from Lars Marius Garshol on 27 Oct 2000 11:05:46 +0200) References: <200010252312.BAA01255@loewis.home.cs.tu-berlin.de> Message-ID: <200010272124.XAA00854@loewis.home.cs.tu-berlin.de> > Yup. I plan to teach xmlproc the IANA registry, so that this should > not be a problem with xmlproc. With due respect, I hope this is not the way it that is done. Instead, I think codecs.lookup should know the IANA registry. It may be that this information comes with PyXML only for now, but it should be available to all Python applications. E.g. xml/__init__.py could do codecs.register(iana_lookup) where iana_lookup simply maps encodings to the "normalized" form. I agree with MAL that this should eventually end-up in Python proper. In any case, knowing the official aliases should not be restricted to xmlproc. > However, it is a problem that Python does not support any of the Far > East encodings yet. Does anyone know if there are any plans to change > that? Again, I'd see no problem including Tamito Kajiyama's code in PyXML, if he wants us to ship it - or we could recommend JapaneseCodecs as an valuable addition to PyXML; this package also uses the distutils, so it is quite easy to install. [using Python codecs in expat] > I don't think it's really all that difficult. [...] > The only possible stumbling block is when expat discovers an XML > declaration that says something other than "utf-16"... Wouldn't that be the normal case where encodings other than UTF-8 become interesting? I'd assume that most XML documents which don't use UTF-8 do declare the encoding in the XML declaration, instead of relying on some higher-level protocol to correctly transmit encoding information. So I'd rather see an approach where expat itself finds out eventually what the encoding is, and then goes to the application (i.e. the Python SAX driver) and asks to convert the input. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Fri Oct 27 22:40:42 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Fri, 27 Oct 2000 23:40:42 +0200 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> (message from Tamito KAJIYAMA on Fri, 27 Oct 2000 10:10:10 +0900) References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> Message-ID: <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> > If you are interested, the codec is available at: > http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/iso_2022_7bit.py.gz I just had a look, and it seems like an interesting package. I'm slightly confused about the installation procedure, though. Installing into python2.0/encodings/{euc_jp,shift_jis,japanese} doesn't look right to me - add-on packages should be capable of installing into site-packages by default. I believe it would actually work if you just install without any arguments to setup.py. euc_jp would then end-up in python2.0/site-packages. Later, when you do u"Hello".encode("euc-jp") it looks for a codec. Here, encodings.__init__.search_function do modname = encoding.replace('-', '_') modname = aliases.aliases.get(modname,modname) try: mod = __import__(modname,globals(),locals(),'*') except ImportError,why: _cache[encoding] = None return None First, encoding becomes euc_jp. 
With no registered aliases, it would then call __import__ with "euc_jp", which will find the codec in site-packages. In the long run, I'd hope that distutils provides a mean to install additional codecs, e.g via setup( ... codecs = ['japanese'] ...) Then, distutils would collect all these strings, and importing codecs would roughly do for package in distutils.registered_codec_packages: p=__import__(package,global(),locals(),"*") p.register() japanese/__init__.py would provide a register function which registers another search_function, which would load euc_jp and shift_jis on demand. That way, users could install additional codecs which are available to everybody on the system, without having to hack the Python library proper. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Fri Oct 27 22:47:42 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Fri, 27 Oct 2000 23:47:42 +0200 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: (message from Lars Marius Garshol on 27 Oct 2000 12:24:09 +0200) References: <200010252312.BAA01255@loewis.home.cs.tu-berlin.de> <39F953E9.493A0DE1@lemburg.com> Message-ID: <200010272147.XAA00953@loewis.home.cs.tu-berlin.de> > That's only Shift-JIS and EUC-JP, though. Is there any concerted > effort afoot to make a more complete set? At the very least, > ISO 2022-JP, Big5, VISCII, GB-2312 and EUC-KR should be implemented. I'd hope that somebody exposes the operating system's converters to Python. For example, on Linux and Solaris, the iconv library offers a wide variety of codecs (at least in its gconv form), which are also highly performant. On W2k, a huge set of converters is available, which just waits being exposed to Python. I'm always concerned by the fact that every package seems to come with its own set of conversion tables, instead on relying on other people to do a good job (and report bugs if they don't). Tcl has such tables, Java does, X11 has some, ICU has more - I really can't see the reason to reimplement them all again in Python. > | More about this on the i18n-sig mailing list. > > Well, if only a single response is required I would prefer to get that > here. This is free software. You never get away with a single response only :-) Regards, Martin From martin@loewis.home.cs.tu-berlin.de Fri Oct 27 23:28:56 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Sat, 28 Oct 2000 00:28:56 +0200 Subject: [I18n-sig] pygettext.py extraction of docstrings In-Reply-To: <14840.35473.307059.990479@anthem.concentric.net> (barry@wooz.org) References: <14840.35473.307059.990479@anthem.concentric.net> Message-ID: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> > I have a tentative patch for Tools/i18n/pygettext.py which adds > optional extraction of module, class, method, and function > docstrings. Francois Pinard's po-utils does something similar I > believe, and it makes a lot of sense to add this. It certainly does. One question though: > + --docstrings > + Extract module, class, method, and function docstrings. This requires > + an import of the specified module, so beware of import side effects. It's probably too late tonight to analyse tricky Python code - but where exactly does that import the module? > One question: should docstring extraction be turned on my default? I'd say so, yes. People who are confronted with gettext for the first time will say "Wow, it even does that!". 
In the rare cases where doc strings would confuse the meat of the catalog, people will be able to turn that off. Perhaps it may be good to indicate in the catalog that this is a doc string? I'm thinking of #, py-doc I don't know the exact specification of the #, comments, but it can look like #, c-format, fuzzy i.e. it appears to be a comma-separated list of informative flags. Translators could then decide to deal with doc strings in a different manner (e.g follow different grammatical conventions). Regards, Martin From mal@lemburg.com Sat Oct 28 14:54:19 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Sat, 28 Oct 2000 15:54:19 +0200 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: <200010252312.BAA01255@loewis.home.cs.tu-berlin.de> <200010272124.XAA00854@loewis.home.cs.tu-berlin.de> Message-ID: <39FADA8B.8D5FE731@lemburg.com> "Martin v. Loewis" wrote: > > > Yup. I plan to teach xmlproc the IANA registry, so that this should > > not be a problem with xmlproc. > > With due respect, I hope this is not the way it that is done. Instead, > I think codecs.lookup should know the IANA registry. It may be that > this information comes with PyXML only for now, but it should be > available to all Python applications. E.g. xml/__init__.py could > do > > codecs.register(iana_lookup) > > where iana_lookup simply maps encodings to the "normalized" form. That would be another option (this codec search function design turns out to be far more useful than originally though ;-)... > I agree with MAL that this should eventually end-up in Python proper. > In any case, knowing the official aliases should not be restricted to > xmlproc. Right. Python's encodings package should know at least about all common aliases used for the provided codecs. Do you have a pointer to a list of IANA aliases ? > > However, it is a problem that Python does not support any of the Far > > East encodings yet. Does anyone know if there are any plans to change > > that? > > Again, I'd see no problem including Tamito Kajiyama's code in PyXML, > if he wants us to ship it - or we could recommend JapaneseCodecs as an > valuable addition to PyXML; this package also uses the distutils, so > it is quite easy to install. I think it should distributed as separate package: the codecs are useful in a lot of contexts -- not only XML. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From mal@lemburg.com Sat Oct 28 14:59:40 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Sat, 28 Oct 2000 15:59:40 +0200 Subject: [Distutils] Re: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> Message-ID: <39FADBCC.B19B65C9@lemburg.com> "Martin v. Loewis" wrote: > > > If you are interested, the codec is available at: > > > http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/iso_2022_7bit.py.gz > > I just had a look, and it seems like an interesting package. I'm > slightly confused about the installation procedure, though. > Installing into python2.0/encodings/{euc_jp,shift_jis,japanese} > doesn't look right to me - add-on packages should be capable of > installing into site-packages by default. > > I believe it would actually work if you just install without any > arguments to setup.py. euc_jp would then end-up in > python2.0/site-packages. 
Later, when you do > > u"Hello".encode("euc-jp") > > it looks for a codec. Here, encodings.__init__.search_function do > > modname = encoding.replace('-', '_') > modname = aliases.aliases.get(modname,modname) > try: > mod = __import__(modname,globals(),locals(),'*') > except ImportError,why: > _cache[encoding] = None > return None > > First, encoding becomes euc_jp. With no registered aliases, it would > then call __import__ with "euc_jp", which will find the codec in > site-packages. The "right" way to install new codec packages is by placing them inside a package which then registers a new search function in the codec registry. Tamito's other does this AFAIR. To be able to use the codecs, a Python script must then import the codecs package (which then registers the search function). Having to import the package has two benefits: 1. the need for another codec package is visible in the source code 2. registering the search function is delayed until the codec package is first used > In the long run, I'd hope that distutils provides a mean to install > additional codecs, e.g via > > setup( ... > codecs = ['japanese'] > ...) > > Then, distutils would collect all these strings, and importing codecs > would roughly do > > for package in distutils.registered_codec_packages: > p=__import__(package,global(),locals(),"*") > p.register() > > japanese/__init__.py would provide a register function which registers > another search_function, which would load euc_jp and shift_jis on > demand. That way, users could install additional codecs which are > available to everybody on the system, without having to hack the > Python library proper. Hmm, not sure here: programs which rely on non-standard codecs should have an explicit "import myCodecs" at the top of the file. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From mal@lemburg.com Sat Oct 28 15:09:13 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Sat, 28 Oct 2000 16:09:13 +0200 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: <200010252312.BAA01255@loewis.home.cs.tu-berlin.de> <39F953E9.493A0DE1@lemburg.com> <200010272147.XAA00953@loewis.home.cs.tu-berlin.de> Message-ID: <39FADE09.D257A7DF@lemburg.com> "Martin v. Loewis" wrote: > > > That's only Shift-JIS and EUC-JP, though. Is there any concerted > > effort afoot to make a more complete set? At the very least, > > ISO 2022-JP, Big5, VISCII, GB-2312 and EUC-KR should be implemented. > > I'd hope that somebody exposes the operating system's converters to > Python. For example, on Linux and Solaris, the iconv library offers a > wide variety of codecs (at least in its gconv form), which are also > highly performant. On W2k, a huge set of converters is available, > which just waits being exposed to Python. > > I'm always concerned by the fact that every package seems to come with > its own set of conversion tables, instead on relying on other people > to do a good job (and report bugs if they don't). Tcl has such tables, > Java does, X11 has some, ICU has more - I really can't see the reason > to reimplement them all again in Python. Sure would be nice... the only problem I see is that the different codecs for the Asian scripts will most probably behave differently, e.g. there are many issues with private code point areas in Unicode and the various Asian encodings. 
It would still be nice to have different codec packages around though -- even if they all implement the same converters, e.g. AsianCharmapCodecs, NativeWin32Codecs, NativeCLibCodecs, etc. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From martin@loewis.home.cs.tu-berlin.de Sat Oct 28 21:26:16 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Sat, 28 Oct 2000 22:26:16 +0200 Subject: [Distutils] Re: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <39FADBCC.B19B65C9@lemburg.com> (mal@lemburg.com) References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> Message-ID: <200010282026.WAA00748@loewis.home.cs.tu-berlin.de> > Having to import the package has two benefits: > 1. the need for another codec package is visible in the source code I don't think this is a benefit for a typical application that uses multiple codecs. More often than not, the application will learn about a required encoding by means of an application-level protocol (e.g. a Content-Type in a MIME header). It doesn't really *require* any encoding; instead, it needs the codecs of any data it happens to process in a certain session. The application designer is normally not interested in a specific encoding; she expects Python to do the right thing whenever .encode is invoked. > 2. registering the search function is delayed until the codec > package is first used That is hardly a benefit: registering the search function is not an expensive operation, and the typical application would start with try: import japanese except ImportError: pass try: import windows_codepages except ImportError: pass try: import iana except ImportError: pass try: import OSFCharmaps except ImportError: pass anyway, so all codecs it may need are registered right from the start. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Sat Oct 28 21:34:45 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Sat, 28 Oct 2000 22:34:45 +0200 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FADA8B.8D5FE731@lemburg.com> (mal@lemburg.com) References: <200010252312.BAA01255@loewis.home.cs.tu-berlin.de> <200010272124.XAA00854@loewis.home.cs.tu-berlin.de> <39FADA8B.8D5FE731@lemburg.com> Message-ID: <200010282034.WAA00771@loewis.home.cs.tu-berlin.de> > Do you have a pointer to a list of IANA aliases ? It's at http://www.isi.edu/in-notes/iana/assignments/character-sets Regards, Martin From andy@reportlab.com Sun Oct 29 07:14:01 2000 From: andy@reportlab.com (Andy Robinson) Date: Sun, 29 Oct 2000 07:14:01 -0000 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <200010272147.XAA00953@loewis.home.cs.tu-berlin.de> Message-ID: > -----Original Message----- > From: i18n-sig-admin@python.org [mailto:i18n-sig-admin@python.org]On > Behalf Of Martin v. Loewis > Sent: 27 October 2000 22:48 > To: larsga@garshol.priv.no > Cc: i18n-sig@python.org; xml-sig@python.org > Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat > > > > That's only Shift-JIS and EUC-JP, though. Is there any concerted > > effort afoot to make a more complete set? At the very least, > > ISO 2022-JP, Big5, VISCII, GB-2312 and EUC-KR should be > implemented. > That was the intention, but I admit we have run out of steam somewhat. 
Tamito Kajiyama is the only person to have made a really big contribution. I was hoping to, but that hope was on the basis of a large customer project needing this stuff which got cancelled, and running a startup is taking so much time that I won't manage much until ReportLab gets a customer who needs to reencode data. When that happens, we'll have to do it, and fast. As an aside, we're doing the work to allow use of Adobe's Asian Font Packs in reportlab at the moment, and they use the native encodings. So once that comes out, we'll be under a lot of pressure to do it. I am very hopeful of the first half of next year if no one else has done the work already. In the meantime, frankly, not enough people need it badly enough and nobody but Tamito has had a go. Volunteers welcome! >I'm always concerned by the fact that every package seems to come with >its own set of conversion tables, instead on relying on other people >to do a good job (and report bugs if they don't). Tcl has such tables, >Java does, X11 has some, ICU has more - I really can't see the reason >to reimplement them all again in Python. I don't use Tcl, Java or X11 and don't know what ICU is, but I do use Python on several platforms and would want to know that the encodings library worked identically on all platforms - i.e. if there are bugs in the codecs, they are consistent and can be fixed consistently. I think this issue was pretty much settled in MAL's original i18n proposal. However, no sane person retypes mapping tables; if we built something Pythonic we'd hopefully do it by extracting data from two different sources, building our own tables and checking they got identical results. With compression into a Zip file and careful use of diff-like techniques (all the obscure Asian codecs go like 'take this base encoding and add these extra code points'), I believe a good codec database could be quite small. - Andy Robinson From martin@loewis.home.cs.tu-berlin.de Sun Oct 29 08:50:50 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Sun, 29 Oct 2000 09:50:50 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: References: Message-ID: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> > I don't use Tcl, Java or X11 and don't know what ICU > is, but I do use Python on several platforms and would > want to know that the encodings library worked > identically on all platforms - i.e. if there are bugs > in the codecs, they are consistent and can be fixed > consistently. I think this issue was pretty much settled > in MAL's original i18n proposal. I sense a certain "reinvent the wheel" attitude here. Why do you assume that the codecs developed by somebody else will have bugs? While the "we know how character sets work" approach provides consistency across platforms, it doesn't provide consistency between applications on a single platform. I believe most users are more interested in that - they install some codec tables on their system, and then all applications recognize these codecs, whether written in C or Python. Regards, Martin From mal@lemburg.com Mon Oct 30 08:48:40 2000 From: mal@lemburg.com (M.-A. 
Lemburg) Date: Mon, 30 Oct 2000 09:48:40 +0100 Subject: [Distutils] Re: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010282026.WAA00748@loewis.home.cs.tu-berlin.de> Message-ID: <39FD35E8.17B4AF5C@lemburg.com> "Martin v. Loewis" wrote: > > > Having to import the package has two benefits: > > 1. the need for another codec package is visible in the source code > > I don't think this is a benefit for a typical application that uses > multiple codecs. More often than not, the application will learn about > a required encoding by means of an application-level protocol (e.g. a > Content-Type in a MIME header). It doesn't really *require* any > encoding; instead, it needs the codecs of any data it happens to > process in a certain session. The application designer is normally not > interested in a specific encoding; she expects Python to do the right > thing whenever .encode is invoked. But the requirement for a non-standard codec package is made visible this way and that's what I was referring to. An application which relies on availability of Japanese codecs will produce an ImportError in case these are not installed. > > 2. registering the search function is delayed until the codec > > package is first used > > That is hardly a benefit: registering the search function is not an > expensive operation, and the typical application would start with > > try: > import japanese > except ImportError: > pass > try: > import windows_codepages > except ImportError: > pass > try: > import iana > except ImportError: > pass > try: > import OSFCharmaps > except ImportError: > pass > > anyway, so all codecs it may need are registered right from the start. No. You wouldn't hide these ImportErrors if you rely on the packages being installed. If the application doesn't care for the specific encodings being installed, then the administrator could add these imports to the sitecustomize.py module after installing the codec packages. I don't think that doing this automatically is a good idea. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From kajiyama@grad.sccs.chukyo-u.ac.jp Mon Oct 30 08:38:11 2000 From: kajiyama@grad.sccs.chukyo-u.ac.jp (Tamito KAJIYAMA) Date: Mon, 30 Oct 2000 17:38:11 +0900 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> (martin@loewis.home.cs.tu-berlin.de) References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> Message-ID: <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> * Martin v. Loewis | | Installing into python2.0/encodings/{euc_jp,shift_jis,japanese} | doesn't look right to me - add-on packages should be capable of | installing into site-packages by default. * M.-A. Lemburg | | The "right" way to install new codec packages is by placing them | inside a package which then registers a new search function in the | codec registry. | | Tamito's other does this AFAIR. | | To be able to use the codecs, a Python script must then import | the codecs package (which then registers the search function). Beta versions of the Japanese codecs have been implemented as a usual add-on package, so applications need to import it before using a Japanese codec. 
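For concreteness, the registration half of such a package can be tiny. A hypothetical sketch of the pattern -- module names invented, using the getregentry() convention of the standard encodings modules:

    # japanese/__init__.py -- importing the package registers its codecs.
    import codecs

    def _search(encoding):
        # Called by codecs.lookup() for names it cannot resolve;
        # return None to let other search functions have a try.
        encoding = encoding.replace('-', '_')
        if encoding not in ('euc_jp', 'shift_jis'):
            return None
        mod = __import__(encoding, globals(), locals(), '*')
        # Codec modules return the standard 4-tuple:
        # (encoder, decoder, stream_reader, stream_writer).
        return mod.getregentry()

    codecs.register(_search)

With that in place, a single "import japanese" makes unicode(data, 'euc_jp') work anywhere in the program.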
I had provided a module named codecs_ja which registers codecs for EUC-JP and Shift_JIS at once. The current version of the codecs has been implemented as a special "codecs" package that needs to be installed into lib/encodings as well as the standard encodings. I think we need an agreement on how non-standard codecs should be installed. I prefer the latter approach. I want Python to take care of all encoding issues, and if possible I want to write applications without considering which encodings can be handled at the core language level. I hope that in the near future Python will support all encodings that have mappings from/to Unicode. If an application requires an encoding that is not supported by Python at that time, then a LookupError is raised; all the application needs to do is to catch that exception and to tell the user that the encoding is currently not supported. I think this is not a problem, since it is automatically solved without any changes to the application once Python supports that encoding. Regards, -- KAJIYAMA, Tamito From mal@lemburg.com Mon Oct 30 08:59:49 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Mon, 30 Oct 2000 09:59:49 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> Message-ID: <39FD3885.56458C3C@lemburg.com> "Martin v. Loewis" wrote: > > > I don't use Tcl, Java or X11 and don't know what ICU > > is, but I do use Python on several platforms and would > > want to know that the encodings library worked > > identically on all platforms - i.e. if there are bugs > > in the codecs, they are consistent and can be fixed > > consistently. I think this issue was pretty much settled > > in MAL's original i18n proposal. > > I sense a certain "reinvent the wheel" attitude here. Why do you > assume that the codecs developed by somebody else will have bugs? > > While the "we know how character sets work" approach provides > consistency across platforms, it doesn't provide consistency between > applications on a single platform. I believe most users are more > interested in that - they install some codec tables on their system, > and then all applications recognize these codecs, whether written in C > or Python. I don't think that reinventing the wheel for the sake of cross-platform compatibility is a bad thing. Besides, no one prevents anyone from writing Python extensions to make the system codecs available to Python. The problem we face with these, though, is that they won't be available everywhere. The basic design decision we made for Unicode was to have it available everywhere -- not only on platforms where Unicode is supported. This includes a usable set of codecs for all common encodings. The Asian codecs were just left out of the standard dist due to size problems. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From mal@lemburg.com Mon Oct 30 10:43:34 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Mon, 30 Oct 2000 11:43:34 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> Message-ID: <39FD50D6.11FBF133@lemburg.com> Tamito KAJIYAMA wrote: > > * Martin v.
Loewis > | > | Installing into python2.0/encodings/{euc_jp,shift_jis,japanese} > | doesn't look right to me - add-on packages should be capable of > | installing into site-packages by default. > > * M.-A. Lemburg > | > | The "right" way to install new codec packages is by placing them > | inside a package which then registers a new search function in the > | codec registry. > | > | Tamito's other does this AFAIR. > | > | To be able to use the codecs, a Python script must then import > | the codecs package (which then registers the search function). > > Beta versions of the Japanese codecs have been implemented as a > usual add-on package, so applications need to import it before > using a Japanese codec. I had provided a module named codecs_ja > which registers codecs for EUC-JP and Shift_JIS at once. > > The current version of the codecs has been implemented as a > special "codecs" package that needs to be installed into > lib/encodings as well as the standard encodings. > > I think we need an agreement on how non-standard codecs should > be installed. They should be installed as a separate package and then register a search function which adds the included codecs to the codec registry. Lib/encodings should in all cases be left untouched. Installing third-party software directly into the standard lib directory is bad practice and not really needed anymore now that we have distutils. If you don't want to bother with importing the codec packages in your application, you can use the sitecustomize.py module to do the imports at startup time. Another possible approach would be creating a new codec top-level package "sitecodecs" which is then used as a pool for all site-specific codecs and also searched by the encodings search function if present. > I prefer the latter approach. I want Python to take care of all > encoding issues, and if possible I want to write applications > without considering which encodings can be handled at the core > language level. I hope that in the near future Python will > support all encodings that have mappings from/to Unicode. If an > application requires an encoding that is not supported by Python > at that time, then a LookupError is raised; all the application > needs to do is to catch that exception and to tell the user that > the encoding is currently not supported. I think this is not a > problem, since it is automatically solved without any changes to > the application once Python supports that encoding. The standard distribution will probably not include the Asian codecs -- just like it doesn't include all the other goodies which people are fond of. Instead, Python distribution packagers like ActivePython will ship versions of Python which include these extra packages. At least that's the idea behind keeping the Python core rather small and maintainable. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From andy@reportlab.com Mon Oct 30 10:49:57 2000 From: andy@reportlab.com (Andy Robinson) Date: Mon, 30 Oct 2000 10:49:57 -0000 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FD3885.56458C3C@lemburg.com> Message-ID: > The Asian codecs were just left out of the standard dist due > to size problems. ...and also due to not all being written yet :-) - Andy From mal@lemburg.com Mon Oct 30 11:44:14 2000 From: mal@lemburg.com (M.-A.
Lemburg) Date: Mon, 30 Oct 2000 12:44:14 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: Message-ID: <39FD5F0E.867C76A7@lemburg.com> Andy Robinson wrote: > > > The Asian codecs were just left out of the standard dist due > > to size problems. > > ...and also due to not all being written yet :-) Well, we could have included Tamito's codecs, but the general consent was not to due to the size of the mapping tables. I think that we ought to start a project for implementing the AsianCodecs package. I'll look into wrapping the C lib iconv interface into a codec package... provided I find some time :-( I've had a look at the IANA character set registry (http://www.isi.edu/in-notes/iana/assignments/character-sets) and compared the info to what we already have in Python 2.0. Here is a list of codecs which are not present in Python 2.0. It would be nice if someone with access to the various sources could help in putting together a few charmap codecs for these in case they are really needed (I think some EBCDIC codecs would be helpful for conversion of host data files)... Missing Codecs: ------------------------------------------------------------------------ ISO-2022-KR : RFC-1557 (see also KS_C_5601-1987) IBM00858 : IBM See (.../assignments/character-set-info/IBM00858) [Mahdi] DEC-MCS : VAX/VMS User's Manual, EBCDIC-UK : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 ISO-2022-CN : RFC-1922 MNEM : RFC 1345, also known as "mnemonic+ascii+8200" T.101-G2 : ECMA registry KOI8-U : RFC 2319 IBM880 : IBM NLS RM Vol2 SE09-8002-01, March 1990 Windows-31J : Windows Japanese. A further extension of Shift_JIS ISO_5427:1981 : ECMA registry JUS_I.B1.003-mac : ECMA registry ISO-8859-2-Windows-Latin-2 : Extended ISO 8859-2. Latin-2 for Windows 3.1. Adobe-Symbol-Encoding : PostScript Language Reference Manual IBM275 : IBM NLS RM Vol2 SE09-8002-01, March 1990 IT : ECMA registry EBCDIC-AT-DE-A : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 GB_1988-80 : ECMA registry DS_2089 : Danish Standard, DS 2089, February 1974 ISO-10646-UCS-Basic : ASCII subset of Unicode. Basic Latin = collection 1 EBCDIC-CA-FR : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 TIS-620 : Thai Industrial Standards Institute (TISI) [Tantsetthi] IBM-Symbols : Presentation Set, CPGID: 259 MNEMONIC : RFC 1345, also known as "mnemonic+ascii+38" CSA_Z243.4-1985-2 : ECMA registry ISO-8859-9-Windows-Latin-5 : Extended ISO 8859-9. Latin-5 for Windows 3.1 ISO-2022-JP : RFC-1468 (see also RFC-2237) GOST_19768-74 : ECMA registry DIN_66003 : ECMA registry EBCDIC-FR : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 ASMO_449 : ECMA registry ISO-Unicode-IBM-1276 : IBM Cyrillic Greek Extended Presentation Set, GCSGID: 1276 latin-greek : ECMA registry HZ-GB-2312 : RFC 1842, RFC 1843 [RFC1842, RFC1843] Big5-HKSCS : See (.../assignments/character-set-info/Big5-HKSCS) ISO-10646-UCS-4 : the full code space. (same comment about byte order, ISO-10646-UTF-1 : Universal Transfer Format (1), this is the multibyte ISO-10646-UCS-2 : the 2-octet Basic Multilingual Plane, aka Unicode CSA_Z243.4-1985-gr : ECMA registry latin-lap : ECMA registry EBCDIC-ES : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 UNKNOWN-8BIT : EBCDIC-FI-SE : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 BS_4730 : ECMA registry IBM290 : IBM 3174 Character Set Ref, GA27-3831-02, March 1990 IBM420 : IBM NLS RM Vol2 SE09-8002-01, March 1990, JIS_Encoding : JIS X 0202-1991. 
Uses ISO 2022 escape sequences to T.61-8bit : ECMA registry ISO-2022-CN-EXT : RFC-1922 Microsoft-Publishing : PCL 5 Comparison Guide, Hewlett-Packard, ISO-2022-JP-2 : RFC-1554 ISO_5428:1980 : ECMA registry Ventura-Math : PCL 5 Comparison Guide, Hewlett-Packard, EBCDIC-ES-S : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 windows-1251 : Microsoft (see ../character-set-info/windows-1251) [Lazhintseva] windows-1250 : Microsoft (see ../character-set-info/windows-1250) [Lazhintseva] windows-1253 : Microsoft (see ../character-set-info/windows-1253) [Lazhintseva] windows-1252 : Microsoft (see ../character-set-info/windows-1252) [Wendt] windows-1255 : Microsoft (see ../character-set-info/windows-1255) [Lazhintseva] windows-1254 : Microsoft (see ../character-set-info/windows-1254) [Lazhintseva] windows-1257 : Microsoft (see ../character-set-info/windows-1257) [Lazhintseva] windows-1256 : Microsoft (see ../character-set-info/windows-1256) [Lazhintseva] windows-1258 : Microsoft (see ../character-set-info/windows-1258) [Lazhintseva] JUS_I.B1.002 : ECMA registry ISO_8859-8-I : RFC-1556 CSA_Z243.4-1985-1 : ECMA registry JIS_X0212-1990 : ECMA registry ISO_5427 : ECMA registry ISO_6937-2-add : ECMA registry and ISO 6937-2:1983 ISO_8859-8-E : RFC-1556 BS_viewdata : ECMA registry IBM281 : IBM 3174 Character Set Ref, GA27-3831-02, March 1990 IBM280 : IBM NLS RM Vol2 SE09-8002-01, March 1990 IBM285 : IBM NLS RM Vol2 SE09-8002-01, March 1990 IBM284 : IBM NLS RM Vol2 SE09-8002-01, March 1990 Adobe-Standard-Encoding : PostScript Language Reference Manual ISO_646.irv:1983 : ECMA registry GB2312 : Chinese for People's Republic of China (PRC) mixed one byte, Extended_UNIX_Code_Fixed_Width_for_Japanese : Used in Japan. Each character is 2 octets. SEN_850200_B : ECMA registry SEN_850200_C : ECMA registry Ventura-International : Ventura International. ASCII plus coded characters similar ISO-Unicode-IBM-1265 : IBM Hebrew Presentation Set, GCSGID: 1265 ISO-Unicode-IBM-1264 : IBM Arabic Presentation Set, GCSGID: 1264 ISO-Unicode-IBM-1261 : IBM Latin-2, -3, -5, Extended Presentation Set, GCSGID: 1261 IBM851 : IBM NLS RM Vol2 SE09-8002-01, March 1990 PC8-Turkish : PC Latin Turkish. PCL Symbol Set id: 9T ISO_8859-supp : ECMA registry ISO-Unicode-IBM-1268 : IBM Latin-4 Extended Presentation Set, GCSGID: 1268 EBCDIC-ES-A : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 ISO-8859-1-Windows-3.0-Latin-1 : Extended ISO 8859-1 Latin-1 for Windows 3.0. IBM01149 : IBM See (.../assignments/character-set-info/IBM01149) [Mahdi] ECMA-cyrillic : ECMA registry IBM01147 : IBM See (.../assignments/character-set-info/IBM01147) [Mahdi] NATS-DANO-ADD : ECMA registry IBM01145 : IBM See (.../assignments/character-set-info/IBM01145) [Mahdi] IBM01144 : IBM See (.../assignments/character-set-info/IBM01144) [Mahdi] IBM01143 : IBM See (.../assignments/character-set-info/IBM01143) [Mahdi] IBM01141 : IBM See (.../assignments/character-set-info/IBM01141) [Mahdi] IBM01140 : IBM See (.../assignments/character-set-info/IBM01140) [Mahdi] macintosh : The Unicode Standard ver1.0, ISBN 0-201-56788-1, Oct 1991 IBM278 : IBM NLS RM Vol2 SE09-8002-01, March 1990 NS_4551-2 : ECMA registry IBM274 : IBM 3174 Character Set Ref, GA27-3831-02, March 1990 NS_4551-1 : ECMA registry JIS_C6226-1983 : ECMA registry ANSI_X3.110-1983 : ECMA registry IBM273 : IBM NLS RM Vol2 SE09-8002-01, March 1990 JIS_C6229-1984-b : ECMA registry greek7 : ECMA registry EUC-KR : RFC-1557 (see also KS_C_5861-1992) NF_Z_62-010 : ECMA registry JIS_X0201 : JIS X 0201-1976. 
One byte only, this is equivalent to IBM01146 : IBM See (.../assignments/character-set-info/IBM01146) [Mahdi] IBM01148 : IBM See (.../assignments/character-set-info/IBM01148) [Mahdi] ES : ECMA registry PT2 : ECMA registry INIS-cyrillic : ECMA registry NF_Z_62-010_(1973) : ECMA registry greek-ccitt : ECMA registry EBCDIC-AT-DE : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 JIS_C6229-1984-b-add : ECMA registry Big5 : Chinese for Taiwan Multi-byte set. MSZ_7795.3 : ECMA registry JIS_C6220-1969-ro : ECMA registry videotex-suppl : ECMA registry HP-Math8 : PCL 5 Comparison Guide, Hewlett-Packard, IBM01142 : IBM See (.../assignments/character-set-info/IBM01142) [Mahdi] HP-DeskTop : PCL 5 Comparison Guide, Hewlett-Packard, ISO_8859-6-I : RFC-1556 IBM00924 : IBM See (.../assignments/character-set-info/IBM00924) [Mahdi] JIS_C6229-1984-kana : ECMA registry IBM277 : IBM NLS RM Vol2 SE09-8002-01, March 1990 JUS_I.B1.003-serb : ECMA registry IBM870 : IBM NLS RM Vol2 SE09-8002-01, March 1990 IBM871 : IBM NLS RM Vol2 SE09-8002-01, March 1990 EBCDIC-FI-SE-A : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 IBM903 : IBM NLS RM Vol2 SE09-8002-01, March 1990 IBM904 : IBM NLS RM Vol2 SE09-8002-01, March 1990 VIQR : RFC 1456 EBCDIC-PT : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 JIS_C6220-1969-jp : ECMA registry ISO_10367-box : ECMA registry JIS_C6229-1984-hand-add : ECMA registry PC8-Danish-Norwegian : PC Danish Norwegian KS_C_5601-1987 : ECMA registry iso-ir-90 : ECMA registry greek7-old : ECMA registry us-dk : ISO-8859-1-Windows-3.1-Latin-1 : Extended ISO 8859-1 Latin-1 for Windows 3.1. IBM918 : IBM NLS RM Vol2 SE09-8002-01, March 1990 hp-roman8 : LaserJet IIP Printer User's Manual, IBM905 : IBM 3174 Character Set Ref, GA27-3831-02, March 1990 ISO_2033-1983 : ECMA registry IBM-Thai : Presentation Set, CPGID: 838 NATS-DANO : ECMA registry IBM868 : IBM NLS RM Vol2 SE09-8002-01, March 1990 IBM297 : IBM NLS RM Vol2 SE09-8002-01, March 1990 Latin-greek-1 : ECMA registry EBCDIC-US : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 IBM423 : IBM NLS RM Vol2 SE09-8002-01, March 1990 ISO_6937-2-25 : ECMA registry ES2 : ECMA registry NATS-SEFI : ECMA registry KSC5636 : ISO-10646-Unicode-Latin1 : ISO Latin-1 subset of Unicode. Basic Latin and Latin-1 GB_2312-80 : ECMA registry HP-Legal : PCL 5 Comparison Guide, Hewlett-Packard, ISO_8859-6-E : RFC-1556 Extended_UNIX_Code_Packed_Format_for_Japanese : Standardized by OSF, UNIX International, and UNIX Systems ISO_646.basic:1983 : ECMA registry INIS-8 : ECMA registry JIS_C6229-1984-hand : ECMA registry EBCDIC-DK-NO : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 VISCII : RFC 1456 INIS : ECMA registry PT : ECMA registry Ventura-US : Ventura US. ASCII plus characters typically used in CSN_369103 : ECMA registry JIS_C6226-1978 : ECMA registry IBM891 : IBM NLS RM Vol2 SE09-8002-01, March 1990 dk-us : EBCDIC-IT : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 UNICODE-1-1 : RFC 1641 UNICODE-1-1-UTF-7 : RFC 1642 JIS_C6229-1984-a : ECMA registry INVARIANT : HP-Pi-font : PCL 5 Comparison Guide, Hewlett-Packard, NATS-SEFI-ADD : ECMA registry IBM038 : IBM 3174 Character Set Ref, GA27-3831-02, March 1990 T.61-7bit : ECMA registry IEC_P27-1 : ECMA registry ISO-10646-J-1 : ISO 10646 Japanese, see RFC 1815. 
Shift_JIS : This charset is an extension of csHalfWidthKatakana by EBCDIC-DK-NO-A : IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 NC_NC00-10:81 : ECMA registry -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From tree@basistech.com Mon Oct 30 16:07:56 2000 From: tree@basistech.com (Tom Emerson) Date: Mon, 30 Oct 2000 11:07:56 -0500 (EST) Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FD5F0E.867C76A7@lemburg.com> References: <39FD5F0E.867C76A7@lemburg.com> Message-ID: <14845.40156.328377.745349@cymru.basistech.com> M.-A. Lemburg writes: > I think that we ought to start a project for implementing > the AsianCodecs package. Agreed. Basis Technology can host this effort if necessary. Regardless, I will volunteer to handle the Chinese encodings, including: GB 2312-80, CP936 Big Five, Big Five Plus, CP950, Big Five HKSCS and others in the IANA list not mentioned above. I would also suggest that someone (perhaps based on Tamito's work) implement a general purpose ISO-2022 codec that can be subclassed for the locale specific environments. -tree -- Tom Emerson Basis Technology Corp. Zenkaku Language Hacker http://www.basistech.com "Beware the lollipop of mediocrity: lick it once and you suck forever" From andy@reportlab.com Mon Oct 30 16:13:07 2000 From: andy@reportlab.com (Andy Robinson) Date: Mon, 30 Oct 2000 16:13:07 -0000 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <14845.40156.328377.745349@cymru.basistech.com> Message-ID: > M.-A. Lemburg writes: > > I think that we ought to start a project for implementing > > the AsianCodecs package. > > Agreed. Basis Technology can host this effort if > necessary. Regardless, I will volunteer to handle the Chinese > encodings, including: > > GB 2312-80, CP936 > Big Five, Big Five Plus, CP950, Big Five HKSCS > > and others in the IANA list not mentioned above. > > I would also suggest that someone (perhaps based on > Tamito's work) > implement a general purpose ISO-2022 codec that can be > subclassed for > the locale specific environments. > I agree this needs to move. I'm sig coordinator and really should have done more myself this summer. Tom, thanks for the offer of hosting, but can anyone see a reason NOT to use Sourceforge? Python itself and pyxml are there, many people have accounts, and it makes it a lot easier for us to set up web-based test harnesses and eventually for Guido et al to grab what they need if they need it. I can set up a pyi18n project right away, unless someone can come up with a more marketable name :-) I'll volunteer to at least set up a test harness in December so that people can submit jobs to it via the web asking it to translate files; and I have two good authors of Python extensions working with me who can help out. - Andy From tree@basistech.com Mon Oct 30 16:49:17 2000 From: tree@basistech.com (Tom Emerson) Date: Mon, 30 Oct 2000 11:49:17 -0500 (EST) Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: References: <14845.40156.328377.745349@cymru.basistech.com> Message-ID: <14845.42637.157421.378878@cymru.basistech.com> Andy Robinson writes: > Tom, thanks for the offer of hosting, but can anyone see a reason > NOT to use Sourceforge? > Python itself and pyxml are > there, many people have accounts, and it makes it [...] No, this makes much more sense. My SourceForge username is "tree".
-tree -- Tom Emerson Basis Technology Corp. Zenkaku Language Hacker http://www.basistech.com "Beware the lollipop of mediocrity: lick it once and you suck forever" From andy@reportlab.com Mon Oct 30 20:18:57 2000 From: andy@reportlab.com (Andy Robinson) Date: Mon, 30 Oct 2000 20:18:57 -0000 Subject: [I18n-sig] Sourceforge project name In-Reply-To: <14845.42637.157421.378878@cymru.basistech.com> Message-ID: OK, we can set up a sourceforge project. We need a name for it, which cannot be changed later; this does not affect the name of any Python packages we produce later. I'll suggest 'pycodecs'. But I'm feeling uninspired tonight and hope someone can come up with some other ideas. If there are no other suggestions in 24 hours, I'll set it up. Who wants checkin and admin rights? - Andy From mal@lemburg.com Mon Oct 30 20:32:48 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Mon, 30 Oct 2000 21:32:48 +0100 Subject: [I18n-sig] Sourceforge project name References: Message-ID: <39FDDAF0.5A7BDF45@lemburg.com> Andy Robinson wrote: > > OK, we can set up a sourceforge project. We need a name for it, which > cannot be changed later; this does not affect the name of any Python > packages we produce later. > > I'll suggest 'pycodecs'. But I'm feeling uninspired tonight and hope > someone can come up with some other ideas. If there are no other > suggestions in 24 hours, I'll set it up. I'd suggest 'python-codecs' -- why not use the obvious ;-) -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From larsga@garshol.priv.no Tue Oct 31 10:56:25 2000 From: larsga@garshol.priv.no (Lars Marius Garshol) Date: 31 Oct 2000 11:56:25 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FADA8B.8D5FE731@lemburg.com> References: <200010252312.BAA01255@loewis.home.cs.tu-berlin.de> <200010272124.XAA00854@loewis.home.cs.tu-berlin.de> <39FADA8B.8D5FE731@lemburg.com> Message-ID: * Martin von Loewis | | Again, I'd see no problem including Tamito Kajiyama's code in PyXML, | if he wants us to ship it - or we could recommend JapaneseCodecs as a | valuable addition to PyXML; this package also uses the distutils, so | it is quite easy to install. * mal@lemburg.com | | I think it should be distributed as a separate package: the codecs | are useful in a lot of contexts -- not only XML. Agreed. Anyone who wants the codecs at all will want them regardless of whether they want the XML package or not. --Lars M. From larsga@garshol.priv.no Tue Oct 31 11:01:24 2000 From: larsga@garshol.priv.no (Lars Marius Garshol) Date: 31 Oct 2000 12:01:24 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: References: Message-ID: * Lars Marius Garshol | | That's only Shift-JIS and EUC-JP, though. Is there any concerted | effort afoot to make a more complete set? At the very least, ISO | 2022-JP, Big5, VISCII, GB-2312 and EUC-KR should be implemented. * Andy Robinson | | That was the intention, but I admit we have run out of steam | somewhat. Tamito Kajiyama is the only person to have made a really | big contribution. [...] Volunteers welcome! Then I may have a go at it if I can find the time. I've written codecs for all these in C++ over the past few weeks, so it should be a simple job to redo it for Python. (It was for a closed-source project, so it can unfortunately not be reused directly.)
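A minimal sketch of how such a codec's table might be built from one of the unicode.org mapping files discussed just below; the file name CP950.TXT and the two-column, tab-separated hex layout are assumptions based on the published table format, and error handling is omitted:

    # Sketch: build a decoding map from a unicode.org-style mapping
    # table (two hex columns, '#' starts a comment).
    # CP950.TXT is an illustrative file name.
    def load_mapping(filename):
        decoding_map = {}
        for line in open(filename).readlines():
            line = line.split('#')[0].strip()
            fields = line.split()
            if len(fields) < 2:
                continue                        # blank or unmapped entry
            native = int(fields[0], 16)         # e.g. 0xA440
            code_point = int(fields[1], 16)     # e.g. 0x4E00
            decoding_map[native] = code_point
        return decoding_map

    # Cross-checking two independent sources, as suggested below:
    #     assert load_mapping('CP950.TXT') == load_mapping('other.txt')

The inverse (encoding) table is just this dictionary flipped, which also makes round-trip sanity checks cheap.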
| However, no sane person retypes mapping tables; if we built | something Pythonic we'd hopefully do it by extracting data from two | different sources, building our own tables and checking they got | identical results. www.unicode.org provides mapping tables that are really easy to parse with a Python script in order to build tables. | With compression into a Zip file and careful use of diff-like | techniques (all the obscure Asian codecs go like 'take this base | encoding and add these extra code points'), I believe a good codec | database could be quite small. My binary collection of conversion tables for ISO 8859 1->15, Windows-12xx, koi8-r, VISCII, Shift-JIS, EUC-JP, ISO 2022-JP, Big5, EUC-KR and GB-2312 is about 90k. --Lars M. From martin@loewis.home.cs.tu-berlin.de Mon Oct 30 22:30:59 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Mon, 30 Oct 2000 23:30:59 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> (message from Tamito KAJIYAMA on Mon, 30 Oct 2000 17:38:11 +0900) References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> Message-ID: <200010302230.XAA00795@loewis.home.cs.tu-berlin.de> > I prefer the latter approach. I want Python to take care of all > encoding issues, and if possible I want to write applications > without considering which encodings can be handled at the core > language level. Exactly my feelings. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Mon Oct 30 22:45:42 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Mon, 30 Oct 2000 23:45:42 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FD3885.56458C3C@lemburg.com> (mal@lemburg.com) References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> <39FD3885.56458C3C@lemburg.com> Message-ID: <200010302245.XAA00903@loewis.home.cs.tu-berlin.de> > I don't think that reinventing the wheel for the sake of > cross-platform compatibility is a bad thing. I think it is a bad thing if it deviates from the traditional Python approach to cross-platform solutions, which is to unify different interfaces into a single one (e.g. os, threads, anydbm, DB API, even Tkinter). Traditionally, Python provides a fall-back if the platform does not provide the functionality. With the codecs, Python provides the functionality even though the platform provides it as well, and in a better way. That is a bad thing, but I hope it can be corrected in future releases. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Mon Oct 30 22:59:45 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Mon, 30 Oct 2000 23:59:45 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <39FD50D6.11FBF133@lemburg.com> (mal@lemburg.com) References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> <39FD50D6.11FBF133@lemburg.com> Message-ID: <200010302259.XAA01048@loewis.home.cs.tu-berlin.de> > If you don't want to bother with importing the codec packages > in your application, you can use the sitecustomize.py module > to do the imports at startup time. I don't think that's an acceptable solution. Having the superuser modify sitecustomize.py won't work on most installations.
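For reference, the sitecustomize.py approach quoted above amounts to something like the following sketch; the package name japanese and the convention that importing it registers its codecs are assumptions about how such a codec package would behave:

    # sitecustomize.py -- imported automatically at interpreter startup.
    # Pull in site-installed codec packages so their codecs are
    # registered before any application code runs.
    try:
        import japanese     # hypothetical codec package that registers
                            # its search function on import
    except ImportError:
        pass                # package not installed on this machine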
> Another possible approach would be creating a new codec top > level package "sitecodecs" which is then used as a pool for > all site specific codecs and also searched by the encodings search > function if present. That's one solution; another is to have distutils support installation of additional codecs, and making them "known". > The standard distribution will probably not include the > Asian codecs -- just like it doesn't include all the other > goodies which people are fond of. Instead, Python distribution > packagers like ActivePython will ship versions of Python which > include these extra packages. > > At least that's the idea behind keeping the Python core rather > small and maintainable. Myself, I don't see that as a problem. What *is* important is that users requiring additional codecs can be given instructions as simple as 'python setup.py install'. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Mon Oct 30 22:29:42 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Mon, 30 Oct 2000 23:29:42 +0100 Subject: [Distutils] Re: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <39FD35E8.17B4AF5C@lemburg.com> (mal@lemburg.com) References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010282026.WAA00748@loewis.home.cs.tu-berlin.de> <39FD35E8.17B4AF5C@lemburg.com> Message-ID: <200010302229.XAA00761@loewis.home.cs.tu-berlin.de> > No. You wouldn't hide these ImportErrors if you rely on the > packages being installed. That's my point. Most applications don't really *rely* on them. It's a matter of the data the user passes to the application. If data are processed whose encoding is not known, then the application will get a UnicodeError, which it will translate to some error message in advance. It doesn't know or care what the specific encoding was - the application itself only deals with Unicode strings. > If the application doesn't care for the specific encodings being > installed, then the administrator could add these imports to the > sitecustomize.py module after installing the codec packages. I don't > think that doing this automatically is a good idea. I think it is, and apparently Tamito thinks that way as well, since the JapaneseCodecs install themselves into the Python installation, to be always available. You shouldn't attempt to declare some feature as useless when there is a demonstrated need for it - instead, you should strive to provide the feature somehow if you disagree with the mechanism that is currently used to achieve it. Regards, Martin From mal@lemburg.com Tue Oct 31 14:46:31 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Tue, 31 Oct 2000 15:46:31 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> <39FD3885.56458C3C@lemburg.com> <200010302245.XAA00903@loewis.home.cs.tu-berlin.de> Message-ID: <39FEDB47.41E89785@lemburg.com> "Martin v. Loewis" wrote: > > > I don't think that reinventing the wheel for the sake of > > cross-platform compatibility is a bad thing. > > I think it is a bad thing if it deviates from the traditional Python > approach to cross-platform solutions, which is to unify different interfaces > into a single one (e.g. os, threads, anydbm, DB API, even > Tkinter). Traditionally, Python provides a fall-back if the platform > does not provide the functionality.
> With the codecs, Python provides > the functionality even though the platform provides it as well, and in > a better way. That is a bad thing, but I hope it can be corrected in > future releases. I don't get it: Python provides a new Unicode type on all platforms, even ones which don't support Unicode at all. And even better: it is guaranteed to work the same on all platforms. You'll never get the same level of conformance with platform specific codecs since there are simply too many issues involved. Besides, what's the point in arguing against having a fairly complete set of codecs in the standard distribution ? You can always revert to using platform specific codecs if you like (provided you install a proper codec package first). -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From mal@lemburg.com Tue Oct 31 14:49:43 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Tue, 31 Oct 2000 15:49:43 +0100 Subject: [Distutils] Re: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010282026.WAA00748@loewis.home.cs.tu-berlin.de> <39FD35E8.17B4AF5C@lemburg.com> <200010302229.XAA00761@loewis.home.cs.tu-berlin.de> Message-ID: <39FEDC07.9B229A7D@lemburg.com> "Martin v. Loewis" wrote: > > > No. You wouldn't hide these ImportErrors if you rely on the > > packages being installed. > > That's my point. Most applications don't really *rely* on them. It's a > matter of the data the user passes to the application. If data are > processed whose encoding is not known, then the application will get a > UnicodeError, which it will translate to some error message in > advance. It doesn't know or care what the specific encoding was - the > application itself only deals with Unicode strings. Ok, but why is installing the codecs using a separate package and then adding them to site.py such a pain ? > > If the application doesn't care for the specific encodings being > > installed, then the administrator could add these imports to the > > sitecustomize.py module after installing the codec packages. I don't > > think that doing this automatically is a good idea. > > I think it is, and apparently Tamito thinks that way as well, since > the JapaneseCodecs install themselves into the Python installation, to > be always available. No third-party package should install itself into the standard Python installation. That's simply wrong. > You shouldn't attempt to declare some feature as > useless when there is a demonstrated need for it - instead, you should > strive to provide the feature somehow if you disagree with the > mechanism that is currently used to achieve it. Which I did by proposing to have the encodings search function try to import a package called 'sitecodecs'. The JapaneseCodecs package would then install itself into that site specific host package. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From mal@lemburg.com Tue Oct 31 15:02:28 2000 From: mal@lemburg.com (M.-A.
Lemburg) Date: Tue, 31 Oct 2000 16:02:28 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> <39FD50D6.11FBF133@lemburg.com> <200010302259.XAA01048@loewis.home.cs.tu-berlin.de> Message-ID: <39FEDF04.B604E280@lemburg.com> "Martin v. Loewis" wrote: > > > If you don't want to bother with importing the codec packages > > in your application, you can use the sitecustomize.py module > > to do the imports at startup time. > > I don't think that's an acceptable solution. Having the superuser > modify sitecustomize.py won't work on most installations. Point taken. BTW, I've just had an idea which could help us out: when requesting a codec you can also specify a package name, e.g. say you have a codec package "mycodecs" which contains a codec "my_utf_8.py". Then you can write: u"abc".encode("mycodecs.my_utf_8") and the encodings search function will take care of the rest. Wouldn't this solve at least some of the problems ? Adding the "import sitecodecs" to site.py would be an additional help. But it would create naming problems, which would be avoided by using the above scheme. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 18:09:33 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 19:09:33 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FEDB47.41E89785@lemburg.com> (mal@lemburg.com) References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> <39FD3885.56458C3C@lemburg.com> <200010302245.XAA00903@loewis.home.cs.tu-berlin.de> <39FEDB47.41E89785@lemburg.com> Message-ID: <200010311809.TAA00764@loewis.home.cs.tu-berlin.de> > I don't get it: Python provides a new Unicode type on all platforms, > even ones which don't support Unicode at all. And even better: > it is guaranteed to work the same on all platforms. I don't have too many problems with the Unicode type itself (although I had preferred a solution that uses wchar_t as the element type where available). > You'll never get the same level of conformance with platform > specific codecs since there are simply too many issues involved. > > Besides, what's the point in arguing against having a fairly > complete set of codecs in the standard distribution ? Speed, perhaps? What is the point of having the bsddb module when we have dumbdbm? Regards, Martin From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 18:25:26 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 19:25:26 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <39FEDF04.B604E280@lemburg.com> (mal@lemburg.com) References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> <39FD50D6.11FBF133@lemburg.com> <200010302259.XAA01048@loewis.home.cs.tu-berlin.de> <39FEDF04.B604E280@lemburg.com> Message-ID: <200010311825.TAA01019@loewis.home.cs.tu-berlin.de> > u"abc".encode("mycodecs.my_utf_8") > > and the encodings search function will take care of the rest. > > Wouldn't this solve at least some of the problems ? I don't think so. The class of applications that I think will need codecs first are "internet" applications: processors of HTML, XML, MIME.
In all these cases, some well-established encoding name is used, which should be provided by Python "as-is". That is, if you receive data in "shift-jis", having to map this to "japanese.shift_jis" is just the same as requiring that "japanese" is imported up-front. In the applications that I see, the application does not want to know what "shift-jis" is - it just wants Python to convert that to Unicode. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 18:21:27 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 19:21:27 +0100 Subject: [Distutils] Re: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <39FEDC07.9B229A7D@lemburg.com> (mal@lemburg.com) References: <200010262342.IAA16679@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010270110.KAA16842@dhcp198.grad.sccs.chukyo-u.ac.jp> <200010272140.XAA00906@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010282026.WAA00748@loewis.home.cs.tu-berlin.de> <39FD35E8.17B4AF5C@lemburg.com> <200010302229.XAA00761@loewis.home.cs.tu-berlin.de> <39FEDC07.9B229A7D@lemburg.com> Message-ID: <200010311821.TAA00975@loewis.home.cs.tu-berlin.de> > Ok, but why is installing the codecs using a separate package and > then adding them to site.py such a pain ? You mean sitecustomize.py, I assume. Because hand-editing files as part of the installation process is a pain, and because automatic adding of it is not feasible. To give an analogy, for years, Emacs users had to edit their .emacs files to integrate additional packages. Eventually, various means to support automatic adding of stuff were added, and user interfaces were designed to allow interactive manipulation of the settings that still needed user input (the customize mechanism). Having a place where customization can be added is but a start to support other people's work. > > I think it is, and apparently Tamito thinks that way as well, since > > the JapaneseCodecs install themselves into the Python installation, to > > be always available. > > No third-party package should install itself into the standard > Python installation. That's simply wrong. Well, I agree it's wrong - however I also appreciate the problem that this solves (which apparently you don't). Tell him a different but "better" way to achieve the same effect, and he'll probably use it. > Which I did by proposing to have the encodings search function > try to import a package called 'sitecodecs'. The JapaneseCodecs > package would then install itself into that site specific > host package. Ok, that is something that may work in Python 2.1, so some mechanism for the time being is required. Although, thinking about it - how exactly would that work? Suppose all files of JapaneseCodecs are in a single directory, call it "japanese". Then, this directory is copied into /usr/local/lib/python2.0/site-packages/sitecodecs/japanese. I can't see how that alone would make it available, even if codecs.py would import sitecodecs. More generally: how would multiple codecs installed into sitecodecs coordinate with each other? Regards, Martin From mal@lemburg.com Tue Oct 31 18:32:13 2000 From: mal@lemburg.com (M.-A.
Lemburg) Date: Tue, 31 Oct 2000 19:32:13 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> <39FD3885.56458C3C@lemburg.com> <200010302245.XAA00903@loewis.home.cs.tu-berlin.de> <39FEDB47.41E89785@lemburg.com> <200010311809.TAA00764@loewis.home.cs.tu-berlin.de> Message-ID: <39FF102D.70EC8172@lemburg.com> "Martin v. Loewis" wrote: > > > I don't get it: Python provides a new Unicode type on all platforms, > > even ones which don't support Unicode at all. And even better: > > it is guaranteed to work the same on all platforms. > > I don't have too many problems with the Unicode type itself (although > I had preferred a solution that uses wchar_t as the element type where > available). The implementation does use wchar_t where available and usable (meaning that sizeof(wchar_t) == 2). > > You'll never get the same level of conformance with platform > > specific codecs since there are simply too many issues involved. > > > > Besides, what's the point in arguing against having a fairly > > complete set of codecs in the standard distribution ? > > Speed, perhaps? What is the point of having the bsddb module when we > have dumbdbm? If you need speed, you can always write your own codecs and use them for processing your data. The codec registry is open in all directions... -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 18:28:58 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 19:28:58 +0100 Subject: [I18n-sig] Sourceforge project name In-Reply-To: References: Message-ID: <200010311828.TAA01049@loewis.home.cs.tu-berlin.de> > I'll suggest 'pycodecs'. But I'm feeling uninspired tonight and hope > someone can come up with some other ideas. If there are no other > suggestions in 24 hours, I'll set it up. I think that's as good as any. > Who wants checkin and admin rights? Me, checkin. Regards, Martin From tree@basistech.com Tue Oct 31 18:10:42 2000 From: tree@basistech.com (Tom Emerson) Date: Tue, 31 Oct 2000 13:10:42 -0500 (EST) Subject: [I18n-sig] Big5 Codecs In-Reply-To: <200010311832.CAA19990@ms5.hinet.net> References: <200010311832.CAA19990@ms5.hinet.net> Message-ID: <14847.2850.566354.346304@cymru.basistech.com> Frank J.S. Chen writes: > Hello, > > I have made the Big5 Codecs, and it is workable. Excellent! A couple of questions: a) What source did you use for the mapping table? b) How do you handle EUDC code-points? -tree -- Tom Emerson Basis Technology Corp. Zenkaku Language Hacker http://www.basistech.com "Beware the lollipop of mediocrity: lick it once and you suck forever" From mal@lemburg.com Tue Oct 31 18:38:27 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Tue, 31 Oct 2000 19:38:27 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> <39FD50D6.11FBF133@lemburg.com> <200010302259.XAA01048@loewis.home.cs.tu-berlin.de> <39FEDF04.B604E280@lemburg.com> <200010311825.TAA01019@loewis.home.cs.tu-berlin.de> Message-ID: <39FF11A3.D33D312D@lemburg.com> "Martin v. Loewis" wrote: > > > u"abc".encode("mycodecs.my_utf_8") > > > > and the encodings search function will take care of the rest. > > > > Wouldn't this solve at least some of the problems ?
> > I don't think so. The class of applications that I think will need > codecs first are "internet" applications: processors of HTML, XML, > MIME. In all these cases, some well-established encoding name is used, > which should be provided by Python "as-is". That is, if you receive > data in "shift-jis", having to map this to "japanese.shift_jis" is > just the same as requiring that "japanese" is imported up-front. In > the applications that I see, the application does not want to know > what "shift-jis" is - it just wants Python to convert that to Unicode. Then have your application register a new search function which does the necessary aliasing. Another possibility would be dropping your shift_jis.py codec into the sitecodecs package... at your own risk, though, since it might overwrite some already installed codec. Using the fully qualified name helps in case you want to use different codec implementations for the same encoding. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From tree@basistech.com Tue Oct 31 19:39:41 2000 From: tree@basistech.com (Tom Emerson) Date: Tue, 31 Oct 2000 14:39:41 -0500 (EST) Subject: [I18n-sig] Big5 Codecs In-Reply-To: <200010311954.DAA08848@ms5.hinet.net> References: <200010311954.DAA08848@ms5.hinet.net> Message-ID: <14847.8189.919788.689626@cymru.basistech.com> Frank J.S. Chen writes: > > a) What source did you use for the mapping table? > > It follows the proposal issued by M.A. Lemburg. > BIG5 encoding can map to Unicode encoding and reversely. But the Unicode Consortium's mapping table does not round-trip Big 5 --- so where did you get the table? > There are Level 1 and Level 2 in BIG5, so I define them apart. > This table is complete, but I just make a small test, not well-tested > indeed. I have a few megabytes of Big Five encoded text --- I'll test it out. ;-) > > b) How do you handle EUDC code-points? > > What is EUDC code point? I cannot find this field name in > the BMP layout. EUDC is the End-User Defined Character region, the 3rd level of Big 5. Several groups, including HKUST, the Hong Kong government, and the Taiwan military, define characters in the 3rd region. Other Big 5 extensions, such as ETen, also use this block. EUDC is divided into three segments: 0xFA40 -- 0xFEFE, 0x8E40 -- 0xA0FE, and 0x8140 -- 0x8DFE. -tree -- Tom Emerson Basis Technology Corp. Zenkaku Language Hacker http://www.basistech.com "Beware the lollipop of mediocrity: lick it once and you suck forever" From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 20:15:21 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 21:15:21 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat In-Reply-To: <39FF102D.70EC8172@lemburg.com> (mal@lemburg.com) References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> <39FD3885.56458C3C@lemburg.com> <200010302245.XAA00903@loewis.home.cs.tu-berlin.de> <39FEDB47.41E89785@lemburg.com> <200010311809.TAA00764@loewis.home.cs.tu-berlin.de> <39FF102D.70EC8172@lemburg.com> Message-ID: <200010312015.VAA01520@loewis.home.cs.tu-berlin.de> > The implementation does use wchar_t where available and usable > (meaning that sizeof(wchar_t) == 2). There is probably not much point in rehashing the entire discussion, but I'd think that wchar_t is usable in more cases; specifically on Linux, where it is defined to hold ISO 10646 characters.
Requiring that the elements of a Unicode string have only two bytes will cause problems in the long run, IMHO, since it will lead the way to UTF-16, which is utter nonsense. > If you need speed, you can always write your own codecs and > use them for processing your data. The codec registry is > open in all directions... Yes, that is certainly one of the elegant aspects of the Python Unicode support. Regards, Martin From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 20:20:06 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 21:20:06 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 In-Reply-To: <39FF11A3.D33D312D@lemburg.com> (mal@lemburg.com) References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> <39FD50D6.11FBF133@lemburg.com> <200010302259.XAA01048@loewis.home.cs.tu-berlin.de> <39FEDF04.B604E280@lemburg.com> <200010311825.TAA01019@loewis.home.cs.tu-berlin.de> <39FF11A3.D33D312D@lemburg.com> Message-ID: <200010312020.VAA01566@loewis.home.cs.tu-berlin.de> > Then have your application register a new search function which > does the necessary aliasing. That's my point: The application author does not *know* all the aliases in advance. She expects Python to know about these things, for the same reason that Python knows about many other things (e.g. how base64 works, or how to find out the format of a database by just looking at a few bytes). > Another possibility would be dropping your shift_jis.py codec > into the sitecodecs package... at your own risk, though, since > it might overwrite some already installed codec. See, this just won't work. If I have a package sitecodecs, and codecs.py just does "import sitecodecs", then shift_jis.py will not be considered for conversions. > Using the fully qualified name helps in case you want to use > different codec implementations for the same encoding. If you want to do that, you can instantiate the codec yourself, and use the codec interface. No need to play magic with the strings passed to .encode(). Regards, Martin From mal@lemburg.com Tue Oct 31 20:25:45 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Tue, 31 Oct 2000 21:25:45 +0100 Subject: [I18n-sig] Re: [XML-SIG] Character encodings and expat References: <200010290850.JAA21751@loewis.home.cs.tu-berlin.de> <39FD3885.56458C3C@lemburg.com> <200010302245.XAA00903@loewis.home.cs.tu-berlin.de> <39FEDB47.41E89785@lemburg.com> <200010311809.TAA00764@loewis.home.cs.tu-berlin.de> <39FF102D.70EC8172@lemburg.com> <200010312015.VAA01520@loewis.home.cs.tu-berlin.de> Message-ID: <39FF2AC9.F1FD4BAB@lemburg.com> "Martin v. Loewis" wrote: > > > The implementation does use wchar_t where available and usable > > (meaning that sizeof(wchar_t) == 2). > > There is probably not much point in rehashing the entire discussion, > but I'd think that wchar_t is usable in more cases; specifically on > Linux, where it is defined to hold ISO 10646 characters. You probably mean: UCS-4... > Requiring > that the elements of a Unicode string have only two bytes will cause > problems in the long run, IMHO, since it will lead the way to UTF-16, > which is utter nonsense. We can always move on to UCS-4 at some later point. Right now, Python's Unicode internals are defined to be UTF-16 without support for surrogates... which means UCS-2 in most cases. BTW, there are conversion C APIs available to interface directly with the C library's native wchar_t type.
The APIs also have optimizations to copy data only when the sizeof() values differ. -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/ From martin@loewis.home.cs.tu-berlin.de Tue Oct 31 20:21:27 2000 From: martin@loewis.home.cs.tu-berlin.de (Martin v. Loewis) Date: Tue, 31 Oct 2000 21:21:27 +0100 Subject: [I18n-sig] Big5 Codecs In-Reply-To: <200010311832.CAA19990@ms5.hinet.net> (frank63@ms5.hinet.net) References: <200010311832.CAA19990@ms5.hinet.net> Message-ID: <200010312021.VAA01596@loewis.home.cs.tu-berlin.de> > See the attached .tgz file. Maybe I'm missing something, but - where is the file? Martin From mal@lemburg.com Tue Oct 31 20:30:07 2000 From: mal@lemburg.com (M.-A. Lemburg) Date: Tue, 31 Oct 2000 21:30:07 +0100 Subject: [I18n-sig] Codecs for Big Five and GB 2312 References: <200010272228.AAA01066@loewis.home.cs.tu-berlin.de> <39FADBCC.B19B65C9@lemburg.com> <200010300838.RAA20190@dhcp198.grad.sccs.chukyo-u.ac.jp> <39FD50D6.11FBF133@lemburg.com> <200010302259.XAA01048@loewis.home.cs.tu-berlin.de> <39FEDF04.B604E280@lemburg.com> <200010311825.TAA01019@loewis.home.cs.tu-berlin.de> <39FF11A3.D33D312D@lemburg.com> <200010312020.VAA01566@loewis.home.cs.tu-berlin.de> Message-ID: <39FF2BCF.A2DE1C55@lemburg.com> "Martin v. Loewis" wrote: > > > Then have your application register a new search function which > > does the necessary aliasing. > > That's my point: The application author does not *know* all the > aliases in advance. She expects Python to know about these things, for > the same reason that Python knows about many other things (e.g. how > base64 works, or how to find out the format of a database by just > looking at a few bytes). Python can't possibly know all encoding aliases no matter how hard we try. That's why the codec interface is open and allows you to add aliasing support wherever needed. Note that MIME support works in a very similar way... you can register your own new MIME types at run-time in case you need them. > > Another possibility would be dropping your shift_jis.py codec > > into the sitecodecs package... > > See, this just won't work. If I have a package sitecodecs, and > codecs.py just does "import sitecodecs", then shift_jis.py will not be > considered for conversions. Sure it will: the encodings search function would be extended to look in that specific package too. > > Using the fully qualified name helps in case you want to use > > different codec implementations for the same encoding. > > If you want to do that, you can instantiate the codec yourself, and > use the codec interface. No need to play magic with the strings passed > to .encode(). But then you won't be able to pass the encoding string through any aliasing engine... and that's what you intended, right ? -- Marc-Andre Lemburg ______________________________________________________________________ Business: http://www.lemburg.com/ Python Pages: http://www.lemburg.com/python/
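To make the mechanism debated above concrete: under the Python 2.0 codec API, the aliasing search function Marc-Andre describes could look roughly like the sketch below. The japanese package name, its module layout, and the alias table contents are assumptions; only codecs.register() and the getregentry() convention used by codec modules come from the standard distribution.

    import codecs

    # Map well-established IANA names onto modules of an installed
    # codec package (package name and layout are hypothetical).
    _aliases = {
        'shift-jis': 'japanese.shift_jis',
        'shift_jis': 'japanese.shift_jis',
        'euc-jp':    'japanese.euc_jp',
    }

    def _alias_search(encoding):
        modname = _aliases.get(encoding.lower())
        if modname is None:
            return None     # unknown here; let other search functions try
        # The fromlist argument makes __import__ return the submodule
        # itself rather than the top-level package.
        mod = __import__(modname, globals(), locals(), ['*'])
        # Codec modules expose getregentry(), which returns the tuple
        # (encoder, decoder, streamreader, streamwriter).
        return mod.getregentry()

    codecs.register(_alias_search)

    # Afterwards the well-established name simply works:
    #     u"abc".encode("shift-jis")

An application preferring the dotted-name scheme could use the same hook: if the requested name contains a dot, import it as a module path and return its getregentry() result.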