[Python-checkins] CVS: python/dist/src/Lib locale.py,1.4,1.5

Marc-Andre Lemburg python-dev@python.org
Wed, 7 Jun 2000 02:11:43 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory slayer.i.sourceforge.net:/tmp/cvs-serv17332/Lib

Modified Files:
	locale.py 
Log Message:
Marc-Andre Lemburg <mal@lemburg.com>:
Added a new locale name aliasing engine which also supports
locale encodings, a feature which is used by the new default
encoding support in site.py.

Index: locale.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/locale.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** locale.py	2000/02/04 15:39:29	1.4
--- locale.py	2000/06/07 09:11:40	1.5
***************
*** 1,9 ****
! """Support for number formatting using the current locale settings."""
  
! # Author: Martin von Loewis
  
! from _locale import *
  import string
  
  #perform the grouping from right to left
  def _group(s):
--- 1,25 ----
! """ Locale support.
  
!     The module provides low-level access to the C lib's locale APIs
!     and adds high level number formatting APIs as well as a locale
!     aliasing engine to complement these.
  
!     The aliasing engine includes support for many commonly used locale
!     names and maps them to values suitable for passing to the C lib's
!     setlocale() function. It also includes default encodings for all
!     supported locale names.
! 
! """
! 
  import string
  
+ ### C lib locale APIs
+ 
+ from _locale import *
+ 
+ ### Number formatting APIs
+ 
+ # Author: Martin von Loewis
+ 
  #perform the grouping from right to left
  def _group(s):
***************
*** 26,30 ****
              result=s[-group:]
          s=s[:-group]
!     if s and result:
          result=s+conv['thousands_sep']+result
      return result
--- 42,48 ----
              result=s[-group:]
          s=s[:-group]
!     if not result:
!         return s
!     if s:
          result=s+conv['thousands_sep']+result
      return result
***************
*** 35,39 ****
      Grouping is applied if the third parameter is true."""
      result = f % val
!     fields = string.splitfields(result,".")
      if grouping:
          fields[0]=_group(fields[0])
--- 53,57 ----
      Grouping is applied if the third parameter is true."""
      result = f % val
!     fields = string.split(result, ".")
      if grouping:
          fields[0]=_group(fields[0])
***************
*** 52,60 ****
      "Parses a string as a float according to the locale settings."
      #First, get rid of the grouping
!     s=string.splitfields(str,localeconv()['thousands_sep'])
!     str=string.join(s,"")
      #next, replace the decimal point with a dot
!     s=string.splitfields(str,localeconv()['decimal_point'])
!     str=string.join(s,'.')
      #finally, parse the string
      return func(str)
--- 70,82 ----
      "Parses a string as a float according to the locale settings."
      #First, get rid of the grouping
!     ts = localeconv()['thousands_sep']
!     if ts:
!         s=string.split(str,ts)
!         str=string.join(s, "")
      #next, replace the decimal point with a dot
!     dd = localeconv()['decimal_point']
!     if dd:
!         s=string.split(str,dd)
!         str=string.join(s,'.')
      #finally, parse the string
      return func(str)
***************
*** 64,68 ****
      return atof(str,string.atoi)
  
! def test():
      setlocale(LC_ALL,"")
      #do grouping
--- 86,90 ----
      return atof(str,string.atoi)
  
! def _test():
      setlocale(LC_ALL,"")
      #do grouping
***************
*** 72,77 ****
      s1=str(3.14)
      print s1,"is",atof(s1)
      
  
  if __name__=='__main__':
!     test()
--- 94,571 ----
      s1=str(3.14)
      print s1,"is",atof(s1)
+ 
+ ### Locale name aliasing engine
+ 
+ # Author: Marc-Andre Lemburg, mal@lemburg.com
+ 
+ def normalize(localename):
+ 
+     """ Returns a normalized locale code for the given locale
+         name.
+ 
+         The returned locale code is formatted for use with
+         setlocale().
+ 
+         If normalization fails, the original name is returned
+         unchanged.
+ 
+         If the given encoding is not known, the function defaults to
+         the default encoding for the locale code just like setlocale()
+         does.
+ 
+     """
+     # Normalize the locale name and extract the encoding
+     fullname = string.lower(localename)
+     if ':' in fullname:
+         # ':' is sometimes used as encoding delimiter.
+         fullname = string.replace(fullname, ':', '.')
+     if '.' in fullname:
+         langname, encoding = string.split(fullname, '.')[:2]
+         fullname = langname + '.' + encoding
+     else:
+         langname = fullname
+         encoding = ''
+ 
+     # First lookup: fullname (possibly with encoding)
+     code = locale_alias.get(fullname, None)
+     if code is not None:
+         return code
+ 
+     # Second try: langname (without encoding)
+     code = locale_alias.get(langname, None)
+     if code is not None:
+         if '.' in code:
+             langname, defenc = string.split(code, '.')
+         else:
+             langname = code
+             defenc = ''
+         if encoding:
+             encoding = encoding_alias.get(encoding, encoding)
+         else:
+             encoding = defenc
+         if encoding:
+             return langname + '.' + encoding
+         else:
+             return langname
+ 
+     else:
+         return localename
+ 
+ def _parse_localename(localename):
+ 
+     """ Parses the locale code for localename and returns the
+         result as tuple (language code, encoding).
+ 
+         The localename is normalized and passed through the locale
+         alias engine. A ValueError is raised in case the locale name
+         cannot be parsed.
+ 
+         The language code corresponds to RFC 1766.  code and encoding
+         can be None in case the values cannot be determined or are
+         unknown to this implementation.
+ 
+     """
+     code = normalize(localename)
+     if '.' in code:
+         return string.split(code, '.')[:2]
+     elif code == 'C':
+         return None, None
+     else:
+         raise ValueError,'unkown locale: %s' % localename
+     return l
+ 
+ def _build_localename(localetuple):
+ 
+     """ Builds a locale code from the given tuple (language code,
+         encoding).
+ 
+         No aliasing or normalizing takes place.
+ 
+     """
+     language, encoding = localetuple
+     if language is None:
+         language = 'C'
+     if encoding is None:
+         return language
+     else:
+         return language + '.' + encoding
+     
+ def get_default(envvars=('LANGUAGE', 'LC_ALL', 'LC_CTYPE', 'LANG')):
+ 
+     """ Tries to determine the default locale settings and returns
+         them as tuple (language code, encoding).
+ 
+         According to POSIX, a program which has not called
+         setlocale(LC_ALL,"") runs using the portable 'C' locale.
+         Calling setlocale(LC_ALL,"") lets it use the default locale as
+         defined by the LANG variable. Since we don't want to interfere
+         with the current locale setting we thus emulate the behaviour
+         in the way described above.
+ 
+         To maintain compatibility with other platforms, not only the
+         LANG variable is tested, but a list of variables given as
+         envvars parameter. The first found to be defined will be
+         used. envvars defaults to the search path used in GNU gettext;
+         it must always contain the variable name 'LANG'.
+ 
+         Except for the code 'C', the language code corresponds to RFC
+         1766.  code and encoding can be None in case the values cannot
+         be determined.
+ 
+     """
+     import os
+     lookup = os.environ.get
+     for variable in envvars:
+         localename = lookup(variable,None)
+         if localename is not None:
+             break
+     else:
+         localename = 'C'
+     return _parse_localename(localename)
+ 
+ def get_locale(category=LC_CTYPE):
+ 
+     """ Returns the current setting for the given locale category as
+         tuple (language code, encoding).
+ 
+         category may be one of the LC_* values except LC_ALL. It
+         defaults to LC_CTYPE.
+ 
+         Except for the code 'C', the language code corresponds to RFC
+         1766.  code and encoding can be None in case the values cannot
+         be determined.
+ 
+     """
+     localename = setlocale(category)
+     if category == LC_ALL and ';' in localename:
+         raise TypeError,'category LC_ALL is not supported'
+     return _parse_localename(localename)
+ 
+ def set_locale(localetuple, category=LC_ALL):
+ 
+     """ Set the locale according to the localetuple (language code,
+         encoding) as returned by get_locale() and get_default().
+ 
+         The given codes are passed through the locale aliasing engine
+         before being given to setlocale() for processing.
+ 
+         category may be given as one of the LC_* values. It defaults
+         to LC_ALL.
+ 
+     """
+     setlocale(category, normalize(_build_localename(localetuple)))
+ 
+ def set_to_default(category=LC_ALL):
+ 
+     """ Sets the locale for category to the default setting.
+ 
+         The default setting is determined by calling
+         get_default(). category defaults to LC_ALL.
+         
+     """
+     setlocale(category, _build_localename(get_default()))
+ 
+ ### Database
+ #
+ # The following data was extracted from the locale.alias file which
+ # comes with X11 and then hand edited removing the explicit encoding
+ # definitions and adding some more aliases. The file is usually
+ # available as /usr/lib/X11/locale/locale.alias.
+ #    
+ 
+ #
+ # The encoding_alias table maps lowercase encoding alias names to C
+ # locale encoding names (case-sensitive).
+ #
+ encoding_alias = {
+         '437': 				'C',
+         'c': 				'C',
+         'iso8859': 			'ISO8859-1',
+         '8859': 			'ISO8859-1',
+         '88591': 			'ISO8859-1',
+         'ascii': 			'ISO8859-1',
+         'en': 				'ISO8859-1',
+         'iso88591': 			'ISO8859-1',
+         'iso_8859-1': 			'ISO8859-1',
+         '885915': 			'ISO8859-15',
+         'iso885915': 			'ISO8859-15',
+         'iso_8859-15': 			'ISO8859-15',
+         'iso8859-2': 			'ISO8859-2',
+         'iso88592': 			'ISO8859-2',
+         'iso_8859-2': 			'ISO8859-2',
+         'iso88595': 			'ISO8859-5',
+         'iso88596': 			'ISO8859-6',
+         'iso88597': 			'ISO8859-7',
+         'iso88598': 			'ISO8859-8',
+         'iso88599': 			'ISO8859-9',
+         'iso-2022-jp': 			'JIS7',
+         'jis': 				'JIS7',
+         'jis7': 			'JIS7',
+         'sjis': 			'SJIS',
+         'tis620': 			'TACTIS',
+         'ajec': 			'eucJP',
+         'eucjp': 			'eucJP',
+         'ujis': 			'eucJP',
+         'utf-8': 			'utf',
+         'utf8': 			'utf',
+         'utf8@ucs4': 			'utf',
+ }
+ 
+ #    
+ # The locale_alias table maps lowercase alias names to C locale names
+ # (case-sensitive). Encodings are always separated from the locale
+ # name using a dot ('.'); they should only be given in case the
+ # language name is needed to interpret the given encoding alias
+ # correctly (CJK codes often have this need).
+ #
+ locale_alias = {
+         'american':                      'en_US.ISO8859-1',
+         'ar':                            'ar_AA.ISO8859-6',
+         'ar_aa':                         'ar_AA.ISO8859-6',
+         'ar_sa':                         'ar_SA.ISO8859-6',
+         'arabic':                        'ar_AA.ISO8859-6',
+         'bg':                            'bg_BG.ISO8859-5',
+         'bg_bg':                         'bg_BG.ISO8859-5',
+         'bulgarian':                     'bg_BG.ISO8859-5',
+         'c-french':                      'fr_CA.ISO8859-1',
+         'c':                             'C',
+         'c_c':                           'C',
+         'cextend':                       'en_US.ISO8859-1',
+         'chinese-s':                     'zh_CN.eucCN',
+         'chinese-t':                     'zh_TW.eucTW',
+         'croatian':                      'hr_HR.ISO8859-2',
+         'cs':                            'cs_CZ.ISO8859-2',
+         'cs_cs':                         'cs_CZ.ISO8859-2',
+         'cs_cz':                         'cs_CZ.ISO8859-2',
+         'cz':                            'cz_CZ.ISO8859-2',
+         'cz_cz':                         'cz_CZ.ISO8859-2',
+         'czech':                         'cs_CS.ISO8859-2',
+         'da':                            'da_DK.ISO8859-1',
+         'da_dk':                         'da_DK.ISO8859-1',
+         'danish':                        'da_DK.ISO8859-1',
+         'de':                            'de_DE.ISO8859-1',
+         'de_at':                         'de_AT.ISO8859-1',
+         'de_ch':                         'de_CH.ISO8859-1',
+         'de_de':                         'de_DE.ISO8859-1',
+         'dutch':                         'nl_BE.ISO8859-1',
+         'ee':                            'ee_EE.ISO8859-4',
+         'el':                            'el_GR.ISO8859-7',
+         'el_gr':                         'el_GR.ISO8859-7',
+         'en':                            'en_US.ISO8859-1',
+         'en_au':                         'en_AU.ISO8859-1',
+         'en_ca':                         'en_CA.ISO8859-1',
+         'en_gb':                         'en_GB.ISO8859-1',
+         'en_ie':                         'en_IE.ISO8859-1',
+         'en_nz':                         'en_NZ.ISO8859-1',
+         'en_uk':                         'en_GB.ISO8859-1',
+         'en_us':                         'en_US.ISO8859-1',
+         'eng_gb':                        'en_GB.ISO8859-1',
+         'english':                       'en_EN.ISO8859-1',
+         'english_uk':                    'en_GB.ISO8859-1',
+         'english_united-states':         'en_US.ISO8859-1',
+         'english_us':                    'en_US.ISO8859-1',
+         'es':                            'es_ES.ISO8859-1',
+         'es_ar':                         'es_AR.ISO8859-1',
+         'es_bo':                         'es_BO.ISO8859-1',
+         'es_cl':                         'es_CL.ISO8859-1',
+         'es_co':                         'es_CO.ISO8859-1',
+         'es_cr':                         'es_CR.ISO8859-1',
+         'es_ec':                         'es_EC.ISO8859-1',
+         'es_es':                         'es_ES.ISO8859-1',
+         'es_gt':                         'es_GT.ISO8859-1',
+         'es_mx':                         'es_MX.ISO8859-1',
+         'es_ni':                         'es_NI.ISO8859-1',
+         'es_pa':                         'es_PA.ISO8859-1',
+         'es_pe':                         'es_PE.ISO8859-1',
+         'es_py':                         'es_PY.ISO8859-1',
+         'es_sv':                         'es_SV.ISO8859-1',
+         'es_uy':                         'es_UY.ISO8859-1',
+         'es_ve':                         'es_VE.ISO8859-1',
+         'et':                            'et_EE.ISO8859-4',
+         'et_ee':                         'et_EE.ISO8859-4',
+         'fi':                            'fi_FI.ISO8859-1',
+         'fi_fi':                         'fi_FI.ISO8859-1',
+         'finnish':                       'fi_FI.ISO8859-1',
+         'fr':                            'fr_FR.ISO8859-1',
+         'fr_be':                         'fr_BE.ISO8859-1',
+         'fr_ca':                         'fr_CA.ISO8859-1',
+         'fr_ch':                         'fr_CH.ISO8859-1',
+         'fr_fr':                         'fr_FR.ISO8859-1',
+         'fre_fr':                        'fr_FR.ISO8859-1',
+         'french':                        'fr_FR.ISO8859-1',
+         'french_france':                 'fr_FR.ISO8859-1',
+         'ger_de':                        'de_DE.ISO8859-1',
+         'german':                        'de_DE.ISO8859-1',
+         'german_germany':                'de_DE.ISO8859-1',
+         'greek':                         'el_GR.ISO8859-7',
+         'hebrew':                        'iw_IL.ISO8859-8',
+         'hr':                            'hr_HR.ISO8859-2',
+         'hr_hr':                         'hr_HR.ISO8859-2',
+         'hu':                            'hu_HU.ISO8859-2',
+         'hu_hu':                         'hu_HU.ISO8859-2',
+         'hungarian':                     'hu_HU.ISO8859-2',
+         'icelandic':                     'is_IS.ISO8859-1',
+         'id':                            'id_ID.ISO8859-1',
+         'id_id':                         'id_ID.ISO8859-1',
+         'is':                            'is_IS.ISO8859-1',
+         'is_is':                         'is_IS.ISO8859-1',
+         'iso-8859-1':                    'en_US.ISO8859-1',
+         'iso-8859-15':                   'en_US.ISO8859-15',
+         'iso8859-1':                     'en_US.ISO8859-1',
+         'iso8859-15':                    'en_US.ISO8859-15',
+         'iso_8859_1':                    'en_US.ISO8859-1',
+         'iso_8859_15':                   'en_US.ISO8859-15',
+         'it':                            'it_IT.ISO8859-1',
+         'it_ch':                         'it_CH.ISO8859-1',
+         'it_it':                         'it_IT.ISO8859-1',
+         'italian':                       'it_IT.ISO8859-1',
+         'iw':                            'iw_IL.ISO8859-8',
+         'iw_il':                         'iw_IL.ISO8859-8',
+         'ja':                            'ja_JP.eucJP',
+         'ja.jis':                        'ja_JP.JIS7',
+         'ja.sjis':                       'ja_JP.SJIS',
+         'ja_jp':                         'ja_JP.eucJP',
+         'ja_jp.ajec':                    'ja_JP.eucJP',
+         'ja_jp.euc':                     'ja_JP.eucJP',
+         'ja_jp.eucjp':                   'ja_JP.eucJP',
+         'ja_jp.iso-2022-jp':             'ja_JP.JIS7',
+         'ja_jp.jis':                     'ja_JP.JIS7',
+         'ja_jp.jis7':                    'ja_JP.JIS7',
+         'ja_jp.mscode':                  'ja_JP.SJIS',
+         'ja_jp.sjis':                    'ja_JP.SJIS',
+         'ja_jp.ujis':                    'ja_JP.eucJP',
+         'japan':                         'ja_JP.eucJP',
+         'japanese':                      'ja_JP.SJIS',
+         'japanese-euc':                  'ja_JP.eucJP',
+         'japanese.euc':                  'ja_JP.eucJP',
+         'jp_jp':                         'ja_JP.eucJP',
+         'ko':                            'ko_KR.eucKR',
+         'ko_kr':                         'ko_KR.eucKR',
+         'ko_kr.euc':                     'ko_KR.eucKR',
+         'korean':                        'ko_KR.eucKR',
+         'lt':                            'lt_LT.ISO8859-4',
+         'lv':                            'lv_LV.ISO8859-4',
+         'mk':                            'mk_MK.ISO8859-5',
+         'mk_mk':                         'mk_MK.ISO8859-5',
+         'nl':                            'nl_NL.ISO8859-1',
+         'nl_be':                         'nl_BE.ISO8859-1',
+         'nl_nl':                         'nl_NL.ISO8859-1',
+         'no':                            'no_NO.ISO8859-1',
+         'no_no':                         'no_NO.ISO8859-1',
+         'norwegian':                     'no_NO.ISO8859-1',
+         'pl':                            'pl_PL.ISO8859-2',
+         'pl_pl':                         'pl_PL.ISO8859-2',
+         'polish':                        'pl_PL.ISO8859-2',
+         'portuguese':                    'pt_PT.ISO8859-1',
+         'portuguese_brazil':             'pt_BR.ISO8859-1',
+         'posix':                         'C',
+         'posix-utf2':                    'C',
+         'pt':                            'pt_PT.ISO8859-1',
+         'pt_br':                         'pt_BR.ISO8859-1',
+         'pt_pt':                         'pt_PT.ISO8859-1',
+         'ro':                            'ro_RO.ISO8859-2',
+         'ro_ro':                         'ro_RO.ISO8859-2',
+         'ru':                            'ru_RU.ISO8859-5',
+         'ru_ru':                         'ru_RU.ISO8859-5',
+         'rumanian':                      'ro_RO.ISO8859-2',
+         'russian':                       'ru_RU.ISO8859-5',
+         'serbocroatian':                 'sh_YU.ISO8859-2',
+         'sh':                            'sh_YU.ISO8859-2',
+         'sh_hr':                         'sh_HR.ISO8859-2',
+         'sh_sp':                         'sh_YU.ISO8859-2',
+         'sh_yu':                         'sh_YU.ISO8859-2',
+         'sk':                            'sk_SK.ISO8859-2',
+         'sk_sk':                         'sk_SK.ISO8859-2',
+         'sl':                            'sl_CS.ISO8859-2',
+         'sl_cs':                         'sl_CS.ISO8859-2',
+         'sl_si':                         'sl_SI.ISO8859-2',
+         'slovak':                        'sk_SK.ISO8859-2',
+         'slovene':                       'sl_CS.ISO8859-2',
+         'sp':                            'sp_YU.ISO8859-5',
+         'sp_yu':                         'sp_YU.ISO8859-5',
+         'spanish':                       'es_ES.ISO8859-1',
+         'spanish_spain':                 'es_ES.ISO8859-1',
+         'sr_sp':                         'sr_SP.ISO8859-2',
+         'sv':                            'sv_SE.ISO8859-1',
+         'sv_se':                         'sv_SE.ISO8859-1',
+         'swedish':                       'sv_SE.ISO8859-1',
+         'th_th':                         'th_TH.TACTIS',
+         'tr':                            'tr_TR.ISO8859-9',
+         'tr_tr':                         'tr_TR.ISO8859-9',
+         'turkish':                       'tr_TR.ISO8859-9',
+         'univ':                          'en_US.utf',
+         'universal':                     'en_US.utf',
+         'zh':                            'zh_CN.eucCN',
+         'zh_cn':                         'zh_CN.eucCN',
+         'zh_cn.big5':                    'zh_TW.eucTW',
+         'zh_cn.euc':                     'zh_CN.eucCN',
+         'zh_tw':                         'zh_TW.eucTW',
+         'zh_tw.euc':                     'zh_TW.eucTW',
+ }
+ 
+ def _print_locale():
+ 
+     """ Test function.
+     """
+     categories = {}
+     def _init_categories(categories=categories):
+         for k,v in globals().items():
+             if k[:3] == 'LC_':
+                 categories[k] = v
+     _init_categories()
+     del categories['LC_ALL']
+ 
+     print 'Locale defaults as determined by get_default():'
+     print '-'*72
+     lang, enc = get_default()
+     print 'Language: ', lang or '(undefined)'
+     print 'Encoding: ', enc or '(undefined)'
+     print
+ 
+     print 'Locale settings on startup:'
+     print '-'*72
+     for name,category in categories.items():
+         print name,'...'
+         lang, enc = get_locale(category)
+         print '   Language: ', lang or '(undefined)'
+         print '   Encoding: ', enc or '(undefined)'
+         print
+ 
+     set_to_default()
+     print
+     print 'Locale settings after calling set_to_default():'
+     print '-'*72
+     for name,category in categories.items():
+         print name,'...'
+         lang, enc = get_locale(category)
+         print '   Language: ', lang or '(undefined)'
+         print '   Encoding: ', enc or '(undefined)'
+         print
+     
+     try:
+         setlocale(LC_ALL,"")
+     except:
+         print 'NOTE:'
+         print 'setlocale(LC_ALL,"") does not support the default locale'
+         print 'given in the OS environment variables.'
+     else:
+         print
+         print 'Locale settings after calling setlocale(LC_ALL,""):'
+         print '-'*72
+         for name,category in categories.items():
+             print name,'...'
+             lang, enc = get_locale(category)
+             print '   Language: ', lang or '(undefined)'
+             print '   Encoding: ', enc or '(undefined)'
+             print
      
+ ###
  
  if __name__=='__main__':
!     print 'Locale aliasing:'
!     print
!     _print_locale()
!     print
!     print 'Number formatting:'
!     print
!     _test()