[issue8898] The email package should defer to the codecs module for all aliases

Sun May 22 14:27:10 CEST 2011

Michele Orrù <maker.py at gmail.com> added the comment:

unittest.skip* are decorators, so useless in this case; also, AFAIS
Lib/test/ uses sys.platform.

I would suggest putting a try statement in encodings.mbcs, and raising an
error in case the modules it imports are not found.
But this is another story.

----------
title: The email package should defer to the codecs module for	all aliases -> The email package should defer to the codecs module for all aliases
Added file: http://bugs.python.org/file22065/issue8898_skip.patch

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue8898>
_______________________________________
-------------- next part --------------
diff -r cc60d0283fad Lib/email/charset.py

--- a/Lib/email/charset.py	Fri May 20 16:55:06 2011 +0200
+++ b/Lib/email/charset.py	Sun May 22 14:18:05 2011 +0200
@@ -10,6 +10,7 @@
     ]
 
 from functools import partial
+from codecs import lookup
 
 import email.base64mime
 import email.quoprimime
@@ -63,36 +64,6 @@
     'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
     }
 
-# Aliases for other commonly-used names for character sets.  Map
-# them to the real ones used in email.
-ALIASES = {
-    'latin_1': 'iso-8859-1',
-    'latin-1': 'iso-8859-1',
-    'latin_2': 'iso-8859-2',
-    'latin-2': 'iso-8859-2',
-    'latin_3': 'iso-8859-3',
-    'latin-3': 'iso-8859-3',
-    'latin_4': 'iso-8859-4',
-    'latin-4': 'iso-8859-4',
-    'latin_5': 'iso-8859-9',
-    'latin-5': 'iso-8859-9',
-    'latin_6': 'iso-8859-10',
-    'latin-6': 'iso-8859-10',
-    'latin_7': 'iso-8859-13',
-    'latin-7': 'iso-8859-13',
-    'latin_8': 'iso-8859-14',
-    'latin-8': 'iso-8859-14',
-    'latin_9': 'iso-8859-15',
-    'latin-9': 'iso-8859-15',
-    'latin_10':'iso-8859-16',
-    'latin-10':'iso-8859-16',
-    'cp949':   'ks_c_5601-1987',
-    'euc_jp':  'euc-jp',
-    'euc_kr':  'euc-kr',
-    'ascii':   'us-ascii',
-    }
-
-
 # Map charsets to their Unicode codec strings.
 CODEC_MAP = {
     'gb2312':      'eucgb2312_cn',
@@ -103,6 +74,8 @@
     'us-ascii':    None,
     }
 
+# Aliases defined by the user
+ALIASES = dict()
 
 
 # Convenience functions for extending the above mappings
@@ -220,9 +193,12 @@
                 input_charset = str(input_charset, 'ascii')
         except UnicodeError:
             raise errors.CharsetError(input_charset)
-        input_charset = input_charset.lower()
-        # Set the input charset after filtering through the aliases
-        self.input_charset = ALIASES.get(input_charset, input_charset)
+        # Set the input charset after filtering through its aliases defined in
+        # codecs library
+        try:
+            self.input_charset = lookup(input_charset).name
+        except LookupError:
+            self.input_charset = ALIASES.get(input_charset, input_charset)
         # We can try to guess which encoding and conversion to use by the
         # charset_map dictionary.  Try that first, but let the user override
         # it.
diff -r cc60d0283fad Lib/encodings/aliases.py
--- a/Lib/encodings/aliases.py	Fri May 20 16:55:06 2011 +0200
+++ b/Lib/encodings/aliases.py	Sun May 22 14:18:05 2011 +0200
@@ -254,7 +254,7 @@
     # hp_roman8 codec
     'roman8'             : 'hp_roman8',
     'r8'                 : 'hp_roman8',
-    'csHPRoman8'         : 'hp_roman8',
+    'cshproman8'         : 'hp_roman8',
 
     # hz codec
     'hzgb'               : 'hz',
@@ -298,6 +298,7 @@
     'iso_ir_157'         : 'iso8859_10',
     'l6'                 : 'iso8859_10',
     'latin6'             : 'iso8859_10',
+    'latin_6'            : 'iso8859_10',
 
     # iso8859_11 codec
     'thai'               : 'iso8859_11',
@@ -308,6 +309,7 @@
     'iso_8859_13'        : 'iso8859_13',
     'l7'                 : 'iso8859_13',
     'latin7'             : 'iso8859_13',
+    'latin_7'            : 'iso8859_13',
 
     # iso8859_14 codec
     'iso_8859_14'        : 'iso8859_14',
@@ -316,11 +318,13 @@
     'iso_ir_199'         : 'iso8859_14',
     'l8'                 : 'iso8859_14',
     'latin8'             : 'iso8859_14',
+    'latin_8'            : 'iso8859_14',
 
     # iso8859_15 codec
     'iso_8859_15'        : 'iso8859_15',
     'l9'                 : 'iso8859_15',
     'latin9'             : 'iso8859_15',
+    'latin_9'            : 'iso8859_15',
 
     # iso8859_16 codec
     'iso_8859_16'        : 'iso8859_16',
@@ -328,6 +332,7 @@
     'iso_ir_226'         : 'iso8859_16',
     'l10'                : 'iso8859_16',
     'latin10'            : 'iso8859_16',
+    'latin_10'           : 'iso8859_16',
 
     # iso8859_2 codec
     'csisolatin2'        : 'iso8859_2',
@@ -336,6 +341,7 @@
     'iso_ir_101'         : 'iso8859_2',
     'l2'                 : 'iso8859_2',
     'latin2'             : 'iso8859_2',
+    'latin_2'            : 'iso8859_2',
 
     # iso8859_3 codec
     'csisolatin3'        : 'iso8859_3',
@@ -344,6 +350,7 @@
     'iso_ir_109'         : 'iso8859_3',
     'l3'                 : 'iso8859_3',
     'latin3'             : 'iso8859_3',
+    'latin_3'            : 'iso8859_3',
 
     # iso8859_4 codec
     'csisolatin4'        : 'iso8859_4',
@@ -352,6 +359,7 @@
     'iso_ir_110'         : 'iso8859_4',
     'l4'                 : 'iso8859_4',
     'latin4'             : 'iso8859_4',
+    'latin_4'            : 'iso8859_4',
 
     # iso8859_5 codec
     'csisolatincyrillic' : 'iso8859_5',
@@ -393,6 +401,7 @@
     'iso_ir_148'         : 'iso8859_9',
     'l5'                 : 'iso8859_9',
     'latin5'             : 'iso8859_9',
+    'latin_5'            : 'iso8859_9',
 
     # johab codec
     'cp1361'             : 'johab',
@@ -474,9 +483,6 @@
     'sjisx0213'          : 'shift_jisx0213',
     's_jisx0213'         : 'shift_jisx0213',
 
-    # tactis codec
-    'tis260'             : 'tactis',
-
     # tis_620 codec
     'tis620'             : 'tis_620',
     'tis_620_0'          : 'tis_620',
diff -r cc60d0283fad Lib/test/test_codeccallbacks.py
--- a/Lib/test/test_codeccallbacks.py	Fri May 20 16:55:06 2011 +0200
+++ b/Lib/test/test_codeccallbacks.py	Sun May 22 14:18:05 2011 +0200
@@ -1,5 +1,13 @@
-import test.support, unittest
-import sys, codecs, html.entities, unicodedata
+import test.support
+import unittest
+
+from  encodings.aliases import aliases
+import codecs
+import unicodedata
+import html.entities
+import importlib
+import sys
+
 
 class PosReturn:
     # this can be used for configurable callbacks
@@ -629,7 +637,16 @@
                     "test.badhandler"
                 )
 
-    def test_lookup(self):
+    def test_lookup_aliases(self):
+        for alias, module_name in aliases.items():
+            if sys.platform != 'win32' and module_name == 'mbcs':
+                continue
+
+            module = importlib.import_module('encodings.' + module_name)
+            codec_name = module.getregentry().name
+            self.assertEqual(codecs.lookup(alias).name, codec_name)
+
+    def test_lookup_error(self):
         self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
         self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
         self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
@@ -664,7 +681,7 @@
         self.assertRaises(TypeError, codecs.register_error, 42)
         self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
 
-    def test_badlookupcall(self):
+    def test_badlookup_errorcall(self):
         # enhance coverage of:
         # Modules/_codecsmodule.c::lookup_error()
         self.assertRaises(TypeError, codecs.lookup_error)