[Python-Dev] Unicode Update 2000-03-17

M.-A. Lemburg mal@lemburg.com
Fri, 17 Mar 2000 19:53:39 +0100


This is a multi-part message in MIME format.
--------------A764B515049AA0B5F7643A5B
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Attached you find an update of the Unicode implementation.

The patch is against the current CVS version. I would appreciate
if someone with CVS checkin permissions could check the changes
in.

The patch contains all bugs and patches sent this week and
also fixes a leak in the codecs code and a bug in the free list
code for Unicode objects (which only shows up when compiling
Python with Py_DEBUG; thanks to MarkH for spotting this one).

-- 
Marc-Andre Lemburg
______________________________________________________________________
Business:                                      http://www.lemburg.com/
Python Pages:                           http://www.lemburg.com/python/
--------------A764B515049AA0B5F7643A5B
Content-Type: text/plain; charset=us-ascii;
 name="Unicode-Implementation-2000-03-17.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="Unicode-Implementation-2000-03-17.patch"

Only in CVS-Python/Doc/tools: anno-api.py
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Include/unicodeobject.h Python+Unicode/Include/unicodeobject.h
--- CVS-Python/Include/unicodeobject.h	Fri Mar 17 15:24:30 2000
+++ Python+Unicode/Include/unicodeobject.h	Tue Mar 14 10:38:08 2000
@@ -1,8 +1,5 @@
 #ifndef Py_UNICODEOBJECT_H
 #define Py_UNICODEOBJECT_H
-#ifdef __cplusplus
-extern "C" {
-#endif
 
 /*
 
@@ -109,8 +106,9 @@
 /* --- Internal Unicode Operations ---------------------------------------- */
 
 /* If you want Python to use the compiler's wctype.h functions instead
-   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS.
-   This reduces the interpreter's code size. */
+   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
+   configure Python using --with-ctype-functions.  This reduces the
+   interpreter's code size. */
 
 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 
@@ -169,6 +167,10 @@
     (!memcmp((string)->str + (offset), (substring)->str,\
              (substring)->length*sizeof(Py_UNICODE)))
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* --- Unicode Type ------------------------------------------------------- */
 
 typedef struct {
@@ -647,7 +649,7 @@
     int direction		/* Find direction: +1 forward, -1 backward */
     );
 
-/* Count the number of occurances of substr in str[start:end]. */
+/* Count the number of occurrences of substr in str[start:end]. */
 
 extern DL_IMPORT(int) PyUnicode_Count(
     PyObject *str,		/* String */ 
@@ -656,7 +658,7 @@
     int end			/* Stop index */
     );
 
-/* Replace at most maxcount occurances of substr in str with replstr
+/* Replace at most maxcount occurrences of substr in str with replstr
    and return the resulting Unicode object. */
 
 extern DL_IMPORT(PyObject *) PyUnicode_Replace(
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/codecs.py Python+Unicode/Lib/codecs.py
--- CVS-Python/Lib/codecs.py	Sat Mar 11 00:20:43 2000
+++ Python+Unicode/Lib/codecs.py	Mon Mar 13 14:33:54 2000
@@ -55,7 +55,7 @@
     """
     def encode(self,input,errors='strict'):
         
-        """ Encodes the object intput and returns a tuple (output
+        """ Encodes the object input and returns a tuple (output
             object, length consumed).
 
             errors defines the error handling to apply. It defaults to
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/encodings/__init__.py Python+Unicode/Lib/encodings/__init__.py
--- CVS-Python/Lib/encodings/__init__.py	Sat Mar 11 00:17:18 2000
+++ Python+Unicode/Lib/encodings/__init__.py	Mon Mar 13 14:30:33 2000
@@ -30,13 +30,13 @@
 import string,codecs,aliases
 
 _cache = {}
-_unkown = '--unkown--'
+_unknown = '--unknown--'
 
 def search_function(encoding):
     
     # Cache lookup
-    entry = _cache.get(encoding,_unkown)
-    if entry is not _unkown:
+    entry = _cache.get(encoding,_unknown)
+    if entry is not _unknown:
         return entry
 
     # Import the module
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/test/test_string.py Python+Unicode/Lib/test/test_string.py
--- CVS-Python/Lib/test/test_string.py	Sat Mar 11 10:52:43 2000
+++ Python+Unicode/Lib/test/test_string.py	Mon Mar 13 10:12:46 2000
@@ -143,6 +143,7 @@
 test('translate', 'xyz', 'xyz', table)
 
 test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1)
+test('replace', 'one!two!three!', 'onetwothree', '!', '')
 test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2)
 test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3)
 test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4)
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/test/test_unicode.py Python+Unicode/Lib/test/test_unicode.py
--- CVS-Python/Lib/test/test_unicode.py	Fri Mar 17 15:24:31 2000
+++ Python+Unicode/Lib/test/test_unicode.py	Mon Mar 13 10:13:05 2000
@@ -108,6 +108,7 @@
     test('translate', u'xyz', u'xyz', table)
 
 test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
+test('replace', u'one!two!three!', u'onetwothree', '!', '')
 test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
 test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
 test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Misc/unicode.txt Python+Unicode/Misc/unicode.txt
--- CVS-Python/Misc/unicode.txt	Sat Mar 11 00:14:11 2000
+++ Python+Unicode/Misc/unicode.txt	Fri Mar 17 16:55:11 2000
@@ -743,8 +743,9 @@
 stream codecs as available through the codecs module should 
 be used.
 
-XXX There should be a short-cut open(filename,mode,encoding) available which
-    also assures that mode contains the 'b' character when needed.
+The codecs module should provide a short-cut open(filename,mode,encoding)
+available which also assures that mode contains the 'b' character when
+needed.
 
 
 File/Stream Input:
@@ -810,6 +811,10 @@
 Introduction to Unicode (a little outdated by still nice to read):
         http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
 
+For comparison:
+	Introducing Unicode to ECMAScript --
+	http://www-4.ibm.com/software/developer/library/internationalization-support.html
+
 Encodings:
 
     Overview:
@@ -832,7 +837,7 @@
 
 History of this Proposal:
 -------------------------
-1.2: 
+1.2: Removed POD about codecs.open()
 1.1: Added note about comparisons and hash values. Added note about
      case mapping algorithms. Changed stream codecs .read() and
      .write() method to match the standard file-like object methods
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Modules/stropmodule.c Python+Unicode/Modules/stropmodule.c
--- CVS-Python/Modules/stropmodule.c	Wed Mar  1 10:22:53 2000
+++ Python+Unicode/Modules/stropmodule.c	Mon Mar 13 14:33:23 2000
@@ -1054,7 +1054,7 @@
 
   strstr replacement for arbitrary blocks of memory.
 
-  Locates the first occurance in the memory pointed to by MEM of the
+  Locates the first occurrence in the memory pointed to by MEM of the
   contents of memory pointed to by PAT.  Returns the index into MEM if
   found, or -1 if not found.  If len of PAT is greater than length of
   MEM, the function returns -1.
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Objects/stringobject.c Python+Unicode/Objects/stringobject.c
--- CVS-Python/Objects/stringobject.c	Tue Mar 14 00:14:17 2000
+++ Python+Unicode/Objects/stringobject.c	Mon Mar 13 14:33:24 2000
@@ -1395,7 +1395,7 @@
 
   strstr replacement for arbitrary blocks of memory.
 
-  Locates the first occurance in the memory pointed to by MEM of the
+  Locates the first occurrence in the memory pointed to by MEM of the
   contents of memory pointed to by PAT.  Returns the index into MEM if
   found, or -1 if not found.  If len of PAT is greater than length of
   MEM, the function returns -1.
@@ -1578,7 +1578,7 @@
 		return NULL;
 
 	if (sub_len <= 0) {
-		PyErr_SetString(PyExc_ValueError, "empty replacement string");
+		PyErr_SetString(PyExc_ValueError, "empty pattern string");
 		return NULL;
 	}
 	new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);
Only in CVS-Python/Objects: stringobject.c.orig
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Objects/unicodeobject.c Python+Unicode/Objects/unicodeobject.c
--- CVS-Python/Objects/unicodeobject.c	Tue Mar 14 00:14:17 2000
+++ Python+Unicode/Objects/unicodeobject.c	Wed Mar 15 10:49:19 2000
@@ -83,7 +83,7 @@
    all objects on the free list having a size less than this
    limit. This reduces malloc() overhead for small Unicode objects.  
 
-   At worse this will result in MAX_UNICODE_FREELIST_SIZE *
+   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
    (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
    malloc()-overhead) bytes of unused garbage.
 
@@ -180,7 +180,7 @@
         unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
         unicode_freelist_size--;
         unicode->ob_type = &PyUnicode_Type;
-        _Py_NewReference(unicode);
+        _Py_NewReference((PyObject *)unicode);
 	if (unicode->str) {
 	    if (unicode->length < length &&
 		_PyUnicode_Resize(unicode, length)) {
@@ -199,16 +199,19 @@
 	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
     }
 
-    if (!unicode->str) {
-        PyMem_DEL(unicode);
-        PyErr_NoMemory();
-        return NULL;
-    }
+    if (!unicode->str) 
+	goto onError;
     unicode->str[length] = 0;
     unicode->length = length;
     unicode->hash = -1;
     unicode->utf8str = NULL;
     return unicode;
+
+ onError:
+    _Py_ForgetReference((PyObject *)unicode);
+    PyMem_DEL(unicode);
+    PyErr_NoMemory();
+    return NULL;
 }
 
 static
@@ -224,7 +227,6 @@
         *(PyUnicodeObject **)unicode = unicode_freelist;
         unicode_freelist = unicode;
         unicode_freelist_size++;
-        _Py_ForgetReference(unicode);
     }
     else {
 	free(unicode->str);
@@ -489,7 +491,7 @@
     }
     else {
         PyErr_Format(PyExc_ValueError,
-                     "UTF-8 decoding error; unkown error handling code: %s",
+                     "UTF-8 decoding error; unknown error handling code: %s",
                      errors);
         return -1;
     }
@@ -611,7 +613,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "UTF-8 encoding error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
@@ -733,7 +735,7 @@
     }
     else {
         PyErr_Format(PyExc_ValueError,
-                     "UTF-16 decoding error; unkown error handling code: %s",
+                     "UTF-16 decoding error; unknown error handling code: %s",
                      errors);
         return -1;
     }
@@ -921,7 +923,7 @@
     else {
         PyErr_Format(PyExc_ValueError,
                      "Unicode-Escape decoding error; "
-                     "unkown error handling code: %s",
+                     "unknown error handling code: %s",
                      errors);
         return -1;
     }
@@ -1051,6 +1053,10 @@
 
 */
 
+static const Py_UNICODE *findchar(const Py_UNICODE *s,
+				  int size,
+				  Py_UNICODE ch);
+
 static
 PyObject *unicodeescape_string(const Py_UNICODE *s,
                                int size,
@@ -1069,9 +1075,6 @@
     p = q = PyString_AS_STRING(repr);
 
     if (quotes) {
-        static const Py_UNICODE *findchar(const Py_UNICODE *s,
-					  int size,
-					  Py_UNICODE ch);
         *p++ = 'u';
         *p++ = (findchar(s, size, '\'') && 
                 !findchar(s, size, '"')) ? '"' : '\'';
@@ -1298,7 +1301,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "Latin-1 encoding error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
@@ -1369,7 +1372,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "ASCII decoding error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
@@ -1431,7 +1434,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "ASCII encoding error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
@@ -1502,7 +1505,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "charmap decoding error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
@@ -1618,7 +1621,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "charmap encoding error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
@@ -1750,7 +1753,7 @@
     else {
 	PyErr_Format(PyExc_ValueError,
 		     "translate error; "
-		     "unkown error handling code: %s",
+		     "unknown error handling code: %s",
 		     errors);
 	return -1;
     }
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Python/codecs.c Python+Unicode/Python/codecs.c
--- CVS-Python/Python/codecs.c	Fri Mar 10 23:57:27 2000
+++ Python+Unicode/Python/codecs.c	Wed Mar 15 11:27:54 2000
@@ -93,9 +93,14 @@
 
 PyObject *_PyCodec_Lookup(const char *encoding)
 {
-    PyObject *result, *args = NULL, *v;
+    PyObject *result, *args = NULL, *v = NULL;
     int i, len;
 
+    if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
+	PyErr_SetString(PyExc_SystemError,
+			"codec module not properly initialized");
+	goto onError;
+    }
     if (!import_encodings_called)
 	import_encodings();
 
@@ -109,6 +114,7 @@
     result = PyDict_GetItem(_PyCodec_SearchCache, v);
     if (result != NULL) {
 	Py_INCREF(result);
+	Py_DECREF(v);
 	return result;
     }
     
@@ -121,6 +127,7 @@
     if (args == NULL)
 	goto onError;
     PyTuple_SET_ITEM(args,0,v);
+    v = NULL;
 
     for (i = 0; i < len; i++) {
 	PyObject *func;
@@ -146,7 +153,7 @@
     if (i == len) {
 	/* XXX Perhaps we should cache misses too ? */
 	PyErr_SetString(PyExc_LookupError,
-			"unkown encoding");
+			"unknown encoding");
 	goto onError;
     }
 
@@ -156,6 +163,7 @@
     return result;
 
  onError:
+    Py_XDECREF(v);
     Py_XDECREF(args);
     return NULL;
 }
@@ -378,5 +386,7 @@
 void _PyCodecRegistry_Fini()
 {
     Py_XDECREF(_PyCodec_SearchPath);
+    _PyCodec_SearchPath = NULL;
     Py_XDECREF(_PyCodec_SearchCache);
+    _PyCodec_SearchCache = NULL;
 }

--------------A764B515049AA0B5F7643A5B--