[Patches] Unicode Patch Set 2000-04-10

M.-A. Lemburg mal@lemburg.com
Mon, 10 Apr 2000 11:51:44 +0200


This is a multi-part message in MIME format.
--------------3642FC2D466120215DDB5BCE
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

The attached patch includes the following fixes and additions:

* More test cases for test_contains.py.

* New exported API PyUnicode_Resize()

* '...%s...' % u"abc" now coerces to Unicode just like 
  string methods. Care is taken not to reevaluate already formatted
  arguments -- only the first Unicode object appearing in the
  argument mapping is looked up twice. Added test cases for
  this to test_unicode.py.

* TypeErrors raised while comparing mixed-type arguments that
  include a Unicode object are now masked (just like they are for
  all other combinations).

* The experimental Keep-Alive optimization was turned back
  on after some tweaks to the implementation. It should now
  work without causing core dumps... this has yet to be tested
  though (switching it off is easy: see the unicodeobject.c
  file for details).

* Fixed a memory leak in the Unicode freelist cleanup code.

* Added checks so that the return codes of _PyUnicode_Resize()
  and _PyString_Resize() are now processed correctly.

* Fixed a bug in the 'ignore' error handling routines
  of some builtin codecs. Added test cases for these to
  test_unicode.py.

-- 
Marc-Andre Lemburg
______________________________________________________________________
Business:                                      http://www.lemburg.com/
Python Pages:                           http://www.lemburg.com/python/
--------------3642FC2D466120215DDB5BCE
Content-Type: text/plain; charset=iso-8859-1;
 name="Unicode-Implementation-2000-04-10.patch"
Content-Transfer-Encoding: 8bit
Content-Disposition: inline;
 filename="Unicode-Implementation-2000-04-10.patch"

diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x Demo -x CVS -x Doc -x *.orig -x .#* -x *.txt -x distutils -x PC -x PCbuild -x *.py CVS-Python/Include/unicodeobject.h Python+Unicode/Include/unicodeobject.h
--- CVS-Python/Include/unicodeobject.h	Thu Apr  6 10:00:22 2000
+++ Python+Unicode/Include/unicodeobject.h	Sat Apr  8 12:56:27 2000
@@ -237,6 +237,25 @@
     PyObject *unicode	 	/* Unicode object */
     );
 
+/* Resize an already allocated Unicode object to the new size length.
+
+   *unicode is modified to point to the new (resized) object and 0
+   returned on success.
+
+   This API may only be called by the function which also called the
+   Unicode constructor. The refcount on the object must be 1. Otherwise,
+   an error is returned.
+
+   Error handling is implemented as follows: an exception is set, -1
+   is returned and *unicode left untouched.
+
+*/
+
+extern DL_IMPORT(int) PyUnicode_Resize(
+    PyObject **unicode,		/* Pointer to the Unicode object */
+    int length			/* New length */
+    );
+
 /* Coerce obj to an Unicode object and return a reference with
    *incremented* refcount.
 
diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x Demo -x CVS -x Doc -x *.orig -x .#* -x *.txt -x distutils -x PC -x PCbuild -x *.py CVS-Python/Objects/object.c Python+Unicode/Objects/object.c
--- CVS-Python/Objects/object.c	Tue Mar 28 09:19:17 2000
+++ Python+Unicode/Objects/object.c	Wed Apr  5 02:44:28 2000
@@ -347,8 +347,21 @@
 				return cmp;
 			}
 		}
-		else if (PyUnicode_Check(v) || PyUnicode_Check(w))
-			return PyUnicode_Compare(v, w);
+		else if (PyUnicode_Check(v) || PyUnicode_Check(w)) {
+			int result = PyUnicode_Compare(v, w);
+			if (result == -1 && PyErr_Occurred() && 
+			    PyErr_ExceptionMatches(PyExc_TypeError))
+				/* TypeErrors are ignored: if Unicode coercion
+				fails due to one of the arguments not
+			 	having the right type, we continue as
+				defined by the coercion protocol (see
+				above). Luckily, decoding errors are
+				reported as ValueErrors and are not masked
+				by this technique. */
+				PyErr_Clear();
+			else
+				return result;
+		}
 		else if (vtp->tp_as_number != NULL)
 			vname = "";
 		else if (wtp->tp_as_number != NULL)
diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x Demo -x CVS -x Doc -x *.orig -x .#* -x *.txt -x distutils -x PC -x PCbuild -x *.py CVS-Python/Objects/stringobject.c Python+Unicode/Objects/stringobject.c
--- CVS-Python/Objects/stringobject.c	Fri Mar 24 00:01:30 2000
+++ Python+Unicode/Objects/stringobject.c	Sat Apr  8 01:04:08 2000
@@ -389,9 +389,9 @@
 {
 	register char *s, *end;
 	register char c;
-	if (!PyString_Check(el))
+	if (PyUnicode_Check(el))
 		return PyUnicode_Contains(a, el);
-	if (PyString_Size(el) != 1) {
+	if (!PyString_Check(el) || PyString_Size(el) != 1) {
 		PyErr_SetString(PyExc_TypeError,
 				"string member test needs char left operand");
 		return -1;
@@ -2384,12 +2384,13 @@
 	char *fmt, *res;
 	int fmtcnt, rescnt, reslen, arglen, argidx;
 	int args_owned = 0;
-	PyObject *result;
+	PyObject *result, *orig_args;
 	PyObject *dict = NULL;
 	if (format == NULL || !PyString_Check(format) || args == NULL) {
 		PyErr_BadInternalCall();
 		return NULL;
 	}
+	orig_args = args;
 	fmt = PyString_AsString(format);
 	fmtcnt = PyString_Size(format);
 	reslen = rescnt = fmtcnt + 100;
@@ -2434,6 +2435,8 @@
 			int sign;
 			int len;
 			char tmpbuf[120]; /* For format{float,int,char}() */
+			char *fmt_start = fmt;
+			
 			fmt++;
 			if (*fmt == '(') {
 				char *keystart;
@@ -2584,6 +2587,10 @@
 				break;
 			case 's':
 			case 'r':
+				if (PyUnicode_Check(v)) {
+					fmt = fmt_start;
+					goto unicode;
+				}
 				if (c == 's')
 					temp = PyObject_Str(v);
 				else
@@ -2716,6 +2723,47 @@
 	}
 	_PyString_Resize(&result, reslen - rescnt);
 	return result;
+
+ unicode:
+	if (args_owned) {
+		Py_DECREF(args);
+		args_owned = 0;
+	}
+	/* Fiddle args right (remove the first argidx-1 arguments) */
+	--argidx;
+	if (PyTuple_Check(orig_args) && argidx > 0) {
+		PyObject *v;
+		int n = PyTuple_GET_SIZE(orig_args) - argidx;
+		v = PyTuple_New(n);
+		if (v == NULL)
+			goto error;
+		while (--n >= 0) {
+			PyObject *w = PyTuple_GET_ITEM(orig_args, n + argidx);
+			Py_INCREF(w);
+			PyTuple_SET_ITEM(v, n, w);
+		}
+		args = v;
+	} else {
+		Py_INCREF(orig_args);
+		args = orig_args;
+	}
+	/* Paste rest of format string to what we have of the result
+	   string; we reuse result for this */
+	rescnt = res - PyString_AS_STRING(result);
+	fmtcnt = PyString_GET_SIZE(format) - \
+		 (fmt - PyString_AS_STRING(format));
+	if (_PyString_Resize(&result, rescnt + fmtcnt)) {
+		Py_DECREF(args);
+		goto error;
+	}
+	memcpy(PyString_AS_STRING(result) + rescnt, fmt, fmtcnt);
+	format = result;
+	/* Let Unicode do its magic */
+	result = PyUnicode_Format(format, args);
+	Py_DECREF(format);
+	Py_DECREF(args);
+	return result;
+	
  error:
 	Py_DECREF(result);
 	if (args_owned) {
diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x Demo -x CVS -x Doc -x *.orig -x .#* -x *.txt -x distutils -x PC -x PCbuild -x *.py CVS-Python/Objects/unicodeobject.c Python+Unicode/Objects/unicodeobject.c
--- CVS-Python/Objects/unicodeobject.c	Mon Apr 10 11:22:21 2000
+++ Python+Unicode/Objects/unicodeobject.c	Sat Apr  8 14:23:58 2000
@@ -76,6 +76,7 @@
 #ifdef MS_WIN32
 #include <windows.h>
 #endif
+
 /* Limit for the Unicode object free list */
 
 #define MAX_UNICODE_FREELIST_SIZE       1024
@@ -87,18 +88,17 @@
    limit. This reduces malloc() overhead for small Unicode objects.  
 
    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
-   (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
+   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
    malloc()-overhead) bytes of unused garbage.
 
    Setting the limit to 0 effectively turns the feature off.
 
-   XXX The feature is currently turned off because there are
-       apparently some lingering bugs in its implementation which I
-       haven't yet been able to sort out.
+   Note: This is an experimental feature ! If you get core dumps when
+   using Unicode objects, turn this feature off.
 
 */
 
-#define STAYALIVE_SIZE_LIMIT       0
+#define KEEPALIVE_SIZE_LIMIT       9
 
 /* Endianness switches; defaults to little endian */
 
@@ -125,9 +125,9 @@
 {
     void *oldstr;
     
-    /* Shortcut if there's nothing to do. */
+    /* Shortcut if there's nothing much to do. */
     if (unicode->length == length)
-	return 0;
+	goto reset;
 
     /* Resizing unicode_empty is not allowed. */
     if (unicode == unicode_empty) {
@@ -148,6 +148,7 @@
     unicode->str[length] = 0;
     unicode->length = length;
 
+ reset:
     /* Reset the object caches */
     if (unicode->utf8str) {
         Py_DECREF(unicode->utf8str);
@@ -158,6 +159,23 @@
     return 0;
 }
 
+int PyUnicode_Resize(PyObject **unicode,
+		     int length)
+{
+    PyUnicodeObject *v;
+
+    if (unicode == NULL) {
+	PyErr_BadInternalCall();
+	return -1;
+    }
+    v = (PyUnicodeObject *)*unicode;
+    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
+	PyErr_BadInternalCall();
+	return -1;
+    }
+    return _PyUnicode_Resize(v, length);
+}
+
 /* We allocate one more byte to make sure the string is
    Ux0000 terminated -- XXX is this needed ? 
 
@@ -185,7 +203,9 @@
         unicode->ob_type = &PyUnicode_Type;
         _Py_NewReference((PyObject *)unicode);
 	if (unicode->str) {
-	    if (unicode->length < length &&
+	    /* Keep-Alive optimization: we only upsize the buffer,
+	       never downsize it. */
+	    if ((unicode->length < length) &&
 		_PyUnicode_Resize(unicode, length)) {
 		free(unicode->str);
 		PyMem_DEL(unicode);
@@ -220,19 +240,25 @@
 static
 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 {
-    Py_XDECREF(unicode->utf8str);
     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
-	if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
+        /* Keep-Alive optimization */
+	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 	    free(unicode->str);
 	    unicode->str = NULL;
 	    unicode->length = 0;
 	}
+	if (unicode->utf8str) {
+	    Py_DECREF(unicode->utf8str);
+	    unicode->utf8str = NULL;
+	}
+	/* Add to free list */
         *(PyUnicodeObject **)unicode = unicode_freelist;
         unicode_freelist = unicode;
         unicode_freelist_size++;
     }
     else {
 	free(unicode->str);
+	Py_XDECREF(unicode->utf8str);
         PyMem_DEL(unicode);
     }
 }
@@ -665,7 +691,8 @@
         }
     }
     *p = '\0';
-    _PyString_Resize(&v, p - q);
+    if (_PyString_Resize(&v, p - q))
+	goto onError;
 
  done:
     return v;
@@ -1047,7 +1074,8 @@
             break;
         }
     }
-    _PyUnicode_Resize(v, (int)(p - buf));
+    if (_PyUnicode_Resize(v, (int)(p - buf)))
+	goto onError;
     return (PyObject *)v;
     
  onError:
@@ -1119,9 +1147,14 @@
         *p++ = q[1];
 
     *p = '\0';
-    _PyString_Resize(&repr, p - q);
+    if (_PyString_Resize(&repr, p - q))
+	goto onError;
 
     return repr;
+
+ onError:
+    Py_DECREF(repr);
+    return NULL;
 }
 
 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
@@ -1209,7 +1242,8 @@
 	s += i;
 	*p++ = x;
     }
-    _PyUnicode_Resize(v, (int)(p - buf));
+    if (_PyUnicode_Resize(v, (int)(p - buf)))
+	goto onError;
     return (PyObject *)v;
     
  onError:
@@ -1247,9 +1281,14 @@
             *p++ = (char) ch;
     }
     *p = '\0';
-    _PyString_Resize(&repr, p - q);
+    if (_PyString_Resize(&repr, p - q))
+	goto onError;
 
     return repr;
+
+ onError:
+    Py_DECREF(repr);
+    return NULL;
 }
 
 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@@ -1305,6 +1344,7 @@
     }
     else if (strcmp(errors,"replace") == 0) {
 	**dest = '?';
+	(*dest)++;
 	return 0;
     }
     else {
@@ -1321,12 +1361,13 @@
 				 const char *errors)
 {
     PyObject *repr;
-    char *s;
+    char *s, *start;
     repr = PyString_FromStringAndSize(NULL, size);
     if (repr == NULL)
         return NULL;
 
     s = PyString_AS_STRING(repr);
+    start = s;
     while (size-- > 0) {
         Py_UNICODE ch = *p++;
 	if (ch >= 256) {
@@ -1337,6 +1378,10 @@
 	else
             *s++ = (char)ch;
     }
+    /* Resize if error handling skipped some characters */
+    if (s - start < PyString_GET_SIZE(repr))
+	if (_PyString_Resize(&repr, s - start))
+	    goto onError;
     return repr;
 
  onError:
@@ -1411,8 +1456,9 @@
 				      "ordinal not in range(128)"))
 		goto onError;
     }
-    if (p - PyUnicode_AS_UNICODE(v) < size)
-	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));	
+    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
+	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
+	    goto onError;
     return (PyObject *)v;
     
  onError:
@@ -1438,6 +1484,7 @@
     }
     else if (strcmp(errors,"replace") == 0) {
 	**dest = '?';
+	(*dest)++;
 	return 0;
     }
     else {
@@ -1454,12 +1501,13 @@
 				const char *errors)
 {
     PyObject *repr;
-    char *s;
+    char *s, *start;
     repr = PyString_FromStringAndSize(NULL, size);
     if (repr == NULL)
         return NULL;
 
     s = PyString_AS_STRING(repr);
+    start = s;
     while (size-- > 0) {
         Py_UNICODE ch = *p++;
 	if (ch >= 128) {
@@ -1470,6 +1518,10 @@
 	else
             *s++ = (char)ch;
     }
+    /* Resize if error handling skipped some characters */
+    if (s - start < PyString_GET_SIZE(repr))
+	if (_PyString_Resize(&repr, s - start))
+	    goto onError;
     return repr;
 
  onError:
@@ -1898,7 +1950,8 @@
 	Py_DECREF(x);
     }
     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
-	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));	
+	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
+	    goto onError;
 
  done:
     return (PyObject *)v;
@@ -1959,7 +2012,7 @@
 	    continue;
 	}
 	if (0 < ch && ch < 256) {
-	    *output++ = (char) ch;
+	    *output++ = ch;
 	    continue;
 	}
 	/* All other characters are considered invalid */
@@ -4539,7 +4592,8 @@
 	Py_DECREF(args);
     }
     Py_DECREF(uformat);
-    _PyUnicode_Resize(result, reslen - rescnt);
+    if (_PyUnicode_Resize(result, reslen - rescnt))
+	goto onError;
     return (PyObject *)result;
 
  onError:
@@ -4605,6 +4659,9 @@
     while (u != NULL) {
 	PyUnicodeObject *v = u;
 	u = *(PyUnicodeObject **)u;
+	if (v->str)
+	    free(v->str);
+	Py_XDECREF(v->utf8str);
 	free(v);
     }
     Py_XDECREF(unicode_empty);
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x Demo -x CVS -x Doc -x *.orig -x .#* -x *.txt -x distutils -x PC -x PCbuild -x *.c -x *.h -x *.in -x output CVS-Python/Lib/test/test_contains.py Python+Unicode/Lib/test/test_contains.py
--- CVS-Python/Lib/test/test_contains.py	Tue Mar  7 16:52:01 2000
+++ Python+Unicode/Lib/test/test_contains.py	Wed Apr  5 11:23:44 2000
@@ -17,7 +17,7 @@
 
 def check(ok, *args):
     if not ok:
-        raise TestFailed, join(map(str, args), " ")
+        raise TestFailed, " ".join(map(str, args))
 
 a = base_set(1)
 b = set(1)
@@ -60,5 +60,62 @@
 try:
 	None in 'abc'
 	check(0, "None in 'abc' did not raise error")
+except TypeError:
+	pass
+
+# Test char in Unicode
+
+check('c' in u'abc', "'c' not in u'abc'")
+check('d' not in u'abc', "'d' in u'abc'")
+
+try:
+	'' in u'abc'
+	check(0, "'' in u'abc' did not raise error")
+except TypeError:
+	pass
+
+try:
+	'ab' in u'abc'
+	check(0, "'ab' in u'abc' did not raise error")
+except TypeError:
+	pass
+
+try:
+	None in u'abc'
+	check(0, "None in u'abc' did not raise error")
+except TypeError:
+	pass
+
+# Test Unicode char in Unicode
+
+check(u'c' in u'abc', "u'c' not in u'abc'")
+check(u'd' not in u'abc', "u'd' in u'abc'")
+
+try:
+	u'' in u'abc'
+	check(0, "u'' in u'abc' did not raise error")
+except TypeError:
+	pass
+
+try:
+	u'ab' in u'abc'
+	check(0, "u'ab' in u'abc' did not raise error")
+except TypeError:
+	pass
+
+# Test Unicode char in string
+
+check(u'c' in 'abc', "u'c' not in 'abc'")
+check(u'd' not in 'abc', "u'd' in 'abc'")
+
+try:
+	u'' in 'abc'
+	check(0, "u'' in 'abc' did not raise error")
+except TypeError:
+	pass
+
+try:
+	u'ab' in 'abc'
+	check(0, "u'ab' in 'abc' did not raise error")
 except TypeError:
 	pass
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x Demo -x CVS -x Doc -x *.orig -x .#* -x *.txt -x distutils -x PC -x PCbuild -x *.c -x *.h -x *.in -x output CVS-Python/Lib/test/test_unicode.py Python+Unicode/Lib/test/test_unicode.py
--- CVS-Python/Lib/test/test_unicode.py	Thu Apr  6 10:00:25 2000
+++ Python+Unicode/Lib/test/test_unicode.py	Sat Apr  8 00:52:42 2000
@@ -255,6 +255,15 @@
 assert u"%r, %r" % (u"abc", "abc") == u"u'abc', 'abc'"
 assert u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def'
 assert u"%(x)s, %(ä)s" % {'x':u"abc", u'ä'.encode('utf-8'):"def"} == u'abc, def'
+# formatting jobs delegated from the string implementation:
+assert '...%(foo)s...' % {'foo':u"abc"} == u'...abc...'
+assert '...%(foo)s...' % {'foo':"abc"} == '...abc...'
+assert '...%(foo)s...' % {u'foo':"abc"} == '...abc...'
+assert '...%(foo)s...' % {u'foo':u"abc"} == u'...abc...'
+assert '...%(foo)s...' % {u'foo':u"abc",'def':123} ==  u'...abc...'
+assert '...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...'
+assert '...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...'
+assert '...%s...' % u"abc" == u'...abc...'
 print 'done.'
 
 # Test builtin codecs
@@ -264,6 +273,26 @@
 assert unicode('hello','utf-8') == u'hello'
 assert unicode('hello','utf8') == u'hello'
 assert unicode('hello','latin-1') == u'hello'
+
+try:
+    u'Andr\202 x'.encode('ascii')
+    u'Andr\202 x'.encode('ascii','strict')
+except ValueError:
+    pass
+else:
+    raise AssertionError, "u'Andr\202'.encode('ascii') failed to raise an exception"
+assert u'Andr\202 x'.encode('ascii','ignore') == "Andr x"
+assert u'Andr\202 x'.encode('ascii','replace') == "Andr? x"
+
+try:
+    unicode('Andr\202 x','ascii')
+    unicode('Andr\202 x','ascii','strict')
+except ValueError:
+    pass
+else:
+    raise AssertionError, "unicode('Andr\202') failed to raise an exception"
+assert unicode('Andr\202 x','ascii','ignore') == u"Andr x"
+assert unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x'
 
 assert u'hello'.encode('ascii') == 'hello'
 assert u'hello'.encode('utf-8') == 'hello'

--------------3642FC2D466120215DDB5BCE--