[Python-Dev] Unicode: When Things Get Hairy

M.-A. Lemburg mal@lemburg.com
Sat, 11 Mar 2000 14:57:34 +0100


This is a multi-part message in MIME format.
--------------56A130F1FCAC300009B200AD
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

I couldn't resist :-) Here's the patch...

BTW, how should we proceed with future patches? Should I wrap
them together about once a week, or send them as soon as they
are done?

-- 
Marc-Andre Lemburg
______________________________________________________________________
Business:                                      http://www.lemburg.com/
Python Pages:                           http://www.lemburg.com/python/
--------------56A130F1FCAC300009B200AD
Content-Type: text/plain; charset=us-ascii;
 name="Unicode-Implementation-2000-03-11.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="Unicode-Implementation-2000-03-11.patch"

diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Include/unicodeobject.h Python+Unicode/Include/unicodeobject.h
--- CVS-Python/Include/unicodeobject.h	Fri Mar 10 23:33:05 2000
+++ Python+Unicode/Include/unicodeobject.h	Sat Mar 11 14:45:59 2000
@@ -683,6 +683,17 @@
     PyObject *args		/* Argument tuple or dictionary */
     );
 
+/* Checks whether element is contained in container and returns 1/0
+   accordingly.
+
+   element has to coerce to a one-element Unicode string. -1 is
+   returned in case of an error. */
+
+extern DL_IMPORT(int) PyUnicode_Contains(
+    PyObject *container,	/* Container string */ 
+    PyObject *element		/* Element string */
+    );
+
 /* === Characters Type APIs =============================================== */
 
 /* These should not be used directly. Use the Py_UNICODE_IS* and
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/test/test_unicode.py Python+Unicode/Lib/test/test_unicode.py
--- CVS-Python/Lib/test/test_unicode.py	Sat Mar 11 00:23:20 2000
+++ Python+Unicode/Lib/test/test_unicode.py	Sat Mar 11 14:52:29 2000
@@ -219,6 +219,19 @@
 test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
 test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
 
+# Contains:
+print 'Testing Unicode contains method...',
+assert ('a' in 'abdb') == 1
+assert ('a' in 'bdab') == 1
+assert ('a' in 'bdaba') == 1
+assert ('a' in 'bdba') == 1
+assert ('a' in u'bdba') == 1
+assert (u'a' in u'bdba') == 1
+assert (u'a' in u'bdb') == 0
+assert (u'a' in 'bdb') == 0
+assert (u'a' in 'bdba') == 1
+print 'done.'
+
 # Formatting:
 print 'Testing Unicode formatting strings...',
 assert u"%s, %s" % (u"abc", "abc") == u'abc, abc'
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Misc/unicode.txt Python+Unicode/Misc/unicode.txt
--- CVS-Python/Misc/unicode.txt	Sat Mar 11 00:14:11 2000
+++ Python+Unicode/Misc/unicode.txt	Sat Mar 11 14:53:37 2000
@@ -743,8 +743,9 @@
 stream codecs as available through the codecs module should 
 be used.
 
-XXX There should be a short-cut open(filename,mode,encoding) available which
-    also assures that mode contains the 'b' character when needed.
+The codecs module should provide a short-cut open(filename,mode,encoding)
+which also ensures that mode contains the 'b' character when
+needed.
 
 
 File/Stream Input:
@@ -810,6 +811,10 @@
 Introduction to Unicode (a little outdated by still nice to read):
         http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
 
+For comparison:
+	Introducing Unicode to ECMAScript --
+	http://www-4.ibm.com/software/developer/library/internationalization-support.html
+
 Encodings:
 
     Overview:
@@ -832,7 +837,7 @@
 
 History of this Proposal:
 -------------------------
-1.2: 
+1.2: Removed POD about codecs.open()
 1.1: Added note about comparisons and hash values. Added note about
      case mapping algorithms. Changed stream codecs .read() and
      .write() method to match the standard file-like object methods
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Objects/stringobject.c Python+Unicode/Objects/stringobject.c
--- CVS-Python/Objects/stringobject.c	Sat Mar 11 10:55:09 2000
+++ Python+Unicode/Objects/stringobject.c	Sat Mar 11 14:47:45 2000
@@ -389,7 +389,9 @@
 {
 	register char *s, *end;
 	register char c;
-	if (!PyString_Check(el) || PyString_Size(el) != 1) {
+	if (!PyString_Check(el))
+		return PyUnicode_Contains(a, el);
+	if (PyString_Size(el) != 1) {
 		PyErr_SetString(PyExc_TypeError,
 				"string member test needs char left operand");
 		return -1;
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Objects/unicodeobject.c Python+Unicode/Objects/unicodeobject.c
--- CVS-Python/Objects/unicodeobject.c	Fri Mar 10 23:53:23 2000
+++ Python+Unicode/Objects/unicodeobject.c	Sat Mar 11 14:48:52 2000
@@ -2737,6 +2737,49 @@
     return -1;
 }
 
+int PyUnicode_Contains(PyObject *container,
+		       PyObject *element)
+{
+    PyUnicodeObject *u = NULL, *v = NULL;
+    int result;
+    register const Py_UNICODE *p, *e;
+    register Py_UNICODE ch;
+
+    /* Coerce the two arguments */
+    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
+    if (u == NULL)
+	goto onError;
+    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
+    if (v == NULL)
+	goto onError;
+
+    /* Check v in u */
+    if (PyUnicode_GET_SIZE(v) != 1) {
+	PyErr_SetString(PyExc_TypeError,
+			"string member test needs char left operand");
+	goto onError;
+    }
+    ch = *PyUnicode_AS_UNICODE(v);
+    p = PyUnicode_AS_UNICODE(u);
+    e = p + PyUnicode_GET_SIZE(u);
+    result = 0;
+    while (p < e) {
+	if (*p++ == ch) {
+	    result = 1;
+	    break;
+	}
+    }
+
+    Py_DECREF(u);
+    Py_DECREF(v);
+    return result;
+
+onError:
+    Py_XDECREF(u);
+    Py_XDECREF(v);
+    return -1;
+}
+
 /* Concat to string or Unicode object giving a new Unicode object. */
 
 PyObject *PyUnicode_Concat(PyObject *left,
@@ -3817,6 +3860,7 @@
     (intintargfunc) unicode_slice, 	/* sq_slice */
     0, 					/* sq_ass_item */
     0, 					/* sq_ass_slice */
+    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
 };
 
 static int

--------------56A130F1FCAC300009B200AD--