[Python-3000-checkins] r56755 - in python/branches/py3k-struni: Doc/api/concrete.tex Objects/bytesobject.c Objects/unicodeobject.c

Sun Aug 5 22:26:12 CEST 2007

Author: martin.v.loewis
Date: Sun Aug  5 22:26:11 2007
New Revision: 56755

Modified:
   python/branches/py3k-struni/Doc/api/concrete.tex
   python/branches/py3k-struni/Objects/bytesobject.c
   python/branches/py3k-struni/Objects/unicodeobject.c
Log:
Change PyUnicode_FromString[AndSize] to expect UTF-8.


Modified: python/branches/py3k-struni/Doc/api/concrete.tex
==============================================================================

--- python/branches/py3k-struni/Doc/api/concrete.tex	(original)
+++ python/branches/py3k-struni/Doc/api/concrete.tex	Sun Aug  5 22:26:11 2007
@@ -996,10 +996,11 @@
   \var{u} is \NULL{}.
 \end{cfuncdesc}
 
-\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromStringAndSize}{const char *u,
+                                                       Py_ssize_t size}
   Create a Unicode Object from the char buffer \var{u}.
-  \var{u} must be 0-terminated, the bytes will be interpreted as
-  being latin-1 encoded. \var{u} may also be \NULL{} which causes the
+  The bytes will be interpreted as being UTF-8 encoded. 
+  \var{u} may also be \NULL{} which causes the
   contents to be undefined. It is the user's responsibility to fill
   in the needed data.  The buffer is copied into the new object.
   If the buffer is not \NULL{}, the return value might be a shared object.
@@ -1008,6 +1009,12 @@
   \versionadded{3.0}
 \end{cfuncdesc}
 
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
+   Create a Unicode object from a UTF-8 encoded null-terminated
+   char buffer \var{u}.
+   \versionadded{3.0}
+\end{cfuncdesc}
+
 \begin{cfuncdesc}{PyObject*}{PyUnicode_FromFormat}{const char *format, ...}
   Take a C \cfunction{printf()}-style \var{format} string and a
   variable number of arguments, calculate the size of the resulting

Modified: python/branches/py3k-struni/Objects/bytesobject.c
==============================================================================
--- python/branches/py3k-struni/Objects/bytesobject.c	(original)
+++ python/branches/py3k-struni/Objects/bytesobject.c	Sun Aug  5 22:26:11 2007
@@ -2724,11 +2724,13 @@
 static PyObject *
 bytes_reduce(PyBytesObject *self)
 {
-    return Py_BuildValue("(O(s#s))",
-                         Py_Type(self),
-                         self->ob_bytes == NULL ? "" : self->ob_bytes,
-                         Py_Size(self),
-                         "latin-1");
+    PyObject *latin1;
+    if (self->ob_bytes)
+	latin1 = PyUnicode_DecodeLatin1(self->ob_bytes, 
+					Py_Size(self), NULL);
+    else
+	latin1 = PyUnicode_FromString("");
+    return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
 }
 
 static PySequenceMethods bytes_as_sequence = {

Modified: python/branches/py3k-struni/Objects/unicodeobject.c
==============================================================================
--- python/branches/py3k-struni/Objects/unicodeobject.c	(original)
+++ python/branches/py3k-struni/Objects/unicodeobject.c	Sun Aug  5 22:26:11 2007
@@ -427,7 +427,9 @@
 {
     PyUnicodeObject *unicode;
     /* If the Unicode data is known at construction time, we can apply
-       some optimizations which share commonly used objects. */
+       some optimizations which share commonly used objects.
+       Also, this means the input must be UTF-8, so fall back to the
+       UTF-8 decoder at the end. */
     if (u != NULL) {
 
 	/* Optimization for empty strings */
@@ -436,8 +438,9 @@
 	    return (PyObject *)unicode_empty;
 	}
 
-	/* Single characters are shared when using this constructor */
-	if (size == 1) {
+	/* Single characters are shared when using this constructor.
+           Restrict to ASCII, since the input must be UTF-8. */
+	if (size == 1 && Py_CHARMASK(*u) < 128) {
 	    unicode = unicode_latin1[Py_CHARMASK(*u)];
 	    if (!unicode) {
 		unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@
 	    Py_INCREF(unicode);
 	    return (PyObject *)unicode;
 	}
+
+        return PyUnicode_DecodeUTF8(u, size, NULL);
     }
 
     unicode = _PyUnicode_New(size);
     if (!unicode)
         return NULL;
 
-    /* Copy the Unicode data into the new object */
-    if (u != NULL) {
-        Py_UNICODE *p = unicode->str;
-        while (size--)
-            *p++ = Py_CHARMASK(*u++);
-        /* Don't need to write trailing 0 because
-           that's already done by _PyUnicode_New */
-    }
-
     return (PyObject *)unicode;
 }