[Python-Dev] 'abc'.encode() and str(obj, encoding)

M.-A. Lemburg mal@lemburg.com
Wed, 05 Jul 2000 20:11:08 +0200


This is a multi-part message in MIME format.
--------------FC351AC889B8145AF245DB36
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

The idea is not new: strings and Unicode should have more
or less the same methods to enhance their compatibility.

The attached patch adds encoding capabilities to normal
strings and extends the builtin str() to accept an optional
encoding (and error) argument. It also tries to reuse the
already available Unicode codecs for the purposes of strings
(conversions are done via the default encoding in both
directions).

With it you can use the existing codecs to easily encode
or decode strings and data into string objects:

>>> 'abc'.encode('utf16')
'\377\376a\000b\000c\000'
>>> str(_, 'utf16')
'abc'

Since the codec API and registry support more than just
Unicode codecs, one could also think of things like:

'...long...data...string...'.encode('data_gzip')
or other complicated string conversions.

The str(obj[, encoding[, errors]]) builtin does the exact
opposite: it takes the string representation of obj and
then decodes it using the given encoding.

What do you think about this? Should I add it to CVS
as an experiment?

-- 
Marc-Andre Lemburg
______________________________________________________________________
Business:                                      http://www.lemburg.com/
Python Pages:                           http://www.lemburg.com/python/
--------------FC351AC889B8145AF245DB36
Content-Type: text/plain; charset=us-ascii;
 name="str.encode.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="str.encode.patch"

diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x core -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x Setup.config -x hassignal -x Makefile.pre -x configure -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x */plat* -x *.py -x ACKS -x *.txt -x README CVS-Python/Include/stringobject.h Python+Unicode/Include/stringobject.h
--- CVS-Python/Include/stringobject.h	Sat Jul  1 10:30:46 2000
+++ Python+Unicode/Include/stringobject.h	Wed Jul  5 18:59:46 2000
@@ -81,6 +81,37 @@
 #define PyString_AS_STRING(op) (((PyStringObject *)(op))->ob_sval)
 #define PyString_GET_SIZE(op)  (((PyStringObject *)(op))->ob_size)
 
+/* --- Generic Codecs ----------------------------------------------------- */
+
+/* Decode the size bytes at s with the codec named by encoding and
+   return a string object (NULL on error). */
+
+extern DL_IMPORT(PyObject*) PyString_Decode(
+    const char *s,              /* encoded data */
+    int size,                   /* number of bytes in s */
+    const char *encoding,       /* codec name; NULL = default encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Encode the size chars at s with the codec named by encoding and
+   return a Python string object (NULL on error). */
+
+extern DL_IMPORT(PyObject*) PyString_Encode(
+    const char *s,              /* string char buffer */
+    int size,                   /* number of chars to encode */
+    const char *encoding,       /* codec name; NULL = default encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Encode a string object and return the result as a Python string
+   object (NULL on error; a non-string str is a bad argument). */
+
+extern DL_IMPORT(PyObject*) PyString_AsEncodedString(
+    PyObject *str,	 	/* string object */
+    const char *encoding,	/* codec name; NULL = default encoding */
+    const char *errors		/* error handling */
+    );
+
 #ifdef __cplusplus
 }
 #endif
diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x core -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x Setup.config -x hassignal -x Makefile.pre -x configure -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x */plat* -x *.py -x ACKS -x *.txt -x README CVS-Python/Objects/stringobject.c Python+Unicode/Objects/stringobject.c
--- CVS-Python/Objects/stringobject.c	Wed Jul  5 11:42:49 2000
+++ Python+Unicode/Objects/stringobject.c	Wed Jul  5 19:47:44 2000
@@ -152,6 +152,100 @@
 	return (PyObject *) op;
 }
 
+/* Decode the size bytes at s with the named codec and return the result
+   as a string object; returns NULL if any step fails. */
+PyObject *PyString_Decode(const char *s, int size,
+			  const char *encoding, const char *errors)
+{
+    PyObject *buffer = NULL, *str;
+
+    if (encoding == NULL)
+	encoding = PyUnicode_GetDefaultEncoding();
+
+    /* Wrap s in a buffer object and decode it via the codec registry */
+    buffer = PyBuffer_FromMemory((void *)s, size);
+    if (buffer == NULL)
+        goto onError;
+    str = PyCodec_Decode(buffer, encoding, errors);
+    if (str == NULL)
+        goto onError;
+    /* Convert Unicode to a string using the default encoding */
+    if (PyUnicode_Check(str)) {
+	PyObject *temp = str;
+	str = PyUnicode_AsEncodedString(str, NULL, NULL);
+	Py_DECREF(temp);
+	if (str == NULL)
+	    goto onError;
+    }
+    if (!PyString_Check(str)) {
+        PyErr_Format(PyExc_TypeError,
+                     "decoder did not return a string object (type=%.400s)",
+                     str->ob_type->tp_name);
+        Py_DECREF(str);
+        goto onError;
+    }
+    Py_DECREF(buffer);
+    return str;
+
+ onError:
+    Py_XDECREF(buffer);
+    return NULL;
+}
+
+/* Build a temporary string object from the size chars at s and return
+   its encoded form; NULL is returned if either step fails. */
+PyObject *PyString_Encode(const char *s, int size,
+			  const char *encoding, const char *errors)
+{
+    PyObject *encoded = NULL;
+    PyObject *source = PyString_FromStringAndSize(s, size);
+
+    if (source != NULL) {
+	encoded = PyString_AsEncodedString(source, encoding, errors);
+	Py_DECREF(source);    /* the temporary is no longer needed */
+    }
+    return encoded;
+}
+
+/* Encode the string object str with the named codec (NULL selects the
+   default encoding); returns a string object, or NULL on error. */
+PyObject *PyString_AsEncodedString(PyObject *str,
+				   const char *encoding,
+				   const char *errors)
+{
+    PyObject *result;
+
+    if (!PyString_Check(str)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (encoding == NULL)
+	encoding = PyUnicode_GetDefaultEncoding();
+
+    /* Hand the object to whichever codec the registry finds */
+    result = PyCodec_Encode(str, encoding, errors);
+    if (result == NULL)
+        return NULL;
+
+    /* Unicode output is turned back into a string (default encoding) */
+    if (PyUnicode_Check(result)) {
+	PyObject *unicode = result;
+	result = PyUnicode_AsEncodedString(unicode, NULL, NULL);
+	Py_DECREF(unicode);
+	if (result == NULL)
+	    return NULL;
+    }
+    if (PyString_Check(result))
+        return result;
+
+    /* Any other result type is rejected and its reference released */
+    PyErr_Format(PyExc_TypeError,
+                 "encoder did not return a string object (type=%.400s)",
+                 result->ob_type->tp_name);
+    Py_DECREF(result);
+    return NULL;
+}
+
 static void
 string_dealloc(op)
 	PyObject *op;
@@ -1686,6 +1780,25 @@
 }
 
 
+static char encode__doc__[] =
+"S.encode([encoding[,errors]]) -> string\n\
+\n\
+Return an encoded string version of S. Default encoding is the current\n\
+default string encoding. errors may be given to set a different error\n\
+handling scheme. Default is 'strict' meaning that encoding errors raise\n\
+a ValueError. Other possible values are 'ignore' and 'replace'.";
+
+/* Method S.encode(): the optional arguments stay NULL when omitted */
+static PyObject *
+string_encode(PyStringObject *self, PyObject *args)
+{
+    char *encoding = NULL, *errors = NULL;
+    if (PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
+        return PyString_AsEncodedString((PyObject *)self, encoding, errors);
+    return NULL;
+}
+
+
 static char expandtabs__doc__[] =
 "S.expandtabs([tabsize]) -> string\n\
 \n\
@@ -2252,6 +2365,7 @@
 	{"ljust",       (PyCFunction)string_ljust,       1, ljust__doc__},
 	{"rjust",       (PyCFunction)string_rjust,       1, rjust__doc__},
 	{"center",      (PyCFunction)string_center,      1, center__doc__},
+	{"encode",      (PyCFunction)string_encode,      1, encode__doc__},
 	{"expandtabs",  (PyCFunction)string_expandtabs,  1, expandtabs__doc__},
 	{"splitlines",  (PyCFunction)string_splitlines,  1, splitlines__doc__},
 #if 0
diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x core -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x Setup.config -x hassignal -x Makefile.pre -x configure -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x */plat* -x *.py -x ACKS -x *.txt -x README CVS-Python/Python/bltinmodule.c Python+Unicode/Python/bltinmodule.c
--- CVS-Python/Python/bltinmodule.c	Tue Jul  4 10:30:50 2000
+++ Python+Unicode/Python/bltinmodule.c	Wed Jul  5 19:40:14 2000
@@ -2050,18 +2050,39 @@
 	PyObject *self;
 	PyObject *args;
 {
-	PyObject *v;
+	PyObject *v, *w;
+	const void *buffer;
+	int len;
+	char *encoding = NULL;
+	char *errors = NULL;
 
-	if (!PyArg_ParseTuple(args, "O:str", &v))
+	if ( !PyArg_ParseTuple(args, "O|ss:str", &v, &encoding, &errors) )
 		return NULL;
-	return PyObject_Str(v);
+	
+	/* Get string representation (a new reference, or NULL on error) */
+	if (PyString_Check(v))
+	    Py_INCREF(v);
+	else
+	    v = PyObject_Str(v);
+	if (v == NULL || encoding == NULL)
+	    return v;
+	
+	/* Decode the raw data; v is released on success and failure alike */
+	w = NULL;
+	if (PyObject_AsReadBuffer(v, &buffer, &len) == 0)
+	    w = PyString_Decode((const char *)buffer, len, encoding, errors);
+	Py_DECREF(v);
+	return w;
 }
 
 static char str_doc[] =
-"str(object) -> string\n\
+"str(object [, encoding[, errors]]) -> string\n\
 \n\
-Return a nice string representation of the object.\n\
-If the argument is a string, the return value is the same object.";
+Return a string representation of the object.\n\
+If the argument is a string, the return value is the same object.\n\
+If encoding is given, the string representation is decoded prior\n\
+to returning it. errors, defining the error handling for the decoding\n\
+process, defaults to 'strict'.";
 
 
 static PyObject *

--------------FC351AC889B8145AF245DB36--