[Python-3000-checkins] r58093 - in python/branches/py3k: Lib/test/test_bytes.py Objects/bytesobject.c

guido.van.rossum python-3000-checkins at python.org
Mon Sep 10 18:53:45 CEST 2007


Author: guido.van.rossum
Date: Mon Sep 10 18:53:45 2007
New Revision: 58093

Modified:
   python/branches/py3k/Lib/test/test_bytes.py
   python/branches/py3k/Objects/bytesobject.c
Log:
Bug #1125 (my code).
Support bytes.split() and bytes.strip() with the argument omitted (or None):
they then split/strip on ASCII whitespace (tab, space, CR, LF, FF, VT), like
their str counterparts.  The same applies to rsplit(), lstrip() and rstrip().
Also change all of these functions to accept arbitrary buffer-API-supporting
arguments.
With unit tests.


Modified: python/branches/py3k/Lib/test/test_bytes.py
==============================================================================
--- python/branches/py3k/Lib/test/test_bytes.py	(original)
+++ python/branches/py3k/Lib/test/test_bytes.py	Mon Sep 10 18:53:45 2007
@@ -617,16 +617,46 @@
         self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
         self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
         self.assertEqual(b.split(b'w'), [b])
-        # require an arg (no magic whitespace split)
-        self.assertRaises(TypeError, b.split)
+
+    def test_split_whitespace(self):
+        for b in (b'  arf  barf  ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
+                  b'arf\fbarf', b'arf\vbarf'):
+            self.assertEqual(b.split(), [b'arf', b'barf'])
+            self.assertEqual(b.split(None), [b'arf', b'barf'])
+            self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
+        self.assertEqual(b'  a  bb  c  '.split(None, 0), [b'a  bb  c  '])
+        self.assertEqual(b'  a  bb  c  '.split(None, 1), [b'a', b'bb  c  '])
+        self.assertEqual(b'  a  bb  c  '.split(None, 2), [b'a', b'bb', b'c  '])
+        self.assertEqual(b'  a  bb  c  '.split(None, 3), [b'a', b'bb', b'c'])
+
+    def test_split_buffer(self):
+        self.assertEqual(b'a b'.split(buffer(b' ')), [b'a', b'b'])
+
+    def test_split_string_error(self):
+        self.assertRaises(TypeError, b'a b'.split, ' ')
 
     def test_rsplit(self):
         b = b'mississippi'
         self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
         self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
         self.assertEqual(b.rsplit(b'w'), [b])
-        # require an arg (no magic whitespace split)
-        self.assertRaises(TypeError, b.rsplit)
+
+    def test_rsplit_whitespace(self):
+        for b in (b'  arf  barf  ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
+                  b'arf\fbarf', b'arf\vbarf'):
+            self.assertEqual(b.rsplit(), [b'arf', b'barf'])
+            self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
+            self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None, 0), [b'  a  bb  c'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None, 1), [b'  a  bb', b'c'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None,2), [b'  a', b'bb', b'c'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None, 3), [b'a', b'bb', b'c'])
+
+    def test_rplit_buffer(self):
+        self.assertEqual(b'a b'.rsplit(buffer(b' ')), [b'a', b'b'])
+
+    def test_rplit_string_error(self):
+        self.assertRaises(TypeError, b'a b'.rsplit, ' ')
 
     def test_partition(self):
         b = b'mississippi'
@@ -670,6 +700,22 @@
         self.assertEqual(b.rstrip(b'im'), b'mississipp')
         self.assertEqual(b.rstrip(b'pim'), b'mississ')
 
+    def test_strip_whitespace(self):
+        b = b' \t\n\r\f\vabc \t\n\r\f\v'
+        self.assertEqual(b.strip(), b'abc')
+        self.assertEqual(b.lstrip(), b'abc \t\n\r\f\v')
+        self.assertEqual(b.rstrip(), b' \t\n\r\f\vabc')
+
+    def test_strip_buffer(self):
+        self.assertEqual(b'abc'.strip(buffer(b'ac')), b'b')
+        self.assertEqual(b'abc'.lstrip(buffer(b'ac')), b'bc')
+        self.assertEqual(b'abc'.rstrip(buffer(b'ac')), b'ab')
+
+    def test_strip_string_error(self):
+        self.assertRaises(TypeError, b'abc'.strip, 'b')
+        self.assertRaises(TypeError, b'abc'.lstrip, 'b')
+        self.assertRaises(TypeError, b'abc'.rstrip, 'b')
+
     def test_ord(self):
         b = b'\0A\x7f\x80\xff'
         self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],

Modified: python/branches/py3k/Objects/bytesobject.c
==============================================================================
--- python/branches/py3k/Objects/bytesobject.c	(original)
+++ python/branches/py3k/Objects/bytesobject.c	Mon Sep 10 18:53:45 2007
@@ -2104,7 +2104,7 @@
 Py_LOCAL_INLINE(PyObject *)
 split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
 {
-    register Py_ssize_t i, j, count=0;
+    register Py_ssize_t i, j, count = 0;
     PyObject *str;
     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
 
@@ -2113,7 +2113,7 @@
 
     i = j = 0;
     while ((j < len) && (maxcount-- > 0)) {
-        for(; j<len; j++) {
+        for(; j < len; j++) {
             /* I found that using memchr makes no difference */
             if (s[j] == ch) {
                 SPLIT_ADD(s, i, j);
@@ -2133,46 +2133,91 @@
     return NULL;
 }
 
+#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0)
+
+Py_LOCAL_INLINE(PyObject *)
+split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
+{
+    register Py_ssize_t i, j, count = 0;
+    PyObject *str;
+    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+    if (list == NULL)
+        return NULL;
+
+    for (i = j = 0; i < len; ) {
+	/* find a token */
+	while (i < len && ISSPACE(s[i]))
+	    i++;
+	j = i;
+	while (i < len && !ISSPACE(s[i]))
+	    i++;
+	if (j < i) {
+	    if (maxcount-- <= 0)
+		break;
+	    SPLIT_ADD(s, j, i);
+	    while (i < len && ISSPACE(s[i]))
+		i++;
+	    j = i;
+	}
+    }
+    if (j < len) {
+	SPLIT_ADD(s, j, len);
+    }
+    FIX_PREALLOC_SIZE(list);
+    return list;
+
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
 PyDoc_STRVAR(split__doc__,
-"B.split(sep [,maxsplit]) -> list of bytes\n\
+"B.split([sep [, maxsplit]]) -> list of bytes\n\
 \n\
-Return a list of the bytes in the string B, using sep as the\n\
-delimiter.  If maxsplit is given, at most maxsplit\n\
-splits are done.");
+Return a list of the bytes in the string B, using sep as the delimiter.\n\
+If sep is not given, B is split on ASCII whitespace characters\n\
+(space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
 
 static PyObject *
 bytes_split(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
-    Py_ssize_t maxsplit = -1, count=0;
+    Py_ssize_t maxsplit = -1, count = 0;
     const char *s = PyBytes_AS_STRING(self), *sub;
-    PyObject *list, *str, *subobj;
+    PyObject *list, *str, *subobj = Py_None;
+    PyBuffer vsub;
 #ifdef USE_FAST
     Py_ssize_t pos;
 #endif
 
-    if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit))
+    if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
         return NULL;
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
-    if (PyBytes_Check(subobj)) {
-        sub = PyBytes_AS_STRING(subobj);
-        n = PyBytes_GET_SIZE(subobj);
-    }
-    /* XXX -> use the modern buffer interface */
-    else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+
+    if (subobj == Py_None)
+        return split_whitespace(s, len, maxsplit);
+
+    if (_getbuffer(subobj, &vsub) < 0)
         return NULL;
+    sub = vsub.buf;
+    n = vsub.len;
 
     if (n == 0) {
         PyErr_SetString(PyExc_ValueError, "empty separator");
+        PyObject_ReleaseBuffer(subobj, &vsub);
         return NULL;
     }
-    else if (n == 1)
+    if (n == 1)
         return split_char(s, len, sub[0], maxsplit);
 
     list = PyList_New(PREALLOC_SIZE(maxsplit));
-    if (list == NULL)
+    if (list == NULL) {
+        PyObject_ReleaseBuffer(subobj, &vsub);
         return NULL;
+    }
 
 #ifdef USE_FAST
     i = j = 0;
@@ -2198,10 +2243,12 @@
 #endif
     SPLIT_ADD(s, i, len);
     FIX_PREALLOC_SIZE(list);
+    PyObject_ReleaseBuffer(subobj, &vsub);
     return list;
 
   onError:
     Py_DECREF(list);
+    PyObject_ReleaseBuffer(subobj, &vsub);
     return NULL;
 }
 
@@ -2293,44 +2340,90 @@
     return NULL;
 }
 
+Py_LOCAL_INLINE(PyObject *)
+rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
+{
+    register Py_ssize_t i, j, count = 0;
+    PyObject *str;
+    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+    if (list == NULL)
+        return NULL;
+
+    for (i = j = len - 1; i >= 0; ) {
+	/* find a token */
+	while (i >= 0 && Py_UNICODE_ISSPACE(s[i]))
+	    i--;
+	j = i;
+	while (i >= 0 && !Py_UNICODE_ISSPACE(s[i]))
+	    i--;
+	if (j > i) {
+	    if (maxcount-- <= 0)
+		break;
+	    SPLIT_ADD(s, i + 1, j + 1);
+	    while (i >= 0 && Py_UNICODE_ISSPACE(s[i]))
+		i--;
+	    j = i;
+	}
+    }
+    if (j >= 0) {
+	SPLIT_ADD(s, 0, j + 1);
+    }
+    FIX_PREALLOC_SIZE(list);
+    if (PyList_Reverse(list) < 0)
+        goto onError;
+
+    return list;
+
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
 PyDoc_STRVAR(rsplit__doc__,
 "B.rsplit(sep [,maxsplit]) -> list of bytes\n\
 \n\
-Return a list of the sections in the byte B, using sep as the\n\
-delimiter, starting at the end of the bytes and working\n\
-to the front.  If maxsplit is given, at most maxsplit splits are\n\
-done.");
+Return a list of the sections in the byte B, using sep as the delimiter,\n\
+starting at the end of the bytes and working to the front.\n\
+If sep is not given, B is split on ASCII whitespace characters\n\
+(space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
 
 static PyObject *
 bytes_rsplit(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
-    Py_ssize_t maxsplit = -1, count=0;
+    Py_ssize_t maxsplit = -1, count = 0;
     const char *s = PyBytes_AS_STRING(self), *sub;
-    PyObject *list, *str, *subobj;
+    PyObject *list, *str, *subobj = Py_None;
+    PyBuffer vsub;
 
-    if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit))
+    if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
         return NULL;
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
-    if (PyBytes_Check(subobj)) {
-        sub = PyBytes_AS_STRING(subobj);
-        n = PyBytes_GET_SIZE(subobj);
-    }
-    /* XXX -> Use the modern buffer interface */
-    else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+
+    if (subobj == Py_None)
+        return rsplit_whitespace(s, len, maxsplit);
+
+    if (_getbuffer(subobj, &vsub) < 0)
         return NULL;
+    sub = vsub.buf;
+    n = vsub.len;
 
     if (n == 0) {
         PyErr_SetString(PyExc_ValueError, "empty separator");
+        PyObject_ReleaseBuffer(subobj, &vsub);
         return NULL;
     }
     else if (n == 1)
         return rsplit_char(s, len, sub[0], maxsplit);
 
     list = PyList_New(PREALLOC_SIZE(maxsplit));
-    if (list == NULL)
+    if (list == NULL) {
+        PyObject_ReleaseBuffer(subobj, &vsub);
         return NULL;
+    }
 
     j = len;
     i = j - n;
@@ -2349,10 +2442,12 @@
     FIX_PREALLOC_SIZE(list);
     if (PyList_Reverse(list) < 0)
         goto onError;
+    PyObject_ReleaseBuffer(subobj, &vsub);
     return list;
 
 onError:
     Py_DECREF(list);
+    PyObject_ReleaseBuffer(subobj, &vsub);
     return NULL;
 }
 
@@ -2542,71 +2637,104 @@
 }
 
 PyDoc_STRVAR(strip__doc__,
-"B.strip(bytes) -> bytes\n\
+"B.strip([bytes]) -> bytes\n\
 \n\
-Strip leading and trailing bytes contained in the argument.");
+Strip leading and trailing bytes contained in the argument.\n\
+If the argument is omitted, strip ASCII whitespace.");
 static PyObject *
-bytes_strip(PyBytesObject *self, PyObject *arg)
+bytes_strip(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t left, right, mysize, argsize;
     void *myptr, *argptr;
-    if (arg == NULL || !PyBytes_Check(arg)) {
-        PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
-        return NULL;
+    PyObject *arg = Py_None;
+    PyBuffer varg;
+    if (!PyArg_ParseTuple(args, "|O:strip", &arg))
+        return NULL;
+    if (arg == Py_None) {
+        argptr = "\t\n\r\f\v ";
+        argsize = 6;
+    }
+    else {
+	    if (_getbuffer(arg, &varg) < 0)
+		    return NULL;
+	    argptr = varg.buf;
+	    argsize = varg.len;
     }
     myptr = self->ob_bytes;
     mysize = Py_Size(self);
-    argptr = ((PyBytesObject *)arg)->ob_bytes;
-    argsize = Py_Size(arg);
     left = lstrip_helper(myptr, mysize, argptr, argsize);
     if (left == mysize)
         right = left;
     else
         right = rstrip_helper(myptr, mysize, argptr, argsize);
+    if (arg != Py_None)
+	    PyObject_ReleaseBuffer(arg, &varg);
     return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
 }
 
 PyDoc_STRVAR(lstrip__doc__,
-"B.lstrip(bytes) -> bytes\n\
+"B.lstrip([bytes]) -> bytes\n\
 \n\
-Strip leading bytes contained in the argument.");
+Strip leading bytes contained in the argument.\n\
+If the argument is omitted, strip leading ASCII whitespace.");
 static PyObject *
-bytes_lstrip(PyBytesObject *self, PyObject *arg)
+bytes_lstrip(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t left, right, mysize, argsize;
     void *myptr, *argptr;
-    if (arg == NULL || !PyBytes_Check(arg)) {
-        PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
-        return NULL;
+    PyObject *arg = Py_None;
+    PyBuffer varg;
+    if (!PyArg_ParseTuple(args, "|O:lstrip", &arg))
+        return NULL;
+    if (arg == Py_None) {
+        argptr = "\t\n\r\f\v ";
+        argsize = 6;
+    }
+    else {
+	    if (_getbuffer(arg, &varg) < 0)
+		    return NULL;
+	    argptr = varg.buf;
+	    argsize = varg.len;
     }
     myptr = self->ob_bytes;
     mysize = Py_Size(self);
-    argptr = ((PyBytesObject *)arg)->ob_bytes;
-    argsize = Py_Size(arg);
     left = lstrip_helper(myptr, mysize, argptr, argsize);
     right = mysize;
+    if (arg != Py_None)
+	    PyObject_ReleaseBuffer(arg, &varg);
     return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
 }
 
 PyDoc_STRVAR(rstrip__doc__,
-"B.rstrip(bytes) -> bytes\n\
+"B.rstrip([bytes]) -> bytes\n\
 \n\
-Strip trailing bytes contained in the argument.");
+Strip trailing bytes contained in the argument.\n\
+If the argument is omitted, strip trailing ASCII whitespace.");
 static PyObject *
-bytes_rstrip(PyBytesObject *self, PyObject *arg)
+bytes_rstrip(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t left, right, mysize, argsize;
     void *myptr, *argptr;
-    if (arg == NULL || !PyBytes_Check(arg)) {
-        PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
-        return NULL;
+    PyObject *arg = Py_None;
+    PyBuffer varg;
+    if (!PyArg_ParseTuple(args, "|O:rstrip", &arg))
+        return NULL;
+    if (arg == Py_None) {
+        argptr = "\t\n\r\f\v ";
+        argsize = 6;
+    }
+    else {
+	    if (_getbuffer(arg, &varg) < 0)
+		    return NULL;
+	    argptr = varg.buf;
+	    argsize = varg.len;
     }
     myptr = self->ob_bytes;
     mysize = Py_Size(self);
-    argptr = ((PyBytesObject *)arg)->ob_bytes;
-    argsize = Py_Size(arg);
     left = 0;
     right = rstrip_helper(myptr, mysize, argptr, argsize);
+    if (arg != Py_None)
+	    PyObject_ReleaseBuffer(arg, &varg);
     return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
 }
 
@@ -2839,9 +2967,9 @@
     {"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__},
     {"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__},
     {"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__},
-    {"strip", (PyCFunction)bytes_strip, METH_O, strip__doc__},
-    {"lstrip", (PyCFunction)bytes_lstrip, METH_O, lstrip__doc__},
-    {"rstrip", (PyCFunction)bytes_rstrip, METH_O, rstrip__doc__},
+    {"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__},
+    {"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__},
+    {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
     {"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc},
     {"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc},
     {"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS,


More information about the Python-3000-checkins mailing list