[Python-checkins] cpython: Close #17694: Add minimum length to _PyUnicodeWriter
victor.stinner
python-checkins at python.org
Wed Apr 17 23:05:39 CEST 2013
http://hg.python.org/cpython/rev/edf029fc9591
changeset: 83433:edf029fc9591
user: Victor Stinner <victor.stinner at gmail.com>
date: Wed Apr 17 23:02:17 2013 +0200
summary:
Close #17694: Add minimum length to _PyUnicodeWriter
* Also add a min_char attribute to the _PyUnicodeWriter structure (currently unused)
* _PyUnicodeWriter_Init() no longer takes any argument other than the writer itself:
min_length and overallocate must now be set explicitly by the caller
* In error handlers, only enable overallocation if the replacement string
is longer than 1 character
* CJK decoders don't use overallocation anymore
* In many decoders, set min_length instead of preallocating memory with
_PyUnicodeWriter_Prepare()
* _PyUnicode_DecodeUnicodeInternal() now checks for integer overflow before computing min_length
files:
Include/unicodeobject.h | 20 +-
Modules/cjkcodecs/multibytecodec.c | 9 +-
Objects/complexobject.c | 2 +-
Objects/floatobject.c | 2 +-
Objects/longobject.c | 2 +-
Objects/stringlib/unicode_format.h | 6 +-
Objects/unicodeobject.c | 119 ++++++++--------
7 files changed, 85 insertions(+), 75 deletions(-)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -898,22 +898,28 @@
Py_UCS4 maxchar;
Py_ssize_t size;
Py_ssize_t pos;
- /* minimum length of the buffer when overallocation is enabled,
- see _PyUnicodeWriter_Init() */
+
+ /* minimum number of allocated characters (default: 0) */
Py_ssize_t min_length;
+
+ /* minimum character (default: 127, ASCII) */
+ Py_UCS4 min_char;
+
+ /* If non-zero, overallocate the buffer by 25% (default: 0). */
unsigned char overallocate;
+
/* If readonly is 1, buffer is a shared string (cannot be modified)
and size is set to 0. */
unsigned char readonly;
} _PyUnicodeWriter ;
/* Initialize a Unicode writer.
-
- If min_length is greater than zero, _PyUnicodeWriter_Prepare()
- overallocates the buffer and min_length is the minimum length in characters
- of the buffer. */
+ *
+ * By default, the minimum buffer size is 0 character and overallocation is
+ * disabled. Set min_length, min_char and overallocate attributes to control
+ * the allocation of the buffer. */
PyAPI_FUNC(void)
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
/* Prepare the buffer to write 'length' characters
with the specified maximum character.
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c
--- a/Modules/cjkcodecs/multibytecodec.c
+++ b/Modules/cjkcodecs/multibytecodec.c
@@ -633,7 +633,8 @@
return make_tuple(PyUnicode_New(0, 0), 0);
}
- _PyUnicodeWriter_Init(&buf.writer, datalen);
+ _PyUnicodeWriter_Init(&buf.writer);
+ buf.writer.min_length = datalen;
buf.excobj = NULL;
buf.inbuf = buf.inbuf_top = (unsigned char *)data;
buf.inbuf_end = buf.inbuf_top + datalen;
@@ -839,7 +840,7 @@
{
buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
buf->inbuf_end = buf->inbuf_top + size;
- _PyUnicodeWriter_Init(&buf->writer, size);
+ buf->writer.min_length += size;
return 0;
}
@@ -1037,7 +1038,7 @@
data = pdata.buf;
size = pdata.len;
- _PyUnicodeWriter_Init(&buf.writer, 1);
+ _PyUnicodeWriter_Init(&buf.writer);
buf.excobj = NULL;
origpending = self->pendingsize;
@@ -1241,7 +1242,7 @@
if (sizehint == 0)
return PyUnicode_New(0, 0);
- _PyUnicodeWriter_Init(&buf.writer, 1);
+ _PyUnicodeWriter_Init(&buf.writer);
buf.excobj = NULL;
cres = NULL;
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -705,7 +705,7 @@
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
return NULL;
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
ret = _PyComplex_FormatAdvancedWriter(
&writer,
self,
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -1711,7 +1711,7 @@
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
return NULL;
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
ret = _PyFloat_FormatAdvancedWriter(
&writer,
self,
diff --git a/Objects/longobject.c b/Objects/longobject.c
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4379,7 +4379,7 @@
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
return NULL;
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
ret = _PyLong_FormatAdvancedWriter(
&writer,
self,
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -906,7 +906,6 @@
int recursion_depth, AutoNumber *auto_number)
{
_PyUnicodeWriter writer;
- Py_ssize_t minlen;
/* check the recursion level */
if (recursion_depth <= 0) {
@@ -915,8 +914,9 @@
return NULL;
}
- minlen = PyUnicode_GET_LENGTH(input->str) + 100;
- _PyUnicodeWriter_Init(&writer, minlen);
+ _PyUnicodeWriter_Init(&writer);
+ writer.overallocate = 1;
+ writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
if (!do_markup(input, args, kwargs, &writer, recursion_depth,
auto_number)) {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2665,7 +2665,9 @@
const char *f;
_PyUnicodeWriter writer;
- _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
+ _PyUnicodeWriter_Init(&writer);
+ writer.min_length = strlen(format) + 100;
+ writer.overallocate = 1;
/* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
Copy it to be able to pass a reference to a subfunction. */
@@ -4117,7 +4119,10 @@
goto onError;
}
- writer->overallocate = 1;
+ if (PyUnicode_READY(repunicode) < 0)
+ goto onError;
+ if (PyUnicode_GET_LENGTH(repunicode) > 1)
+ writer->overallocate = 1;
if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
return
@@ -4256,9 +4261,8 @@
}
/* Start off assuming it's all ASCII. Widen later as necessary. */
- _PyUnicodeWriter_Init(&writer, 0);
- if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
- goto onError;
+ _PyUnicodeWriter_Init(&writer);
+ writer.min_length = size;
shiftOutStart = 0;
e = s + size;
@@ -4655,7 +4659,7 @@
return get_latin1_char((unsigned char)s[0]);
}
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError;
@@ -4910,7 +4914,7 @@
le = bo <= 0;
#endif
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
goto onError;
@@ -5149,7 +5153,7 @@
/* Note: size will always be longer than the resulting Unicode
character count */
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
goto onError;
@@ -5420,11 +5424,9 @@
and we determined it's exact size (common case)
or it contains \x, \u, ... escape sequences. then we create a
legacy wchar string and resize it at the end of this function. */
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
if (len > 0) {
- if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
- goto onError;
- assert(writer.kind == PyUnicode_1BYTE_KIND);
+ writer.min_length = len;
}
else {
/* Escaped strings will always be longer than the resulting
@@ -5432,8 +5434,7 @@
length after conversion to the true value.
(but if the error callback returns a long replacement string
we'll have to allocate more space) */
- if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
- goto onError;
+ writer.min_length = size;
}
if (size == 0)
@@ -5461,10 +5462,6 @@
if (s > end)
c = '\0'; /* Invalid after \ */
- /* The only case in which i == ascii_length is a backslash
- followed by a newline. */
- assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
-
switch (c) {
/* \x escapes */
@@ -5787,9 +5784,8 @@
Unicode string, so we start with size here and then reduce the
length after conversion to the true value. (But decoding error
handler might have to resize the string) */
- _PyUnicodeWriter_Init(&writer, 1);
- if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
- goto onError;
+ _PyUnicodeWriter_Init(&writer);
+ writer.min_length = size;
end = s + size;
while (s < end) {
@@ -5982,12 +5978,14 @@
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
- /* XXX overflow detection missing */
- _PyUnicodeWriter_Init(&writer, 0);
- if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
+ _PyUnicodeWriter_Init(&writer);
+ if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
+ PyErr_NoMemory();
goto onError;
+ }
+ writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
+
end = s + size;
-
while (s < end) {
Py_UNICODE uch;
Py_UCS4 ch;
@@ -6429,9 +6427,9 @@
if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]);
- _PyUnicodeWriter_Init(&writer, 0);
- if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
- goto onError;
+ _PyUnicodeWriter_Init(&writer);
+ if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0)
+ return NULL;
e = s + size;
data = writer.data;
@@ -7280,7 +7278,7 @@
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError;
@@ -7312,7 +7310,7 @@
ch = *s;
x = mapdata_ucs1[ch];
if (x > maxchar) {
- if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
+ if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1)
goto onError;
maxchar = writer.maxchar;
outdata = (Py_UCS1 *)writer.data;
@@ -12841,21 +12839,27 @@
Py_LOCAL_INLINE(void)
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
- writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ if (!writer->readonly)
+ writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ else {
+ /* Copy-on-write mode: set buffer size to 0 so
+ * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
+ * next write. */
+ writer->size = 0;
+ }
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
writer->data = PyUnicode_DATA(writer->buffer);
writer->kind = PyUnicode_KIND(writer->buffer);
}
void
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
memset(writer, 0, sizeof(*writer));
#ifdef Py_DEBUG
writer->kind = 5; /* invalid kind */
#endif
- writer->min_length = Py_MAX(min_length, 100);
- writer->overallocate = (min_length > 0);
+ writer->min_char = 127;
}
int
@@ -12873,29 +12877,28 @@
}
newlen = writer->pos + length;
+ maxchar = MAX_MAXCHAR(maxchar, writer->min_char);
+
if (writer->buffer == NULL) {
- if (writer->overallocate) {
+ assert(!writer->readonly);
+ if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
/* overallocate 25% to limit the number of resize */
- if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
- newlen += newlen / 4;
- if (newlen < writer->min_length)
- newlen = writer->min_length;
- }
+ newlen += newlen / 4;
+ }
+ if (newlen < writer->min_length)
+ newlen = writer->min_length;
+
writer->buffer = PyUnicode_New(newlen, maxchar);
if (writer->buffer == NULL)
return -1;
- _PyUnicodeWriter_Update(writer);
- return 0;
- }
-
- if (newlen > writer->size) {
- if (writer->overallocate) {
+ }
+ else if (newlen > writer->size) {
+ if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
/* overallocate 25% to limit the number of resize */
- if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
- newlen += newlen / 4;
- if (newlen < writer->min_length)
- newlen = writer->min_length;
- }
+ newlen += newlen / 4;
+ }
+ if (newlen < writer->min_length)
+ newlen = writer->min_length;
if (maxchar > writer->maxchar || writer->readonly) {
/* resize + widen */
@@ -12913,7 +12916,6 @@
return -1;
}
writer->buffer = newbuffer;
- _PyUnicodeWriter_Update(writer);
}
else if (maxchar > writer->maxchar) {
assert(!writer->readonly);
@@ -12924,8 +12926,8 @@
writer->buffer, 0, writer->pos);
Py_DECREF(writer->buffer);
writer->buffer = newbuffer;
- _PyUnicodeWriter_Update(writer);
- }
+ }
+ _PyUnicodeWriter_Update(writer);
return 0;
}
@@ -12959,11 +12961,10 @@
maxchar = PyUnicode_MAX_CHAR_VALUE(str);
if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
if (writer->buffer == NULL && !writer->overallocate) {
+ writer->readonly = 1;
Py_INCREF(str);
writer->buffer = str;
_PyUnicodeWriter_Update(writer);
- writer->readonly = 1;
- writer->size = 0;
writer->pos += len;
return 0;
}
@@ -13080,7 +13081,7 @@
if (PyUnicode_READY(self) == -1)
return NULL;
- _PyUnicodeWriter_Init(&writer, 0);
+ _PyUnicodeWriter_Init(&writer);
ret = _PyUnicode_FormatAdvancedWriter(&writer,
self, format_spec, 0,
PyUnicode_GET_LENGTH(format_spec));
@@ -14164,7 +14165,9 @@
ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
ctx.fmtpos = 0;
- _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
+ _PyUnicodeWriter_Init(&ctx.writer);
+ ctx.writer.min_length = ctx.fmtcnt + 100;
+ ctx.writer.overallocate = 1;
if (PyTuple_Check(args)) {
ctx.arglen = PyTuple_Size(args);
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins
mailing list