[pypy-svn] r12232 - pypy/dist/pypy/lib
ale at codespeak.net
ale at codespeak.net
Fri May 13 12:39:48 CEST 2005
Author: ale
Date: Fri May 13 12:39:48 2005
New Revision: 12232
Modified:
pypy/dist/pypy/lib/inprogress__codecs.py
pypy/dist/pypy/lib/unicodecodec.py
Log:
All functions of _codecsmodule.c are now exposed by _codecs.py.
All the codecs exposed by _codecs.py are now implemented in unicodecodec.py. Unfortunately, a lot of the codec testing is done in test_unicode.py, which doesn't work for obvious reasons, so the test coverage is rather poor.
Modified: pypy/dist/pypy/lib/inprogress__codecs.py
==============================================================================
--- pypy/dist/pypy/lib/inprogress__codecs.py (original)
+++ pypy/dist/pypy/lib/inprogress__codecs.py Fri May 13 12:39:48 2005
@@ -34,7 +34,7 @@
Copyright (c) Corporation for National Research Initiatives.
"""
-from unicodecodec import *
+from unicodecodec_ import *
#/* --- Registry ----------------------------------------------------------- */
codec_search_path = []
@@ -107,6 +107,7 @@
"""None
"""
res = PyUnicode_EncodeLatin1(obj,len(obj),errors)
+ res = ''.join(res)
return res, len(res)
# XXX MBCS codec might involve ctypes ?
def mbcs_decode():
@@ -131,53 +132,65 @@
"""None
"""
res = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
+ res = ''.join(res)
return res,len(res)
def raw_unicode_escape_decode( data,errors='strict'):
"""None
"""
res = PyUnicode_DecodeRawUnicodeEscape(data, len(data), errors)
+ res = ''.join(res)
return res,len(res)
def utf_7_decode( data,errors='strict'):
"""None
"""
- unistr = PyUnicode_DecodeUTF7(data,errors='strict')
- return unistr,len(unistr)
-# XXX unicode_escape_encode
+ res = PyUnicode_DecodeUTF7(data,errors='strict')
+ res = ''.join(res)
+ return res,len(res)
+
def unicode_escape_encode( obj,errors='strict'):
"""None
"""
- pass
-# XXX latin_1_decode
+ res = PyUnicode_EncodeUnicodeEscape(data,len(data),errors)
+ res = ''.join(res)
+ return res, len(res)
+
def latin_1_decode( data,errors='strict'):
"""None
"""
- pass
-# XXX utf_16_decode
-def utf_16_decode( data,errors='strict'):
+ res = PyUnicode_DecodeLatin1(data,len(data),errors)
+ res = ''.join(res)
+ return res, len(res)
+
+def utf_16_decode( data,errors='strict',final=None):
"""None
"""
- pass
+ res = PyUnicode_DecodeUTF16Stateful(data,len(data),errors)
+ res = ''.join(res)
+ return res, len(res)
def unicode_escape_decode( data,errors='strict'):
"""None
"""
- unistr = PyUnicode_DecodeUnicodeEscape(data,len(data),errors)
- return unistr,len(unistr)
+ res = PyUnicode_DecodeUnicodeEscape(data,len(data),errors)
+ res = ''.join(res)
+ return res, len(res)
def ascii_decode( data,errors='strict'):
"""None
"""
res = PyUnicode_DecodeASCII(data,len(data),errors)
- return res,len(res)
+ res = ''.join(res)
+ return res, len(res)
def charmap_encode(obj,errors='strict',mapping='latin-1'):
"""None
"""
res = PyUnicode_EncodeCharmap(obj,len(obj),mapping,errors)
- return res,len(res)
+ res = ''.join(res)
+ return res, len(res)
def unicode_internal_encode( obj,errors='strict'):
"""None
@@ -185,12 +198,14 @@
if type(obj) == unicode:
return obj, len(obj)
else:
- return PyUnicode_FromUnicode(obj,size),size
-# XXX utf_16_ex_decode
+ return ''.join(PyUnicode_FromUnicode(obj,size),size)
+
def utf_16_ex_decode( data,errors='strict'):
"""None
"""
- pass
+ res = PyUnicode_DecodeUTF16Stateful(data,len(data),errors,'native')
+ res = ''.join(res)
+ return res, len(res)
# XXX escape_decode Check if this is right
def escape_decode(data,errors='strict'):
"""None
@@ -201,20 +216,23 @@
"""None
"""
res = str(obj)
- return res,len(res)
+ res = ''.join(res)
+ return res, len(res)
def charmap_decode( data,errors='strict',mapping=None):
"""None
"""
res = PyUnicode_DecodeCharmap(data, len(data), mapping, errors)
- return res,len(res)
+ res = ''.join(res)
+ return res, len(res)
def utf_7_encode( obj,errors='strict'):
"""None
"""
res = PyUnicode_EncodeUTF7(obj,len(obj),0,0,errors)
- return res,len(res)
+ res = ''.join(res)
+ return res, len(res)
def mbcs_encode( obj,errors='strict'):
"""None
@@ -230,40 +248,43 @@
"""None
"""
res = PyUnicode_EncodeASCII(obj,len(obj),errors)
- return res,len(res)
-##(PyUnicode_EncodeASCII(
-## PyUnicode_AS_UNICODE(obj),
-## PyUnicode_GET_SIZE(obj),
-## errors),
-## PyUnicode_GET_SIZE(obj))
+ res = ''.join(res)
+ return res, len(res)
def utf_16_encode( obj,errors='strict'):
"""None
"""
- u = PyUnicode_EncodeUTF16(obj,len(obj),errors)
- return u,len(u)
+ res = PyUnicode_EncodeUTF16(obj,len(obj),errors)
+ res = ''.join(res)
+ return res, len(res)
def raw_unicode_escape_encode( obj,errors='strict'):
"""None
"""
res = PyUnicode_EncodeRawUnicodeEscape(obj,len(obj))
- return res,len(res)
+ res = ''.join(res)
+ return res, len(res)
def utf_8_encode( obj,errors='strict'):
"""None
"""
res = PyUnicode_EncodeUTF8(obj,len(obj),errors)
- return res,len(res)
-# XXX utf_16_le_encode
+ res = ''.join(res)
+ return res, len(res)
+
def utf_16_le_encode( obj,errors='strict'):
"""None
"""
- pass
-# XXX utf_16_be_encode
+ res = PyUnicode_EncodeUTF16(obj,len(obj),errors,'little')
+ res = ''.join(res)
+ return res, len(res)
+
def utf_16_be_encode( obj,errors='strict'):
"""None
"""
- pass
+ res = PyUnicode_EncodeUTF16(obj,len(obj),errors,'big')
+ res = ''.join(res)
+ return res, len(res)
def unicode_internal_decode( unistr,errors='strict'):
"""None
@@ -272,16 +293,20 @@
return unistr,len(unistr)
else:
return unicode(unistr),len(unistr)
-# XXX utf_16_le_decode
+
def utf_16_le_decode( data,errors='strict'):
"""None
"""
- pass
-# XXX utf_16_be_decode
+ res = PyUnicode_DecodeUTF16Stateful(data,len(data),errors,'little')
+ res = ''.join(res)
+ return res, len(res)
+
def utf_16_be_decode( data,errors='strict'):
"""None
"""
- pass
+ res = PyUnicode_DecodeUTF16Stateful(data,len(data),errors,'big')
+ res = ''.join(res)
+ return res, len(res)
def strict_errors(exc):
if isinstance(exc,Exception):
Modified: pypy/dist/pypy/lib/unicodecodec.py
==============================================================================
--- pypy/dist/pypy/lib/unicodecodec.py (original)
+++ pypy/dist/pypy/lib/unicodecodec.py Fri May 13 12:39:48 2005
@@ -1,4 +1,12 @@
import sys
+""" Python implementation of CPythons builtin unicode codecs.
+
+ Generally the functions in this module take a list of characters an returns
+ a list of characters.
+
+ For use in the PyPy project"""
+
+
## indicate whether a UTF-7 character is special i.e. cannot be directly
## encoded:
## 0 - not special
@@ -16,7 +24,7 @@
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
]
-unicode_latin1=[]
+unicode_latin1=[None]*256
codec_error_registry = {}
def lookup_error(errors):
@@ -46,8 +54,6 @@
else:
raise TypeError("handler must be callable")
-for i in range(256):
- unicode_latin1.append(None)
def PyUnicode_Check(op):
return type(op) == unicode
@@ -85,17 +91,19 @@
else:
return ord(c) + 4
-def ENCODE(out, ch, bits) :
+def ENCODE( ch, bits) :
charvalue = 0
+ out = []
for c in ch:
charvalue <<= 16
charvalue += ord(c)
while (bits >= 6):
out += B64(charvalue >> (bits-6))
bits -= 6;
- return out,ch,bits
+ return out,bits
-def DECODE(out, ch, bits, surrogate):
+def DECODE( ch, bits, surrogate):
+ out = []
while (bits >= 16):
outCh = unicode (chr((ord(ch) >> (bits-16)) & 0xffff))
bits -= 16
@@ -110,7 +118,7 @@
raise UnicodeDecodeError,"code pairs are not supported"
else:
out += outCh
- return ''.join(out),ch,bits,surrogate
+ return out,bits,surrogate
def PyUnicode_DecodeUTF7(s, size, errors):
@@ -134,7 +142,8 @@
if ((ch == '-') or not B64CHAR(ch)):
inShift = 0
i += 1
- p, charsleft, bitsleft, surrogate = DECODE(p, charsleft, bitsleft, surrogate);
+ out, bitsleft, surrogate = DECODE(charsleft, bitsleft, surrogate)
+ p += out
if (bitsleft >= 6):
## /* The shift sequence has a partial character in it. If
## bitsleft < 6 then we could just classify it as padding
@@ -185,7 +194,7 @@
endinpos = size;
raise UnicodeDecodeError, "unterminated shift sequence"
- return unicode(''.join(p))
+ return p
def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
@@ -204,7 +213,8 @@
charsleft = ch
bitsleft = 16
out += '+'
- out, charsleft, bitsleft = ENCODE(out, charsleft, bitsleft)
+ p, bitsleft = ENCODE( charsleft, bitsleft)
+ out += p
inShift = bitsleft > 0
else:
out += ch
@@ -249,7 +259,7 @@
out += [B64(ord(cc) << (6-bitsleft) ) for cc in charsleft]
out += '-'
- return ''.join(out)
+ return out
def PyUnicode_FromOrdinal(ordinal):
@@ -289,14 +299,6 @@
## /* Single character Unicode objects in the Latin-1 range are
## shared when using this constructor */
- if (size == 1 and ord(u) < 256) :
- result = unicode_latin1[ord(u)]
- if (not result):
- result = unicode(u)
- unicode_latin1[ord(u)] = result
- if (not result):
- return None
- return result
return unicode(u)
def PyUnicode_Decode(s,size,encoding,errors):
@@ -321,6 +323,80 @@
v = PyUnicode_Decode(s, len(s), encoding, errors)
return v
+def unicodeescape_string(s, size, quotes):
+
+
+ p = []
+ if (quotes) :
+ p += 'u'
+ if (s.find('\'')!=-1 and s.find('"')==-1):
+ p += '"'
+ else:
+ p += '\''
+ pos = 0
+ while (pos < size):
+ ch = s[pos]
+ #/* Escape quotes */
+ if (quotes and (ch == p[1] or ch == '\\')):
+ p += '\\'
+ p += ch
+ continue
+
+#ifdef Py_UNICODE_WIDE
+ #/* Map 21-bit characters to '\U00xxxxxx' */
+ elif (ord(ch) >= 0x10000):
+ p += '\\'
+ p += 'U'
+ p += '%08x'%ord(ch)
+ continue
+#endif
+ #/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
+ elif (ord(ch) >= 0xD800 and ord(ch) < 0xDC00):
+ pos += 1
+ ch2 = s[pos]
+
+ if (ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF):
+ ucs = (((ord(ch) & 0x03FF) << 10) | (ord(ch2) & 0x03FF)) + 0x00010000
+ p += '\\'
+ p += 'U'
+ p += '%08x'%ucs
+ continue
+
+ #/* Fall through: isolated surrogates are copied as-is */
+ pos -= 1
+
+ #/* Map 16-bit characters to '\uxxxx' */
+ if (ord(ch) >= 256):
+ p += '\\'
+ p += 'u'
+ p += '%04x'%ord(ch)
+
+ #/* Map special whitespace to '\t', \n', '\r' */
+ elif (ch == '\t'):
+ p += '\\'
+ p += 't'
+
+ elif (ch == '\n'):
+ p += '\\'
+ p += 'n'
+
+ elif (ch == '\r'):
+ p += '\\'
+ p += 'r'
+
+ #/* Map non-printable US ASCII to '\xhh' */
+ elif (ch < ' ' or ch >= 0x7F) :
+ p += '\\'
+ p += 'x'
+ p += '%02x'%ord(ch)
+ #/* Copy everything else as-is */
+ else:
+ p += ch
+
+ if (quotes):
+ p += p[1]
+ return p
+
def PyUnicode_DecodeASCII(s, size, errors):
# /* ASCII is equivalent to the first 128 ordinals in Unicode. */
@@ -336,16 +412,17 @@
p += c
pos += 1
else:
+
res = unicode_call_errorhandler(
errors, "ascii", "ordinal not in range(128)",
s, pos, pos+1)
p += res[0]
pos = res[1]
- return ''.join(p) #(encoding,p,collstart,collend,reason)
+ return p
def PyUnicode_EncodeASCII(p,size,errors):
- return u''.join(unicode_encode_ucs1(p, size, errors, 128))
+ return unicode_encode_ucs1(p, size, errors, 128)
def PyUnicode_AsASCIIString(unistr):
@@ -355,159 +432,119 @@
len(unicode),
None)
-##def PyUnicode_DecodeUTF16Stateful(s,size,errors,byteorder,consumed):
-##
-## bo = 0; /* assume native ordering by default */
-## errmsg = "";
-## /* Offsets from q for retrieving byte pairs in the right order. */
-###ifdef BYTEORDER_IS_LITTLE_ENDIAN
-## int ihi = 1, ilo = 0;
-###else
-## int ihi = 0, ilo = 1;
-###endif
-## PyObject *errorHandler = NULL;
-## PyObject *exc = NULL;
-##
-## /* Note: size will always be longer than the resulting Unicode
-## character count */
-## unicode = _PyUnicode_New(size);
-## if (!unicode)
-## return NULL;
-## if (size == 0)
-## return (PyObject *)unicode;
-##
-## /* Unpack UTF-16 encoded data */
-## p = unicode->str;
-## q = (unsigned char *)s;
-## e = q + size;
-##
-## if (byteorder)
-## bo = *byteorder;
-##
+def PyUnicode_DecodeUTF16Stateful(s,size,errors,byteorder='native',consumed=None):
+
+ bo = 0 #/* assume native ordering by default */
+ errmsg = ""
+
+ if sys.byteorder == 'little':
+ ihi = 1
+ ilo = 0
+ else:
+ ihi = 0
+ ilo = 1
+
+ if (size == 0):
+ return [u'']
+
+ #/* Unpack UTF-16 encoded data */
+
## /* Check for BOM marks (U+FEFF) in the input and adjust current
## byte order setting accordingly. In native mode, the leading BOM
## mark is skipped, in all other modes, it is copied to the output
## stream as-is (giving a ZWNBSP character). */
-## if (bo == 0) {
-## if (size >= 2) {
-## const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
-###ifdef BYTEORDER_IS_LITTLE_ENDIAN
-## if (bom == 0xFEFF) {
-## q += 2;
-## bo = -1;
-## }
-## else if (bom == 0xFFFE) {
-## q += 2;
-## bo = 1;
-## }
-###else
-## if (bom == 0xFEFF) {
-## q += 2;
-## bo = 1;
-## }
-## else if (bom == 0xFFFE) {
-## q += 2;
-## bo = -1;
-## }
-###endif
-## }
-## }
-##
-## if (bo == -1) {
-## /* force LE */
-## ihi = 1;
-## ilo = 0;
-## }
-## else if (bo == 1) {
-## /* force BE */
-## ihi = 0;
-## ilo = 1;
-## }
-##
-## while (q < e) {
-## Py_UNICODE ch;
-## /* remaining bytes at the end? (size should be even) */
-## if (e-q<2) {
-## if (consumed)
-## break;
-## errmsg = "truncated data";
-## startinpos = ((const char *)q)-starts;
-## endinpos = ((const char *)e)-starts;
-## goto utf16Error;
-## /* The remaining input chars are ignored if the callback
-## chooses to skip the input */
-## }
-## ch = (q[ihi] << 8) | q[ilo];
-##
-## q += 2;
-##
-## if (ch < 0xD800 || ch > 0xDFFF) {
-## *p++ = ch;
-## continue;
-## }
-##
-## /* UTF-16 code pair: */
-## if (q >= e) {
-## errmsg = "unexpected end of data";
-## startinpos = (((const char *)q)-2)-starts;
-## endinpos = ((const char *)e)-starts;
-## goto utf16Error;
-## }
-## if (0xD800 <= ch && ch <= 0xDBFF) {
-## Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
-## q += 2;
-## if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
-###ifndef Py_UNICODE_WIDE
-## *p++ = ch;
-## *p++ = ch2;
-###else
-## *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
-###endif
-## continue;
-## }
-## else {
-## errmsg = "illegal UTF-16 surrogate";
-## startinpos = (((const char *)q)-4)-starts;
-## endinpos = startinpos+2;
-## goto utf16Error;
-## }
-##
-## }
-## errmsg = "illegal encoding";
-## startinpos = (((const char *)q)-2)-starts;
-## endinpos = startinpos+2;
-## /* Fall through to report the error */
-##
-## utf16Error:
-## outpos = p-PyUnicode_AS_UNICODE(unicode);
-## if (unicode_decode_call_errorhandler(
-## errors, &errorHandler,
-## "utf16", errmsg,
-## starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
-## (PyObject **)&unicode, &outpos, &p))
-## goto onError;
-## }
-##
-## if (byteorder)
-## *byteorder = bo;
-##
-## if (consumed)
-## *consumed = (const char *)q-starts;
-##
-## /* Adjust length */
-## if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
-## goto onError;
-##
-## Py_XDECREF(errorHandler);
-## Py_XDECREF(exc);
-## return (PyObject *)unicode;
+ q = 0
+ if byteorder == 'native':
+ if (size >= 2):
+ bom = (ord(s[ihi]) << 8) | ord(s[ilo])
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ if sys.byteorder == 'little':
+ if (bom == 0xFEFF):
+ q += 2
+ bo = -1
+ elif bom == 0xFFFE:
+ q += 2
+ bo = 1
+ else:
+ if bom == 0xFEFF:
+ q += 2
+ bo = 1
+ elif bom == 0xFFFE:
+ q += 2
+ bo = -1
+ elif byteorder == 'little':
+ bo = -1
+ else:
+ bo = 1
+
+ if (bo == -1):
+ #/* force LE */
+ ihi = 1
+ ilo = 0
+
+ elif (bo == 1):
+ #/* force BE */
+ ihi = 0
+ ilo = 1
+
+ while (q < len(s)):
+
+ #/* remaining bytes at the end? (size should be even) */
+ if (len(s)-q<2):
+ if (consumed):
+ break
+ errmsg = "truncated data";
+ startinpos = q
+ endinpos = len(s)
+ unicode_call_errorhandler()
+## /* The remaining input chars are ignored if the callback
+## chooses to skip the input */
+
+ ch = (s[q+ihi] << 8) | s[q+ilo]
+ q += 2
+
+ if (ch < 0xD800 or ch > 0xDFFF):
+ p += unichr(ch)
+ continue
+
+ #/* UTF-16 code pair: */
+ if (q >= e):
+ errmsg = "unexpected end of data";
+ startinpos = q-2
+ endinpos = len(s)
+ unicode_call_errorhandler
+
+ if (0xD800 <= ch and ch <= 0xDBFF):
+ ch2 = (s[q+ihi] << 8) | s[q+ilo]
+ q += 2
+ if (0xDC00 <= ch2 and ch2 <= 0xDFFF):
+ #ifndef Py_UNICODE_WIDE
+ if sys.maxunicode < 65536:
+ p += unichr(ch)
+ p += unichr(ch2)
+ else:
+ p += unichr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)
+ #endif
+ continue
+
+ else:
+ errmsg = "illegal UTF-16 surrogate";
+ startinpos = q-4
+ endinpos = startinpos+2
+ unicode_call_errorhandler
+
+ errmsg = "illegal encoding";
+ startinpos = q-2
+ endinpos = startinpos+2
+ unicode_call_errorhandler
+
+
+## if (byteorder):
+## byteorder = bo
##
-##onError:
-## Py_DECREF(unicode);
-## Py_XDECREF(errorHandler);
-## Py_XDECREF(exc);
-## return NULL;
-##}
+## if (consumed):
+## consumed = (const char *)q-starts;
+ return p
def PyUnicode_EncodeUTF16(s,size,errors,byteorder='little'):
@@ -553,7 +590,7 @@
if (ch2):
p +=STORECHAR(ch2,bom)
- return ''.join(p)
+ return p
def PyUnicode_DecodeMBCS(s, size, errors):
@@ -581,8 +618,8 @@
## return (PyObject *)v;
##}
-##def PyUnicode_EncodeMBCS(p, size, errors):
-##
+def PyUnicode_EncodeMBCS(p, size, errors):
+ pass
#### /* If there are no characters, bail now! */
## if (size==0)
## return ""
@@ -804,7 +841,7 @@
if (consumed):
consumed = pos
- return u''.join(p)
+ return p
def PyUnicode_EncodeUTF8(s,size,errors):
@@ -836,50 +873,33 @@
p.extend(encodeUCS4(ch3))
continue
## /* Fall through: handles isolated high surrogates */
- p.append (chr((0xe0 | (ord(ch) >> 12))))
- p.append (chr((0x80 | ((ord(ch) >> 6) & 0x3f))))
- p.append (chr((0x80 | (ord(ch) & 0x3f))))
+ p += (chr((0xe0 | (ord(ch) >> 12))))
+ p += (chr((0x80 | ((ord(ch) >> 6) & 0x3f))))
+ p += (chr((0x80 | (ord(ch) & 0x3f))))
continue
- return ''.join(p)
+ return p
def encodeUCS4(ch):
## /* Encode UCS4 Unicode ordinals */
p=[]
- p.append (chr((0xf0 | (ch >> 18))))
- p.append (chr((0x80 | ((ch >> 12) & 0x3f))))
- p.append (chr((0x80 | ((ch >> 6) & 0x3f))))
- p.append (chr((0x80 | (ch & 0x3f))))
+ p += (chr((0xf0 | (ch >> 18))))
+ p += (chr((0x80 | ((ch >> 12) & 0x3f))))
+ p += (chr((0x80 | ((ch >> 6) & 0x3f))))
+ p += (chr((0x80 | (ch & 0x3f))))
return p
#/* --- Latin-1 Codec ------------------------------------------------------ */
def PyUnicode_DecodeLatin1(s, size, errors):
- pass
-##{
-## PyUnicodeObject *v;
-## Py_UNICODE *p;
-##
-## /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
-## if (size == 1) {
-## Py_UNICODE r = *(unsigned char*)s;
-## return PyUnicode_FromUnicode(&r, 1);
-## }
-##
-## v = _PyUnicode_New(size);
-## if (v == NULL)
-## goto onError;
-## if (size == 0)
-## return (PyObject *)v;
-## p = PyUnicode_AS_UNICODE(v);
-## while (size-- > 0)
-## *p++ = (unsigned char)*s++;
-## return (PyObject *)v;
-##
-## onE rror:
-## Py_XDECREF(v);
-## return NULL;
-##}
-
+ #/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
+ if (size == 1):
+ return [PyUnicode_FromUnicode(s, 1)]
+ pos = 0
+ p = []
+ while (pos < size):
+ p += s[pos]
+ pos += 1
+ return p
def unicode_encode_ucs1(p,size,errors,limit):
@@ -897,6 +917,10 @@
while pos < len(p):
#for ch in p:
ch = p[pos]
+ try:
+ ord(ch)
+ except TypeError:
+ print "Typeerror",ch,type(ch)
if ord(ch) < limit:
res += chr(ord(ch))
pos += 1
@@ -909,11 +933,11 @@
x = unicode_call_errorhandler(errors,encoding,reason,p,collstart,collend,False)
res += str(x[0])
pos = x[1]
- return res #u''.join(res)
+ return res
def PyUnicode_EncodeLatin1(p,size,errors):
res=unicode_encode_ucs1(p, size, errors, 256)
- return ''.join(res)
+ return res
hexdigits = [hex(i)[-1] for i in range(16)]+[hex(i)[-1].upper() for i in range(10,16)]
def hexescape(s,pos,digits,message,errors):
@@ -931,30 +955,28 @@
endinpos = pos
while s[endinpos] in hexdigits:
endinpos +=1
- #message = "Find den rigtige fejl meddelelse"
x = unicode_call_errorhandler(errors,"unicodeescape",message,s,pos-2,
endinpos+1)
p += x[0]
pos = x[1]
- # /* when we get here, chr is a 32-bit unicode character */
+ #/* when we get here, chr is a 32-bit unicode character */
else:
if chr < sys.maxunicode:
p += [unichr(chr)]
pos += digits
- #else
+
elif (chr <= 0x10ffff):
chr -= 0x10000L
p += unichr(0xD800 + (chr >> 10))
p += unichr(0xDC00 + (chr & 0x03FF))
pos += digits
- #endif
else:
- message = "Find den rigtige fejl meddelelse"
+ message = "illegal Unicode character"
x = unicode_call_errorhandler(errors,"unicodeescape",message,s,pos-2,
pos+1)
p += x[0]
pos = x[1]
- res = ''.join(p)
+ res = p
return res,pos
def PyUnicode_DecodeUnicodeEscape(s, size, errors):
@@ -1027,69 +1049,43 @@
x = hexescape(s,pos+1,digits,message,errors)
p += x[0]
pos = x[1]
-
-
## /* \N{name} */
-## elif ch == 'N':
-## message = "malformed \\N character escape";
-## if (ucnhash_CAPI == NULL) {
-## /* load the unicode data module */
-## PyObject *m, *v;
-## m = PyImport_ImportModule("unicodedata");
-## if (m == NULL)
-## goto ucnhashError;
-## v = PyObject_GetAttrString(m, "ucnhash_CAPI");
-## Py_DECREF(m);
-## if (v == NULL)
-## goto ucnhashError;
-## ucnhash_CAPI = PyCObject_AsVoidPtr(v);
-## Py_DECREF(v);
-## if (ucnhash_CAPI == NULL)
-## goto ucnhashError;
-## }
-## if (*s == '{') {
-## const char *start = s+1;
-## /* look for the closing brace */
-## while (*s != '}' && s < end)
-## s++;
-## if (s > start && s < end && *s == '}') {
-## /* found a name. look it up in the unicode database */
-## message = "unknown Unicode character name";
-## s++;
-## if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
-## goto store;
-## }
-## }
-## endinpos = s-starts;
-## outpos = p-PyUnicode_AS_UNICODE(v);
-## if (unicode_decode_call_errorhandler(
-## errors, &errorHandler,
-## "unicodeescape", message,
-## starts, size, &startinpos, &endinpos, &exc, &s,
-## (PyObject **)&v, &outpos, &p))
-## goto onError;
-## break;
+ elif ch == 'N':
+ message = "malformed \\N character escape"
+ try:
+ import unicodedata
+ except ImportError:
+ message = "\\N escapes not supported (can't load unicodedata module)"
+ unicode_call_errorhandler(errors,"unicodeescape",message,s,pos-1,size)
+ if (s[pos] == '{'):
+ look = pos+1
+ #/* look for the closing brace */
+ while (s[look] != '}' and look < size):
+ look += 1
+ if (look > pos+1 and look < size and s[look] == '}'):
+ #/* found a name. look it up in the unicode database */
+ message = "unknown Unicode character name"
+ look += 1
+ try:
+ chr = unicodedata.lookup(s[pos:look])
+ except KeyError:
+ x=unicode_call_errorhandler(errors,"unicodeescape",message,s,pos-1,size)
+ else:
+ x = hexescape(s,pos+1,look-pos,message,errors)
+ p += x[0]
+ pos = x[1]
else:
- p += '\\'
- p += s[pos]
- if (pos > size):
- message = "\\ at end of string"
-## endinpos = s-starts;
-## outpos = p-PyUnicode_AS_UNICODE(v);
- handler = lookup_error(errors)
- x = handler(UnicodeDecodeError("unicodeescape",s,pos,
- pos+digits,message))
- p += x[0]
- pos = x[1]
-## if (unicode_call_errorhandler(
-## errors, &errorHandler,
-## "unicodeescape", message,
-## starts, size, &startinpos, &endinpos, &exc, &s,
-## (PyObject **)&v, &outpos, &p))
-## goto onError;
-
-
- return ''.join(p)
+ if (pos > size):
+ message = "\\ at end of string"
+ handler = lookup_error(errors)
+ x = handler(UnicodeDecodeError("unicodeescape",s,pos,
+ pos+digits,message))
+ p += x[0]
+ pos = x[1]
+ else:
+ p += '\\'
+ p += s[pos]
+ return p
def PyUnicode_EncodeRawUnicodeEscape(s,size):
@@ -1113,7 +1109,7 @@
p += ch
p += '\0'
- return ''.join(p)
+ return p
def charmapencode_output(c,mapping):
@@ -1160,7 +1156,7 @@
#/* done with this character => adjust input position */
inpos+=1
- return ''.join(res)
+ return res
def PyUnicode_DecodeCharmap(s, size, mapping, errors):
@@ -1199,7 +1195,7 @@
## s,inpos,inpos+1)
## p += x[0]
inpos +=1
- return u''.join(p)
+ return p
def PyUnicode_DecodeRawUnicodeEscape(s, size,errors):
@@ -1271,4 +1267,4 @@
else:
p += unichr(x)
- return u''.join(p)
\ No newline at end of file
+ return p
\ No newline at end of file
More information about the Pypy-commit
mailing list