[Python-checkins] python/dist/src/Lib/email Header.py,1.4,1.5
bwarsaw@users.sourceforge.net
bwarsaw@users.sourceforge.net
Fri, 28 Jun 2002 16:46:56 -0700
Update of /cvsroot/python/python/dist/src/Lib/email
In directory usw-pr-cvs1:/tmp/cvs-serv18840/email
Modified Files:
Header.py
Log Message:
Teach this class about "higher-level syntactic breaks" (RFC 2822, section
2.2.3), but only for headers with no charset or the 'us-ascii' charset.
Strictly speaking this is only partially true: we know about semicolons (but
not true parameters) and we know about whitespace (but not technically
folding whitespace). Still, it should be good enough for all practical purposes.
Other changes include:
__init__(): Add a continuation_ws argument, which defaults to a single
space. Set this to change the whitespace used to prefix continuation lines
when a header must be split. Also change the way header line lengths are
calculated, so that they take into account continuation_ws (with tabs
expanded to 8 spaces) and any provided header_name parameter. This should
do a much better job of returning split headers whose first and subsequent
lines must all fit into a specified width.
guess_maxlinelen(): Removed. I don't think we need this method as
part of the public API.
encode_chunks() -> _encode_chunks(): I don't think we need this one as
part of the public API either.
Index: Header.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Header.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** Header.py 1 Jun 2002 05:49:17 -0000 1.4
--- Header.py 28 Jun 2002 23:46:53 -0000 1.5
***************
*** 17,21 ****
CRLFSPACE = '\r\n '
CRLF = '\r\n'
! NLSPACE = '\n '
MAXLINELEN = 76
--- 17,23 ----
CRLFSPACE = '\r\n '
CRLF = '\r\n'
! NL = '\n'
! SPACE8 = ' ' * 8
! EMPTYSTRING = ''
MAXLINELEN = 76
***************
*** 93,101 ****
class Header:
! def __init__(self, s, charset=None, maxlinelen=None, header_name=None):
"""Create a MIME-compliant header that can contain many languages.
Specify the initial header value in s. Specify its character set as a
! Charset object in the charset argument. If none, a default Charset
instance will be used.
--- 95,104 ----
class Header:
! def __init__(self, s, charset=None, maxlinelen=None, header_name=None,
! continuation_ws=' '):
"""Create a MIME-compliant header that can contain many languages.
Specify the initial header value in s. Specify its character set as a
! Charset object in the charset argument. If None, a default Charset
instance will be used.
***************
*** 105,126 ****
charset specified in the constructor.
! The maximum line length can be specified explicitly via maxlinelen.
! You can also pass None for maxlinelen and the name of a header field
! (e.g. "Subject") to let the constructor guess the best line length to
! use. The default maxlinelen is 76.
"""
if charset is None:
charset = Charset()
self._charset = charset
# BAW: I believe `chunks' and `maxlinelen' should be non-public.
self._chunks = []
self.append(s, charset)
if maxlinelen is None:
! if header_name is None:
! self._maxlinelen = MAXLINELEN
! else:
! self.guess_maxlinelen(header_name)
else:
! self._maxlinelen = maxlinelen
def __str__(self):
--- 108,141 ----
charset specified in the constructor.
! The maximum line length can be specified explicit via maxlinelen. For
! splitting the first line to a shorter value (to account for the field
! header which isn't included in s, e.g. `Subject') pass in the name of
! the field in header_name. The default maxlinelen is 76.
!
! continuation_ws must be RFC 2822 compliant folding whitespace (usually
! either a space or a hard tab) which will be prepended to continuation
! lines.
"""
if charset is None:
charset = Charset()
self._charset = charset
+ self._continuation_ws = continuation_ws
+ cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
# BAW: I believe `chunks' and `maxlinelen' should be non-public.
self._chunks = []
self.append(s, charset)
if maxlinelen is None:
! maxlinelen = MAXLINELEN
! if header_name is None:
! # We don't know anything about the field header so the first line
! # is the same length as subsequent lines.
! self._firstlinelen = maxlinelen
else:
! # The first line should be shorter to take into account the field
! # header. Also subtract off 2 extra for the colon and space.
! self._firstlinelen = maxlinelen - len(header_name) - 2
! # Second and subsequent lines should subtract off the length in
! # columns of the continuation whitespace prefix.
! self._maxlinelen = maxlinelen - cws_expanded_len
def __str__(self):
***************
*** 128,145 ****
return self.encode()
- def guess_maxlinelen(self, s=None):
- """Guess the maximum length to make each header line.
-
- Given a header name (e.g. "Subject"), set this header's maximum line
- length to an appropriate length to avoid line wrapping. If s is not
- given, return the previous maximum line length and don't set it.
-
- Returns the new maximum line length.
- """
- # BAW: is this semantic necessary?
- if s is not None:
- self._maxlinelen = MAXLINELEN - len(s) - 2
- return self._maxlinelen
-
def append(self, s, charset=None):
"""Append string s with Charset charset to the MIME header.
--- 143,146 ----
***************
*** 151,155 ****
self._chunks.append((s, charset))
! def _split(self, s, charset):
# Split up a header safely for use with encode_chunks. BAW: this
# appears to be a private convenience method.
--- 152,156 ----
self._chunks.append((s, charset))
! def _split(self, s, charset, firstline=0):
# Split up a header safely for use with encode_chunks. BAW: this
# appears to be a private convenience method.
***************
*** 160,163 ****
--- 161,178 ----
if elen <= self._maxlinelen:
return [(encoded, charset)]
+ # BAW: I'm not sure what the right test here is. What we're trying to
+ # do is be faithful to RFC 2822's recommendation that ($2.2.3):
+ #
+ # "Note: Though structured field bodies are defined in such a way that
+ # folding can take place between many of the lexical tokens (and even
+ # within some of the lexical tokens), folding SHOULD be limited to
+ # placing the CRLF at higher-level syntactic breaks."
+ #
+ # For now, I can only imagine doing this when the charset is us-ascii,
+ # although it's possible that other charsets may also benefit from the
+ # higher-level syntactic breaks.
+ #
+ elif charset == 'us-ascii':
+ return self._ascii_split(s, charset, firstline)
# BAW: should we use encoded?
elif elen == len(s):
***************
*** 167,171 ****
first = charset.from_splittable(splittable[:splitpnt], 0)
last = charset.from_splittable(splittable[splitpnt:], 0)
- return self._split(first, charset) + self._split(last, charset)
else:
# Divide and conquer.
--- 182,185 ----
***************
*** 173,203 ****
first = charset.from_splittable(splittable[:halfway], 0)
last = charset.from_splittable(splittable[halfway:], 0)
! return self._split(first, charset) + self._split(last, charset)
!
! def encode(self):
! """Encode a message header, possibly converting charset and encoding.
!
! There are many issues involved in converting a given string for use in
! an email header. Only certain character sets are readable in most
! email clients, and as header strings can only contain a subset of
! 7-bit ASCII, care must be taken to properly convert and encode (with
! Base64 or quoted-printable) header strings. In addition, there is a
! 75-character length limit on any given encoded header field, so
! line-wrapping must be performed, even with double-byte character sets.
!
! This method will do its best to convert the string to the correct
! character set used in email, and encode and line wrap it safely with
! the appropriate scheme for that character set.
! If the given charset is not known or an error occurs during
! conversion, this function will return the header untouched.
! """
! newchunks = []
! for s, charset in self._chunks:
! newchunks += self._split(s, charset)
! self._chunks = newchunks
! return self.encode_chunks()
! def encode_chunks(self):
"""MIME-encode a header with many different charsets and/or encodings.
--- 187,270 ----
first = charset.from_splittable(splittable[:halfway], 0)
last = charset.from_splittable(splittable[halfway:], 0)
! # Do the split
! return self._split(first, charset, firstline) + \
! self._split(last, charset)
! def _ascii_split(self, s, charset, firstline):
! # Attempt to split the line at the highest-level syntactic break
! # possible. Note that we don't have a lot of smarts about field
! # syntax; we just try to break on semi-colons, then whitespace.
! rtn = []
! lines = s.splitlines()
! while lines:
! line = lines.pop(0)
! if firstline:
! maxlinelen = self._firstlinelen
! firstline = 0
! else:
! line = line.lstrip()
! maxlinelen = self._maxlinelen
! # Short lines can remain unchanged
! if len(line.replace('\t', SPACE8)) <= maxlinelen:
! rtn.append(line)
! else:
! oldlen = len(line)
! # Try to break the line on semicolons, but if that doesn't
! # work, try to split on folding whitespace.
! while len(line) > maxlinelen:
! i = line.rfind(';', 0, maxlinelen)
! if i < 0:
! break
! rtn.append(line[:i] + ';')
! line = line[i+1:]
! # Is the remaining stuff still longer than maxlinelen?
! if len(line) <= maxlinelen:
! # Splitting on semis worked
! rtn.append(line)
! continue
! # Splitting on semis didn't finish the job. If it did any
! # work at all, stick the remaining junk on the front of the
! # `lines' sequence and let the next pass do its thing.
! if len(line) <> oldlen:
! lines.insert(0, line)
! continue
! # Otherwise, splitting on semis didn't help at all.
! parts = re.split(r'(\s+)', line)
! if len(parts) == 1 or (len(parts) == 3 and
! parts[0].endswith(':')):
! # This line can't be split on whitespace. There's now
! # little we can do to get this into maxlinelen. BAW:
! # We're still potentially breaking the RFC by possibly
! # allowing lines longer than the absolute maximum of 998
! # characters. For now, let it slide.
! #
! # len(parts) will be 1 if this line has no `Field: '
! # prefix, otherwise it will be len(3).
! rtn.append(line)
! continue
! # There is whitespace we can split on.
! first = parts.pop(0)
! sublines = [first]
! acc = len(first)
! while parts:
! len0 = len(parts[0])
! len1 = len(parts[1])
! if acc + len0 + len1 <= maxlinelen:
! sublines.append(parts.pop(0))
! sublines.append(parts.pop(0))
! acc += len0 + len1
! else:
! # Split it here, but don't forget to ignore the
! # next whitespace-only part
! if first <> '':
! rtn.append(EMPTYSTRING.join(sublines))
! del parts[0]
! first = parts.pop(0)
! sublines = [first]
! acc = len(first)
! rtn.append(EMPTYSTRING.join(sublines))
! return [(chunk, charset) for chunk in rtn]
! def _encode_chunks(self):
"""MIME-encode a header with many different charsets and/or encodings.
***************
*** 220,227 ****
chunks = []
for header, charset in self._chunks:
! if charset is None:
! _max_append(chunks, header, self._maxlinelen, ' ')
else:
_max_append(chunks, charset.header_encode(header, 0),
self._maxlinelen, ' ')
! return NLSPACE.join(chunks)
--- 287,320 ----
chunks = []
for header, charset in self._chunks:
! if charset is None or charset.header_encoding is None:
! # There's no encoding for this chunk's charsets
! _max_append(chunks, header, self._maxlinelen)
else:
_max_append(chunks, charset.header_encode(header, 0),
self._maxlinelen, ' ')
! joiner = NL + self._continuation_ws
! return joiner.join(chunks)
!
! def encode(self):
! """Encode a message header, possibly converting charset and encoding.
!
! There are many issues involved in converting a given string for use in
! an email header. Only certain character sets are readable in most
! email clients, and as header strings can only contain a subset of
! 7-bit ASCII, care must be taken to properly convert and encode (with
! Base64 or quoted-printable) header strings. In addition, there is a
! 75-character length limit on any given encoded header field, so
! line-wrapping must be performed, even with double-byte character sets.
!
! This method will do its best to convert the string to the correct
! character set used in email, and encode and line wrap it safely with
! the appropriate scheme for that character set.
!
! If the given charset is not known or an error occurs during
! conversion, this function will return the header untouched.
! """
! newchunks = []
! for s, charset in self._chunks:
! newchunks += self._split(s, charset, 1)
! self._chunks = newchunks
! return self._encode_chunks()