[Python-checkins] python/dist/src/Lib/email Header.py,1.4,1.5

bwarsaw@users.sourceforge.net bwarsaw@users.sourceforge.net
Fri, 28 Jun 2002 16:46:56 -0700

Update of /cvsroot/python/python/dist/src/Lib/email
In directory usw-pr-cvs1:/tmp/cvs-serv18840/email

Modified Files:
	Header.py
Log Message:
Teach this class about "highest-level syntactic breaks" but only for
headers with no charset or 'us-ascii' charsets.  Actually this is only
partially true: we know about semicolons (but not true parameters) and
we know about whitespace (but not technically folding whitespace).
Still it should be good enough for all practical purposes.

Other changes include:

__init__(): Add a continuation_ws argument, which defaults to a single
space.  Set this to change the whitespace used for continuation lines
when a header must be split.  Also, changed the way header line
lengths are calculated, so that they take into account the width of
continuation_ws (with each tab expanded to 8 columns) and any provided
header_name parameter.  This should do much better at returning split
headers for which the first and subsequent lines must fit into a
specified width.

guess_maxlinelen(): Removed.  I don't think we need this method as
part of the public API.

encode_chunks() -> _encode_chunks(): I don't think we need this one as
part of the public API either.

Index: Header.py
RCS file: /cvsroot/python/python/dist/src/Lib/email/Header.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** Header.py	1 Jun 2002 05:49:17 -0000	1.4
--- Header.py	28 Jun 2002 23:46:53 -0000	1.5
*** 17,21 ****
  CRLFSPACE = '\r\n '
  CRLF = '\r\n'
! NLSPACE = '\n '
--- 17,23 ----
  CRLFSPACE = '\r\n '
  CRLF = '\r\n'
! NL = '\n'
! SPACE8 = ' ' * 8
*** 93,101 ****
  class Header:
!     def __init__(self, s, charset=None, maxlinelen=None, header_name=None):
          """Create a MIME-compliant header that can contain many languages.
          Specify the initial header value in s.  Specify its character set as a
!         Charset object in the charset argument.  If none, a default Charset
          instance will be used.
--- 95,104 ----
  class Header:
!     def __init__(self, s, charset=None, maxlinelen=None, header_name=None,
!                  continuation_ws=' '):
          """Create a MIME-compliant header that can contain many languages.
          Specify the initial header value in s.  Specify its character set as a
!         Charset object in the charset argument.  If None, a default Charset
          instance will be used.
*** 105,126 ****
          charset specified in the constructor.
!         The maximum line length can be specified explicitly via maxlinelen.
!         You can also pass None for maxlinelen and the name of a header field
!         (e.g. "Subject") to let the constructor guess the best line length to
!         use.  The default maxlinelen is 76.
          if charset is None:
              charset = Charset()
          self._charset = charset
          # BAW: I believe `chunks' and `maxlinelen' should be non-public.
          self._chunks = []
          self.append(s, charset)
          if maxlinelen is None:
!             if header_name is None:
!                 self._maxlinelen = MAXLINELEN
!             else:
!                 self.guess_maxlinelen(header_name)
!             self._maxlinelen = maxlinelen
      def __str__(self):
--- 108,141 ----
          charset specified in the constructor.
!         The maximum line length can be specified explicit via maxlinelen.  For
!         splitting the first line to a shorter value (to account for the field
!         header which isn't included in s, e.g. `Subject') pass in the name of
!         the field in header_name.  The default maxlinelen is 76.
!         continuation_ws must be RFC 2822 compliant folding whitespace (usually
!         either a space or a hard tab) which will be prepended to continuation
!         lines.
          if charset is None:
              charset = Charset()
          self._charset = charset
+         self._continuation_ws = continuation_ws
+         cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
          # BAW: I believe `chunks' and `maxlinelen' should be non-public.
          self._chunks = []
          self.append(s, charset)
          if maxlinelen is None:
!             maxlinelen = MAXLINELEN
!         if header_name is None:
!             # We don't know anything about the field header so the first line
!             # is the same length as subsequent lines.
!             self._firstlinelen = maxlinelen
!             # The first line should be shorter to take into account the field
!             # header.  Also subtract off 2 extra for the colon and space.
!             self._firstlinelen = maxlinelen - len(header_name) - 2
!         # Second and subsequent lines should subtract off the length in
!         # columns of the continuation whitespace prefix.
!         self._maxlinelen = maxlinelen - cws_expanded_len
      def __str__(self):
*** 128,145 ****
          return self.encode()
-     def guess_maxlinelen(self, s=None):
-         """Guess the maximum length to make each header line.
-         Given a header name (e.g. "Subject"), set this header's maximum line
-         length to an appropriate length to avoid line wrapping.  If s is not
-         given, return the previous maximum line length and don't set it.
-         Returns the new maximum line length.
-         """
-         # BAW: is this semantic necessary?
-         if s is not None:
-             self._maxlinelen = MAXLINELEN - len(s) - 2
-         return self._maxlinelen
      def append(self, s, charset=None):
          """Append string s with Charset charset to the MIME header.
--- 143,146 ----
*** 151,155 ****
          self._chunks.append((s, charset))
!     def _split(self, s, charset):
          # Split up a header safely for use with encode_chunks.  BAW: this
          # appears to be a private convenience method.
--- 152,156 ----
          self._chunks.append((s, charset))
!     def _split(self, s, charset, firstline=0):
          # Split up a header safely for use with encode_chunks.  BAW: this
          # appears to be a private convenience method.
*** 160,163 ****
--- 161,178 ----
          if elen <= self._maxlinelen:
              return [(encoded, charset)]
+         # BAW: I'm not sure what the right test here is.  What we're trying to
+         # do is be faithful to RFC 2822's recommendation that ($2.2.3):
+         #
+         # "Note: Though structured field bodies are defined in such a way that
+         #  folding can take place between many of the lexical tokens (and even
+         #  within some of the lexical tokens), folding SHOULD be limited to
+         #  placing the CRLF at higher-level syntactic breaks."
+         #
+         # For now, I can only imagine doing this when the charset is us-ascii,
+         # although it's possible that other charsets may also benefit from the
+         # higher-level syntactic breaks.
+         #
+         elif charset == 'us-ascii':
+             return self._ascii_split(s, charset, firstline)
          # BAW: should we use encoded?
          elif elen == len(s):
*** 167,171 ****
              first = charset.from_splittable(splittable[:splitpnt], 0)
              last = charset.from_splittable(splittable[splitpnt:], 0)
-             return self._split(first, charset) + self._split(last, charset)
              # Divide and conquer.
--- 182,185 ----
*** 173,203 ****
              first = charset.from_splittable(splittable[:halfway], 0)
              last = charset.from_splittable(splittable[halfway:], 0)
!             return self._split(first, charset) + self._split(last, charset)
!     def encode(self):
!         """Encode a message header, possibly converting charset and encoding.
!         There are many issues involved in converting a given string for use in
!         an email header.  Only certain character sets are readable in most
!         email clients, and as header strings can only contain a subset of
!         7-bit ASCII, care must be taken to properly convert and encode (with
!         Base64 or quoted-printable) header strings.  In addition, there is a
!         75-character length limit on any given encoded header field, so
!         line-wrapping must be performed, even with double-byte character sets.
!         This method will do its best to convert the string to the correct
!         character set used in email, and encode and line wrap it safely with
!         the appropriate scheme for that character set.
!         If the given charset is not known or an error occurs during
!         conversion, this function will return the header untouched.
!         """
!         newchunks = []
!         for s, charset in self._chunks:
!             newchunks += self._split(s, charset)
!         self._chunks = newchunks
!         return self.encode_chunks()
!     def encode_chunks(self):
          """MIME-encode a header with many different charsets and/or encodings.
--- 187,270 ----
              first = charset.from_splittable(splittable[:halfway], 0)
              last = charset.from_splittable(splittable[halfway:], 0)
!         # Do the split
!         return self._split(first, charset, firstline) + \
!                self._split(last, charset)
!     def _ascii_split(self, s, charset, firstline):
!         # Attempt to split the line at the highest-level syntactic break
!         # possible.  Note that we don't have a lot of smarts about field
!         # syntax; we just try to break on semi-colons, then whitespace.
!         rtn = []
!         lines = s.splitlines()
!         while lines:
!             line = lines.pop(0)
!             if firstline:
!                 maxlinelen = self._firstlinelen
!                 firstline = 0
!             else:
!                 line = line.lstrip()
!                 maxlinelen = self._maxlinelen
!             # Short lines can remain unchanged
!             if len(line.replace('\t', SPACE8)) <= maxlinelen:
!                 rtn.append(line)
!             else:
!                 oldlen = len(line)
!                 # Try to break the line on semicolons, but if that doesn't
!                 # work, try to split on folding whitespace.
!                 while len(line) > maxlinelen:
!                     i = line.rfind(';', 0, maxlinelen)
!                     if i < 0:
!                         break
!                     rtn.append(line[:i] + ';')
!                     line = line[i+1:]
!                 # Is the remaining stuff still longer than maxlinelen?
!                 if len(line) <= maxlinelen:
!                     # Splitting on semis worked
!                     rtn.append(line)
!                     continue
!                 # Splitting on semis didn't finish the job.  If it did any
!                 # work at all, stick the remaining junk on the front of the
!                 # `lines' sequence and let the next pass do its thing.
!                 if len(line) <> oldlen:
!                     lines.insert(0, line)
!                     continue
!                 # Otherwise, splitting on semis didn't help at all.
!                 parts = re.split(r'(\s+)', line)
!                 if len(parts) == 1 or (len(parts) == 3 and
!                                        parts[0].endswith(':')):
!                     # This line can't be split on whitespace.  There's now
!                     # little we can do to get this into maxlinelen.  BAW:
!                     # We're still potentially breaking the RFC by possibly
!                     # allowing lines longer than the absolute maximum of 998
!                     # characters.  For now, let it slide.
!                     #
!                     # len(parts) will be 1 if this line has no `Field: '
!                     # prefix, otherwise it will be len(3).
!                     rtn.append(line)
!                     continue
!                 # There is whitespace we can split on.
!                 first = parts.pop(0)
!                 sublines = [first]
!                 acc = len(first)
!                 while parts:
!                     len0 = len(parts[0])
!                     len1 = len(parts[1])
!                     if acc + len0 + len1 <= maxlinelen:
!                         sublines.append(parts.pop(0))
!                         sublines.append(parts.pop(0))
!                         acc += len0 + len1
!                     else:
!                         # Split it here, but don't forget to ignore the
!                         # next whitespace-only part
!                         if first <> '':
!                             rtn.append(EMPTYSTRING.join(sublines))
!                         del parts[0]
!                         first = parts.pop(0)
!                         sublines = [first]
!                         acc = len(first)
!                 rtn.append(EMPTYSTRING.join(sublines))
!         return [(chunk, charset) for chunk in rtn]
!     def _encode_chunks(self):
          """MIME-encode a header with many different charsets and/or encodings.
*** 220,227 ****
          chunks = []
          for header, charset in self._chunks:
!             if charset is None:
!                 _max_append(chunks, header, self._maxlinelen, ' ')
                  _max_append(chunks, charset.header_encode(header, 0),
                              self._maxlinelen, ' ')
!         return NLSPACE.join(chunks)
--- 287,320 ----
          chunks = []
          for header, charset in self._chunks:
!             if charset is None or charset.header_encoding is None:
!                 # There's no encoding for this chunk's charsets
!                 _max_append(chunks, header, self._maxlinelen)
                  _max_append(chunks, charset.header_encode(header, 0),
                              self._maxlinelen, ' ')
!         joiner = NL + self._continuation_ws
!         return joiner.join(chunks)
!     def encode(self):
!         """Encode a message header, possibly converting charset and encoding.
!         There are many issues involved in converting a given string for use in
!         an email header.  Only certain character sets are readable in most
!         email clients, and as header strings can only contain a subset of
!         7-bit ASCII, care must be taken to properly convert and encode (with
!         Base64 or quoted-printable) header strings.  In addition, there is a
!         75-character length limit on any given encoded header field, so
!         line-wrapping must be performed, even with double-byte character sets.
!         This method will do its best to convert the string to the correct
!         character set used in email, and encode and line wrap it safely with
!         the appropriate scheme for that character set.
!         If the given charset is not known or an error occurs during
!         conversion, this function will return the header untouched.
!         """
!         newchunks = []
!         for s, charset in self._chunks:
!             newchunks += self._split(s, charset, 1)
!         self._chunks = newchunks
!         return self._encode_chunks()