[Email-SIG] use_case.py

PyTonic pytonic at i2pmail.org
Mon May 25 23:33:21 CEST 2015


On 05/25/2015 07:04 PM, PyTonic wrote:
> I will post another message containing a simple use case for the new
> interface which only streams, decodes and stores base64 encoded
> payloads on the fly and uses the old method for everything else. It
> additionally uses two more callables inside its Message subclass:
> get_payload_file() and is_streamed().
>
> It also contains some comments about unresolved issues, such as how
> decoding errors should be properly dealt with and who is responsible
> for catching exceptions raised by the new interfaces so they can't
> break the FeedParser itself.

Attached as use_case.txt
-------------- next part --------------
from email import message
from feedparser_stream import FeedParser

class OldMessageList(message.Message):
  '''
    Drop-in equivalent of message.Message for the chunked payload
    interface: payload lines are collected in a list and joined into
    a single string once the payload is complete.
  '''

  def start_payload_chunks(self):
    ''' Prepare an empty list that will receive the payload lines. '''
    self._payload_buffer = []

  def append_payload_chunk(self, line):
    ''' Remember one more payload line, in arrival order. '''
    chunks = self._payload_buffer
    chunks.append(line)

  def finalize_payload(self):
    ''' Join the collected lines, store them as payload, drop the list. '''
    joined = ''.join(self._payload_buffer)
    self.set_payload(joined)
    del self._payload_buffer


from cStringIO import StringIO
class OldMessagecStringIO(message.Message):
  '''
    Drop-in equivalent of message.Message for the chunked payload
    interface, accumulating the payload in a StringIO object rather
    than in a list().
  '''

  def start_payload_chunks(self):
    ''' Open a fresh in-memory buffer for the upcoming payload lines. '''
    self._payload_buffer = StringIO()

  def append_payload_chunk(self, line):
    ''' Write one payload line to the in-memory buffer. '''
    buf = self._payload_buffer
    buf.write(line)

  def finalize_payload(self):
    ''' Store the buffered text as payload, then close and drop the buffer. '''
    buf = self._payload_buffer
    self.set_payload(buf.getvalue())
    buf.close()
    del self._payload_buffer


from binascii import a2b_base64
class StreamAndDecodeOnlyBase64Message(message.Message):
  '''
    message.Message subclass that decodes base64 payloads chunk by
    chunk into a file object and keeps every other payload in memory.
  '''

  def __init__(self, *args, **kwargs):
    '''
      This class does almost everything like the default
      message.Message class but will decode and store base64 payloads
      on the fly without storing the full payload in RAM (twice). If
      CTE is not set to base64 it should behave as usual.

      The optional keyword argument ``tmp_file_creator`` is a callable
      that receives this message object and returns a writable,
      seekable binary file object for the decoded payload. All other
      arguments are passed on to message.Message.

      For this to somewhat work:
      1) one should be able to pass kwargs to FeedParser().
         FeedParser then passes those to the supplied factory class
         when creating new message objects.
      2) FeedParser has to call the message object to add new payload
         lines instead of adding them locally and then setting a str.

      See changes in feedparser.py for an experimental version.

      There are some things left out like catching decoding
      exceptions and adding those to defects. It's also unclear how to
      proceed in such a situation. Currently message.Message silently
      delivers the encoded parts in get_payload() if decoding fails.
      This is not the right thing to do if a user requests decoding.

      There is also no check if start_payload_chunks() was actually
      called before appending new lines.
    '''
    # NOTE(review): the fallback opens a fixed path in 'r+b' mode, which
    # neither creates nor truncates the file; real callers should always
    # pass their own tmp_file_creator.
    self._create_tmp_file = kwargs.pop(
      'tmp_file_creator', lambda msg: open('/tmp/bad_fallback', 'r+b')
    )
    message.Message.__init__(self, *args, **kwargs)
    # Lets is_streamed() answer False before any payload was started.
    self._is_base64 = False

  def start_payload_chunks(self):
    '''
      Choose the storage for the upcoming payload lines.

      Base64 parts (per the Content-Transfer-Encoding header) get a file
      object from the tmp_file_creator callable; everything else is
      buffered in a list. append_payload_chunk is bound on the instance
      to the matching handler.
    '''
    if self.get('content-transfer-encoding', '').lower() == 'base64':
      assert(callable(self._create_tmp_file))
      self._payload_file = self._create_tmp_file(self)
      # Remember where the decoded data starts so it can be rewound to.
      self._payload_file_start = self._payload_file.tell()
      self._left_over = ''
      self._is_base64 = True
      self.append_payload_chunk = self._append_payload_chunk_file
    else:
      self._payload_buffer = list()
      self._is_base64 = False
      self.append_payload_chunk = self._append_payload_chunk_memory

  def _append_payload_chunk_memory(self, line):
    ''' Buffer one payload line of a non-base64 part in memory. '''
    self._payload_buffer.append(line)

  def _append_payload_chunk_file(self, line):
    '''
      Decode one base64 payload line and write it to the payload file.

      Only a multiple of four characters can be decoded at once, so
      any trailing remainder is carried over and prepended to the next
      line.
    '''
    line = self._left_over + line.rstrip()
    # Split at the last 4-character boundary. The remainder has to be
    # taken from the full line, before the decodable part is cut off.
    cut = len(line) - (len(line) % 4)
    self._left_over = line[cut:]
    self._payload_file.write(a2b_base64(line[:cut]))

  def finalize_payload(self):
    '''
      Finish the payload.

      Non-base64 parts get their buffered lines joined and stored via
      set_payload(). For base64 parts the payload file is rewound to
      the start of the decoded data and the in-memory payload is set to
      an empty string.
    '''
    if not self._is_base64:
      self.set_payload(''.join(self._payload_buffer))
      del self._payload_buffer
    else:
      # It is unclear to me how get_payload() could
      # be modified to deliver either a filename or
      # File object in this case without breaking
      # existing code. Regardless of this it should
      # *not* hold the full decoded content in memory.
      #
      # len(self._left_over) > 0 should raise a decoding exception
      # which should be added to defects. Not sure where, here or
      # within FeedParser(). See last part of __init__ comment.
      self._payload_file.seek(self._payload_file_start)
      self.set_payload('')

  def is_streamed(self):
    ''' Return True if the payload was decoded into the payload file. '''
    return self._is_base64

  def get_payload_file(self):
    ''' Return the file object holding the decoded base64 payload. '''
    assert(self._is_base64)
    return self._payload_file


if __name__ == '__main__':
  # Demo driver: parse the MIME message named by sys.argv[1] once with
  # a default FeedParser and once with the streaming message class,
  # printing an MD5 checksum for every part.
  import sys
  import tempfile
  from hashlib import md5

  def _create_temporary_file(msg):
    '''
      Just some test dummy.

      Every streamed part gets its own empty scratch file, so parts
      cannot overwrite each other and no stale bytes from an earlier
      run end up in the checksum.
    '''
    return tempfile.TemporaryFile()

  def show_parts(msg, level=0):
    '''
      Just some debugging dummy.

      Prints one line per message part (recursing into multiparts)
      with content type, charset, transfer encoding and the MD5
      checksum of the decoded payload.
    '''
    _fmt = "{sp:\t<{lvl}}{part}: {mime} as {charset} via {encoding}:\t{hash}"
    _fmt_kw = {
            'sp': '',
           'lvl': level,
          'mime': msg.get_content_type(),
       'charset': msg.get_content_charset(),
      'encoding': msg.get('content-transfer-encoding', 'unknown')
    }
    if msg.is_multipart():
      print(_fmt.format(part='multipart', hash='', **_fmt_kw))
      for part in msg.get_payload():
        show_parts(part, level=level+1)
      return

    if hasattr(msg, 'is_streamed') and msg.is_streamed():
      _checksum = md5()
      # Read in a multiple of the digest block size; a single block per
      # read would mean a lot of tiny reads on large payloads.
      _read_size = _checksum.block_size * 1024
      # Leaving the with block closes the payload file.
      with msg.get_payload_file() as _payload_file:
        _chunk = _payload_file.read(_read_size)
        while _chunk:
          _checksum.update(_chunk)
          _chunk = _payload_file.read(_read_size)
      _checksum = _checksum.hexdigest() + ' (streamed)'
    else:
      _checksum = md5(msg.get_payload(decode=True)).hexdigest()
    print(_fmt.format(part='single part', hash=_checksum, **_fmt_kw))


  # Init two parser instances
  stream_parser = FeedParser(
    _factory=StreamAndDecodeOnlyBase64Message,
    tmp_file_creator=_create_temporary_file
  )
  default_parser = FeedParser()

  # And test
  for name, parser in (('default', default_parser), ('stream', stream_parser)):
    print("\nUsing %s parser:" % name)
    with open(sys.argv[1], 'rb') as some_largish_mime_message:
      for line in some_largish_mime_message:
        parser.feed(line)
    msg = parser.close()
    show_parts(msg, level=1)


More information about the Email-SIG mailing list