[Email-SIG] use_case.py
PyTonic
pytonic at i2pmail.org
Mon May 25 23:33:21 CEST 2015
On 05/25/2015 07:04 PM, PyTonic wrote:
> I will post another message containing a simple use case for the new
> interface which only streams, decodes and stores base64 encoded
> payloads on the fly and uses the old method for everything else. It
> additionally uses two more callables inside its Message subclass:
> get_payload_file() and is_streamed().
>
> It also contains some comments about unresolved issues, such as how
> decoding errors should be properly dealt with and who is responsible
> for catching exceptions raised by the new interfaces so they can't
> break the FeedParser itself.
Attached as use_case.txt
-------------- next part --------------
from email import message
from feedparser_stream import FeedParser
class OldMessageList(message.Message):
    '''
    Same as message.Message: payload lines are gathered in a list.

    Implements the three chunk callables (start / append / finalize)
    and ends up storing the joined lines as a plain string payload.
    '''

    def start_payload_chunks(self):
        ''' Begin collecting a payload with an empty line buffer. '''
        self._payload_buffer = []

    def append_payload_chunk(self, line):
        ''' Remember one more payload line. '''
        self._payload_buffer += [line]

    def finalize_payload(self):
        ''' Join the collected lines into the payload, drop the buffer. '''
        joined = ''.join(self._payload_buffer)
        self.set_payload(joined)
        del self._payload_buffer
from cStringIO import StringIO
class OldMessagecStringIO(message.Message):
    '''
    Same as message.Message, but the payload lines are written into a
    StringIO object instead of being appended to a list().
    '''

    def start_payload_chunks(self):
        ''' Begin collecting a payload in a fresh StringIO buffer. '''
        self._payload_buffer = StringIO()

    def append_payload_chunk(self, line):
        ''' Write one more payload line into the buffer. '''
        self._payload_buffer.write(line)

    def finalize_payload(self):
        ''' Store the buffered text as payload, then close and drop it. '''
        buf = self._payload_buffer
        self.set_payload(buf.getvalue())
        buf.close()
        del self._payload_buffer
from binascii import a2b_base64
class StreamAndDecodeOnlyBase64Message(message.Message):
    '''
    Message that decodes base64 payloads into a file while parsing.

    This class does almost everything like the default message.Message
    class but will decode and store base64 payloads on the fly without
    storing the full payload in RAM (twice). If the
    Content-Transfer-Encoding is not base64 it should behave as usual.

    For this to somewhat work:

    1) one should be able to pass kwargs to FeedParser(). FeedParser
       then passes those to the supplied factory class when creating
       new message objects.
    2) FeedParser has to call the message object to add new payload
       lines instead of adding them locally and then setting a str.
       See changes in feedparser.py for an experimental version.

    Open issues: decoding exceptions are not caught and are not added
    to defects. It is also unclear how to proceed in such a situation.
    Currently message.Message silently delivers the encoded parts in
    get_payload() if decoding fails, which is not the right thing to do
    if a user requests decoding. There is also no check that
    start_payload_chunks() was actually called before new lines are
    appended.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Create the message.

        The optional keyword argument tmp_file_creator is a callable
        that receives this message and returns an open binary file
        object supporting tell(), seek() and write(). It is only
        called for base64 parts. All other arguments are handed to
        message.Message unchanged.
        '''
        # NOTE(review): the fallback opens one fixed path in 'r+b' mode,
        # so the file must already exist and is shared by every part.
        # Callers are expected to supply their own tmp_file_creator.
        self._create_tmp_file = kwargs.pop(
            'tmp_file_creator', lambda msg: open('/tmp/bad_fallback', 'r+b')
        )
        # Lets is_streamed() answer False instead of raising when
        # start_payload_chunks() has not been called for this part.
        self._is_base64 = False
        message.Message.__init__(self, *args, **kwargs)

    def start_payload_chunks(self):
        '''
        Prepare for payload lines and pick the matching append method.

        Base64 parts get a file from the tmp_file_creator callable,
        everything else gets an in-memory list. The chosen strategy is
        bound to self.append_payload_chunk on the instance.
        '''
        if self.get('content-transfer-encoding', '').lower() == 'base64':
            assert(callable(self._create_tmp_file))
            self._payload_file = self._create_tmp_file(self)
            # Remembered so finalize_payload() can rewind to the start
            # of the decoded data.
            self._payload_file_start = self._payload_file.tell()
            self._left_over = ''
            self._is_base64 = True
            self.append_payload_chunk = self._append_payload_chunk_file
        else:
            self._payload_buffer = list()
            self._is_base64 = False
            self.append_payload_chunk = self._append_payload_chunk_memory

    def _append_payload_chunk_memory(self, line):
        ''' Keep a non-base64 payload line in memory. '''
        self._payload_buffer.append(line)

    def _append_payload_chunk_file(self, line):
        '''
        Decode one base64 payload line and write it to the payload file.

        Only complete groups of four characters are decoded. Trailing
        characters of an incomplete group are kept in self._left_over
        and prepended to the next line.
        '''
        data = self._left_over + line.rstrip()
        remainder = len(data) % 4
        if remainder:
            # Split off the incomplete tail *before* truncating, so the
            # left-over really holds the characters that were dropped.
            data, self._left_over = data[:-remainder], data[-remainder:]
        else:
            self._left_over = ''
        self._payload_file.write(a2b_base64(data))

    def finalize_payload(self):
        '''
        Finish the payload.

        Non-base64 parts store the joined lines as a str payload.
        Base64 parts rewind the payload file to where the decoded data
        starts and set an empty str payload.
        '''
        if not self._is_base64:
            self.set_payload(''.join(self._payload_buffer))
            del self._payload_buffer
        else:
            # It is unclear how get_payload() could be modified to
            # deliver either a filename or a file object in this case
            # without breaking existing code. Regardless of this it
            # should *not* hold the full decoded content in memory.
            #
            # len(self._left_over) > 0 should raise a decoding exception
            # which should be added to defects. Not sure where: here or
            # within FeedParser(). See the open issues in the class
            # docstring.
            self._payload_file.seek(self._payload_file_start)
            self.set_payload('')

    def is_streamed(self):
        ''' Return True if the payload was decoded into a file. '''
        return self._is_base64

    def get_payload_file(self):
        ''' Return the file object holding the decoded base64 payload. '''
        assert(self._is_base64)
        return self._payload_file
if __name__ == '__main__':
    import sys
    from hashlib import md5

    def _create_temporary_file(msg):
        ''' Just some test dummy: always opens the same scratch file. '''
        # NOTE(review): 'r+b' does not create the file, so the path has
        # to exist already, and every streamed part receives this path.
        return open('/tmp/some.very_large_payload', 'r+b')

    def show_parts(msg, level=0):
        '''
        Just some debugging dummy.

        Prints one line per MIME part (content type, charset, transfer
        encoding and an MD5 of the decoded payload) and descends into
        multipart containers with an increased level.
        '''
        _fmt = "{sp:\t<{lvl}}{part}: {mime} as {charset} via {encoding}:\t{hash}"
        common = dict(
            sp='',
            lvl=level,
            mime=msg.get_content_type(),
            charset=msg.get_content_charset(),
            encoding=msg.get('content-transfer-encoding', 'unknown'),
        )

        if msg.is_multipart():
            print(_fmt.format(part='multipart', hash='', **common))
            for child in msg.get_payload():
                show_parts(child, level=level + 1)
            return

        if hasattr(msg, 'is_streamed') and msg.is_streamed():
            # Hash the decoded payload straight from its file, one
            # block at a time.
            digest = md5()
            with msg.get_payload_file() as payload_file:
                while True:
                    block = payload_file.read(digest.block_size)
                    if not block:
                        break
                    digest.update(block)
            summary = digest.hexdigest() + ' (streamed)'
        else:
            summary = md5(msg.get_payload(decode=True)).hexdigest()

        print(_fmt.format(part='single part', hash=summary, **common))

    # Init two parser instances
    stream_parser = FeedParser(
        _factory=StreamAndDecodeOnlyBase64Message,
        tmp_file_creator=_create_temporary_file
    )
    default_parser = FeedParser()

    # And test: parse the file named on the command line with each parser
    runs = (('default', default_parser), ('stream', stream_parser))
    for label, parser in runs:
        print("\nUsing %s parser:" % label)
        with open(sys.argv[1], 'rb') as mime_file:
            for raw_line in mime_file:
                parser.feed(raw_line)
        parsed = parser.close()
        show_parts(parsed, level=1)
More information about the Email-SIG
mailing list