[Python-checkins] gh-106628: email parsing speedup (gh-106629)
corona10
webhook-mailer at python.org
Thu Jul 13 02:13:00 EDT 2023
https://github.com/python/cpython/commit/7e6ce48872fa3de98c986057764f35e1b2f4b936
commit: 7e6ce48872fa3de98c986057764f35e1b2f4b936
branch: main
author: CF Bolz-Tereick <cfbolz at gmx.de>
committer: corona10 <donghee.na92 at gmail.com>
date: 2023-07-13T15:12:56+09:00
summary:
gh-106628: email parsing speedup (gh-106629)
files:
A Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
M Lib/email/feedparser.py
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 885097c7dda06..53d71f5022515 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -37,6 +37,8 @@
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'
+boundaryendRE = re.compile(
+ r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
NeedMoreData = object()
@@ -327,9 +329,10 @@ def _parsegen(self):
# this onto the input stream until we've scanned past the
# preamble.
separator = '--' + boundary
- boundaryre = re.compile(
- '(?P<sep>' + re.escape(separator) +
- r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+ def boundarymatch(line):
+ if not line.startswith(separator):
+ return None
+ return boundaryendRE.match(line, len(separator))
capturing_preamble = True
preamble = []
linesep = False
@@ -341,7 +344,7 @@ def _parsegen(self):
continue
if line == '':
break
- mo = boundaryre.match(line)
+ mo = boundarymatch(line)
if mo:
# If we're looking at the end boundary, we're done with
# this multipart. If there was a newline at the end of
@@ -373,13 +376,13 @@ def _parsegen(self):
if line is NeedMoreData:
yield NeedMoreData
continue
- mo = boundaryre.match(line)
+ mo = boundarymatch(line)
if not mo:
self._input.unreadline(line)
break
# Recurse to parse this subpart; the input stream points
# at the subpart's first line.
- self._input.push_eof_matcher(boundaryre.match)
+ self._input.push_eof_matcher(boundarymatch)
for retval in self._parsegen():
if retval is NeedMoreData:
yield NeedMoreData
diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
new file mode 100644
index 0000000000000..6fa276e901f64
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
@@ -0,0 +1,2 @@
+Speed up parsing of emails by about 20% by not compiling a new regular
+expression for every single email.
More information about the Python-checkins mailing list