[Python-checkins] gh-106628: email parsing speedup (gh-106629)

Thu Jul 13 02:13:00 EDT 2023

https://github.com/python/cpython/commit/7e6ce48872fa3de98c986057764f35e1b2f4b936
commit: 7e6ce48872fa3de98c986057764f35e1b2f4b936
branch: main
author: CF Bolz-Tereick <cfbolz at gmx.de>
committer: corona10 <donghee.na92 at gmail.com>
date: 2023-07-13T15:12:56+09:00
summary:

gh-106628: email parsing speedup (gh-106629)

files:
A Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
M Lib/email/feedparser.py

diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 885097c7dda06..53d71f5022515 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -37,6 +37,8 @@
 headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
 EMPTYSTRING = ''
 NL = '\n'
+boundaryendRE = re.compile(
+    r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
 
 NeedMoreData = object()
 
@@ -327,9 +329,10 @@ def _parsegen(self):
             # this onto the input stream until we've scanned past the
             # preamble.
             separator = '--' + boundary
-            boundaryre = re.compile(
-                '(?P<sep>' + re.escape(separator) +
-                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+            def boundarymatch(line):
+                if not line.startswith(separator):
+                    return None
+                return boundaryendRE.match(line, len(separator))
             capturing_preamble = True
             preamble = []
             linesep = False
@@ -341,7 +344,7 @@ def _parsegen(self):
                     continue
                 if line == '':
                     break
-                mo = boundaryre.match(line)
+                mo = boundarymatch(line)
                 if mo:
                     # If we're looking at the end boundary, we're done with
                     # this multipart.  If there was a newline at the end of
@@ -373,13 +376,13 @@ def _parsegen(self):
                         if line is NeedMoreData:
                             yield NeedMoreData
                             continue
-                        mo = boundaryre.match(line)
+                        mo = boundarymatch(line)
                         if not mo:
                             self._input.unreadline(line)
                             break
                     # Recurse to parse this subpart; the input stream points
                     # at the subpart's first line.
-                    self._input.push_eof_matcher(boundaryre.match)
+                    self._input.push_eof_matcher(boundarymatch)
                     for retval in self._parsegen():
                         if retval is NeedMoreData:
                             yield NeedMoreData
diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
new file mode 100644
index 0000000000000..6fa276e901f64
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
@@ -0,0 +1,2 @@
+Speed up parsing of emails by about 20% by not compiling a new regular
+expression for every single email.