[Python-checkins] bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930)

miss-islington webhook-mailer at python.org
Tue Mar 22 05:37:25 EDT 2022


https://github.com/python/cpython/commit/4f97d64c831c94660ceb01f34d51fa236ad968b0
commit: 4f97d64c831c94660ceb01f34d51fa236ad968b0
branch: main
author: Christian Heimes <christian at python.org>
committer: miss-islington <31488909+miss-islington at users.noreply.github.com>
date: 2022-03-22T02:37:00-07:00
summary:

bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930)

files:
A Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst
M Doc/library/hashlib.rst
M Lib/hashlib.py
M Lib/test/test_hashlib.py

diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst
index aa24131f8bf44..da97b0e9a74d1 100644
--- a/Doc/library/hashlib.rst
+++ b/Doc/library/hashlib.rst
@@ -228,6 +228,49 @@ by the SHAKE algorithm.
    exchange the value safely in email or other non-binary environments.
 
 
+File hashing
+------------
+
+The hashlib module provides a helper function for efficient hashing of
+a file or file-like object.
+
+.. function:: file_digest(fileobj, digest, /)
+
+   Return a digest object that has been updated with contents of file object.
+
+   *fileobj* must be a file-like object opened for reading in binary mode.
+   It accepts file objects from  builtin :func:`open`, :class:`~io.BytesIO`
+   instances, SocketIO objects from :meth:`socket.socket.makefile`, and
+   similar. The function may bypass Python's I/O and use the file descriptor
+   from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
+   in an unknown state after this function returns or raises. It is up to
+   the caller to close *fileobj*.
+
+   *digest* must either be a hash algorithm name as a *str*, a hash
+   constructor, or a callable that returns a hash object.
+
+   Example:
+
+      >>> import io, hashlib, hmac
+      >>> with open(hashlib.__file__, "rb") as f:
+      ...     digest = hashlib.file_digest(f, "sha256")
+      ...
+      >>> digest.hexdigest()  # doctest: +ELLIPSIS
+      '...'
+
+      >>> buf = io.BytesIO(b"somedata")
+      >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
+      >>> digest = hashlib.file_digest(buf, lambda: mac1)
+
+      >>> digest is mac1
+      True
+      >>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
+      >>> mac1.digest() == mac2.digest()
+      True
+
+   .. versionadded:: 3.11
+
+
 Key derivation
 --------------
 
diff --git a/Lib/hashlib.py b/Lib/hashlib.py
index 562501860a72b..b546a3fd79531 100644
--- a/Lib/hashlib.py
+++ b/Lib/hashlib.py
@@ -65,7 +65,7 @@
 algorithms_available = set(__always_supported)
 
 __all__ = __always_supported + ('new', 'algorithms_guaranteed',
-                                'algorithms_available', 'pbkdf2_hmac')
+                                'algorithms_available', 'pbkdf2_hmac', 'file_digest')
 
 
 __builtin_constructor_cache = {}
@@ -254,6 +254,52 @@ def prf(msg, inner=inner, outer=outer):
     pass
 
 
+def file_digest(fileobj, digest, /, *, _bufsize=2**18):
+    """Hash the contents of a file-like object. Returns a digest object.
+
+    *fileobj* must be a file-like object opened for reading in binary mode.
+    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
+    The function may bypass Python's I/O and use the file descriptor *fileno*
+    directly.
+
+    *digest* must either be a hash algorithm name as a *str*, a hash
+    constructor, or a callable that returns a hash object.
+    """
+    # On Linux we could use AF_ALG sockets and sendfile() to archive zero-copy
+    # hashing with hardware acceleration.
+    if isinstance(digest, str):
+        digestobj = new(digest)
+    else:
+        digestobj = digest()
+
+    if hasattr(fileobj, "getbuffer"):
+        # io.BytesIO object, use zero-copy buffer
+        digestobj.update(fileobj.getbuffer())
+        return digestobj
+
+    # Only binary files implement readinto().
+    if not (
+        hasattr(fileobj, "readinto")
+        and hasattr(fileobj, "readable")
+        and fileobj.readable()
+    ):
+        raise ValueError(
+            f"'{fileobj!r}' is not a file-like object in binary reading mode."
+        )
+
+    # binary file, socket.SocketIO object
+    # Note: socket I/O uses different syscalls than file I/O.
+    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
+    view = memoryview(buf)
+    while True:
+        size = fileobj.readinto(buf)
+        if size == 0:
+            break  # EOF
+        digestobj.update(view[:size])
+
+    return digestobj
+
+
 for __func_name in __always_supported:
     # try them all, some may not work due to the OpenSSL
     # version not supporting that algorithm.
diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py
index ea31f8be2cb82..daf6e3862a24f 100644
--- a/Lib/test/test_hashlib.py
+++ b/Lib/test/test_hashlib.py
@@ -10,6 +10,7 @@
 from binascii import unhexlify
 import hashlib
 import importlib
+import io
 import itertools
 import os
 import sys
@@ -20,6 +21,7 @@
 from test import support
 from test.support import _4G, bigmemtest
 from test.support.import_helper import import_fresh_module
+from test.support import os_helper
 from test.support import threading_helper
 from test.support import warnings_helper
 from http.client import HTTPException
@@ -371,6 +373,31 @@ def check(self, name, data, hexdigest, shake=False, **kwargs):
             if not shake:
                 self.assertEqual(len(digest), m.digest_size)
 
+        if not shake and kwargs.get("key") is None:
+            # skip shake and blake2 extended parameter tests
+            self.check_file_digest(name, data, hexdigest)
+
+    def check_file_digest(self, name, data, hexdigest):
+        hexdigest = hexdigest.lower()
+        digests = [name]
+        digests.extend(self.constructors_to_test[name])
+
+        with open(os_helper.TESTFN, "wb") as f:
+            f.write(data)
+
+        try:
+            for digest in digests:
+                buf = io.BytesIO(data)
+                buf.seek(0)
+                self.assertEqual(
+                    hashlib.file_digest(buf, digest).hexdigest(), hexdigest
+                )
+                with open(os_helper.TESTFN, "rb") as f:
+                    digestobj = hashlib.file_digest(f, digest)
+                self.assertEqual(digestobj.hexdigest(), hexdigest)
+        finally:
+            os.unlink(os_helper.TESTFN)
+
     def check_no_unicode(self, algorithm_name):
         # Unicode objects are not allowed as input.
         constructors = self.constructors_to_test[algorithm_name]
@@ -1117,6 +1144,33 @@ def test_normalized_name(self):
         self.assertNotIn("blake2b512", hashlib.algorithms_available)
         self.assertNotIn("sha3-512", hashlib.algorithms_available)
 
+    def test_file_digest(self):
+        data = b'a' * 65536
+        d1 = hashlib.sha256()
+        self.addCleanup(os.unlink, os_helper.TESTFN)
+        with open(os_helper.TESTFN, "wb") as f:
+            for _ in range(10):
+                d1.update(data)
+                f.write(data)
+
+        with open(os_helper.TESTFN, "rb") as f:
+            d2 = hashlib.file_digest(f, hashlib.sha256)
+
+        self.assertEqual(d1.hexdigest(), d2.hexdigest())
+        self.assertEqual(d1.name, d2.name)
+        self.assertIs(type(d1), type(d2))
+
+        with self.assertRaises(ValueError):
+            hashlib.file_digest(None, "sha256")
+
+        with self.assertRaises(ValueError):
+            with open(os_helper.TESTFN, "r") as f:
+                hashlib.file_digest(f, "sha256")
+
+        with self.assertRaises(ValueError):
+            with open(os_helper.TESTFN, "wb") as f:
+                hashlib.file_digest(f, "sha256")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst
new file mode 100644
index 0000000000000..1c6ea5a8e43bc
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst
@@ -0,0 +1 @@
+Add :func:`hashlib.file_digest` helper for efficient hashing of file object.



More information about the Python-checkins mailing list