[Python-checkins] cpython (3.5): Issue #22088: Clarify base-64 alphabets and which characters are discarded

Tue Feb 23 20:19:12 EST 2016

https://hg.python.org/cpython/rev/c8933fbc9171
changeset:   100307:c8933fbc9171
branch:      3.5
parent:      100304:419d20551d26
user:        Martin Panter <vadmium+py at gmail.com>
date:        Tue Feb 23 22:30:50 2016 +0000
summary:
  Issue #22088: Clarify base-64 alphabets and which characters are discarded

* There are only two base-64 alphabets defined by the RFCs, not three
* Due to the internal translation, plus (+) and slash (/) are never discarded
* standard_b64decode() and urlsafe_b64decode() discard non-alphabet characters as well

Also update the doc strings to clarify data types (based on revision
92760d2edc9e), correct the documented exception raised by b16decode(), and
correct the documented parameter name for the base-85 functions.

files:
  Doc/library/base64.rst  |   29 +++--
  Lib/base64.py           |  122 +++++++++++++--------------
  Lib/test/test_base64.py |   20 ++++-
  3 files changed, 92 insertions(+), 79 deletions(-)

diff --git a/Doc/library/base64.rst b/Doc/library/base64.rst
--- a/Doc/library/base64.rst
+++ b/Doc/library/base64.rst
@@ -24,8 +24,8 @@
 There are two interfaces provided by this module.  The modern interface
 supports encoding :term:`bytes-like objects <bytes-like object>` to ASCII
 :class:`bytes`, and decoding :term:`bytes-like objects <bytes-like object>` or
-strings containing ASCII to :class:`bytes`.  All three :rfc:`3548` defined
-alphabets (normal, URL-safe, and filesystem-safe) are supported.
+strings containing ASCII to :class:`bytes`.  Both base-64 alphabets
+defined in :rfc:`3548` (normal, and URL- and filesystem-safe) are supported.
 
 The legacy interface does not support decoding from strings, but it does
 provide functions for encoding and decoding to and from :term:`file objects
@@ -69,9 +69,10 @@
    A :exc:`binascii.Error` exception is raised
    if *s* is incorrectly padded.
 
-   If *validate* is ``False`` (the default), non-base64-alphabet characters are
+   If *validate* is ``False`` (the default), characters that are neither
+   in the normal base-64 alphabet nor the alternative alphabet are
    discarded prior to the padding check.  If *validate* is ``True``,
-   non-base64-alphabet characters in the input result in a
+   these non-alphabet characters in the input result in a
    :exc:`binascii.Error`.
 
 
@@ -89,7 +90,8 @@
 
 .. function:: urlsafe_b64encode(s)
 
-   Encode :term:`bytes-like object` *s* using a URL-safe alphabet, which
+   Encode :term:`bytes-like object` *s* using the
+   URL- and filesystem-safe alphabet, which
    substitutes ``-`` instead of ``+`` and ``_`` instead of ``/`` in the
    standard Base64 alphabet, and return the encoded :class:`bytes`.  The result
    can still contain ``=``.
@@ -97,7 +99,8 @@
 
 .. function:: urlsafe_b64decode(s)
 
-   Decode :term:`bytes-like object` or ASCII string *s* using a URL-safe
+   Decode :term:`bytes-like object` or ASCII string *s*
+   using the URL- and filesystem-safe
    alphabet, which substitutes ``-`` instead of ``+`` and ``_`` instead of
    ``/`` in the standard Base64 alphabet, and return the decoded
    :class:`bytes`.
@@ -145,14 +148,14 @@
    lowercase alphabet is acceptable as input.  For security purposes, the default
    is ``False``.
 
-   A :exc:`TypeError` is raised if *s* is
+   A :exc:`binascii.Error` is raised if *s* is
    incorrectly padded or if there are non-alphabet characters present in the
    input.
 
 
-.. function:: a85encode(s, *, foldspaces=False, wrapcol=0, pad=False, adobe=False)
+.. function:: a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False)
 
-   Encode the :term:`bytes-like object` *s* using Ascii85 and return the
+   Encode the :term:`bytes-like object` *b* using Ascii85 and return the
    encoded :class:`bytes`.
 
    *foldspaces* is an optional flag that uses the special short sequence 'y'
@@ -172,9 +175,9 @@
    .. versionadded:: 3.4
 
 
-.. function:: a85decode(s, *, foldspaces=False, adobe=False, ignorechars=b' \\t\\n\\r\\v')
+.. function:: a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \\t\\n\\r\\v')
 
-   Decode the Ascii85 encoded :term:`bytes-like object` or ASCII string *s* and
+   Decode the Ascii85 encoded :term:`bytes-like object` or ASCII string *b* and
    return the decoded :class:`bytes`.
 
    *foldspaces* is a flag that specifies whether the 'y' short sequence
@@ -192,9 +195,9 @@
    .. versionadded:: 3.4
 
 
-.. function:: b85encode(s, pad=False)
+.. function:: b85encode(b, pad=False)
 
-   Encode the :term:`bytes-like object` *s* using base85 (as used in e.g.
+   Encode the :term:`bytes-like object` *b* using base85 (as used in e.g.
    git-style binary diffs) and return the encoded :class:`bytes`.
 
    If *pad* is true, the input is padded with ``b'\0'`` so its length is a
diff --git a/Lib/base64.py b/Lib/base64.py
--- a/Lib/base64.py
+++ b/Lib/base64.py
@@ -12,7 +12,7 @@
 
 
 __all__ = [
-    # Legacy interface exports traditional RFC 1521 Base64 encodings
+    # Legacy interface exports traditional RFC 2045 Base64 encodings
     'encode', 'decode', 'encodebytes', 'decodebytes',
     # Generalized interface for other encodings
     'b64encode', 'b64decode', 'b32encode', 'b32decode',
@@ -49,14 +49,11 @@
 # Base64 encoding/decoding uses binascii
 
 def b64encode(s, altchars=None):
-    """Encode a byte string using Base64.
+    """Encode the bytes-like object s using Base64 and return a bytes object.
 
-    s is the byte string to encode.  Optional altchars must be a byte
-    string of length 2 which specifies an alternative alphabet for the
-    '+' and '/' characters.  This allows an application to
-    e.g. generate url or filesystem safe Base64 strings.
-
-    The encoded byte string is returned.
+    Optional altchars should be a byte string of length 2 which specifies an
+    alternative alphabet for the '+' and '/' characters.  This allows an
+    application to e.g. generate url or filesystem safe Base64 strings.
     """
     # Strip off the trailing newline
     encoded = binascii.b2a_base64(s)[:-1]
@@ -67,18 +64,19 @@
 
 
 def b64decode(s, altchars=None, validate=False):
-    """Decode a Base64 encoded byte string.
+    """Decode the Base64 encoded bytes-like object or ASCII string s.
 
-    s is the byte string to decode.  Optional altchars must be a
-    string of length 2 which specifies the alternative alphabet used
-    instead of the '+' and '/' characters.
+    Optional altchars must be a bytes-like object or ASCII string of length 2
+    which specifies the alternative alphabet used instead of the '+' and '/'
+    characters.
 
-    The decoded string is returned.  A binascii.Error is raised if s is
-    incorrectly padded.
+    The result is returned as a bytes object.  A binascii.Error is raised if
+    s is incorrectly padded.
 
-    If validate is False (the default), non-base64-alphabet characters are
-    discarded prior to the padding check.  If validate is True,
-    non-base64-alphabet characters in the input result in a binascii.Error.
+    If validate is False (the default), characters that are neither in the
+    normal base-64 alphabet nor the alternative alphabet are discarded prior
+    to the padding check.  If validate is True, these non-alphabet characters
+    in the input result in a binascii.Error.
     """
     s = _bytes_from_decode_data(s)
     if altchars is not None:
@@ -91,19 +89,19 @@
 
 
 def standard_b64encode(s):
-    """Encode a byte string using the standard Base64 alphabet.
+    """Encode bytes-like object s using the standard Base64 alphabet.
 
-    s is the byte string to encode.  The encoded byte string is returned.
+    The result is returned as a bytes object.
     """
     return b64encode(s)
 
 def standard_b64decode(s):
-    """Decode a byte string encoded with the standard Base64 alphabet.
+    """Decode bytes encoded with the standard Base64 alphabet.
 
-    s is the byte string to decode.  The decoded byte string is
-    returned.  binascii.Error is raised if the input is incorrectly
-    padded or if there are non-alphabet characters present in the
-    input.
+    Argument s is a bytes-like object or ASCII string to decode.  The result
+    is returned as a bytes object.  A binascii.Error is raised if the input
+    is incorrectly padded.  Characters that are not in the standard alphabet
+    are discarded prior to the padding check.
     """
     return b64decode(s)
 
@@ -112,21 +110,22 @@
 _urlsafe_decode_translation = bytes.maketrans(b'-_', b'+/')
 
 def urlsafe_b64encode(s):
-    """Encode a byte string using a url-safe Base64 alphabet.
+    """Encode bytes using the URL- and filesystem-safe Base64 alphabet.
 
-    s is the byte string to encode.  The encoded byte string is
-    returned.  The alphabet uses '-' instead of '+' and '_' instead of
+    Argument s is a bytes-like object to encode.  The result is returned as a
+    bytes object.  The alphabet uses '-' instead of '+' and '_' instead of
     '/'.
     """
     return b64encode(s).translate(_urlsafe_encode_translation)
 
 def urlsafe_b64decode(s):
-    """Decode a byte string encoded with the standard Base64 alphabet.
+    """Decode bytes using the URL- and filesystem-safe Base64 alphabet.
 
-    s is the byte string to decode.  The decoded byte string is
-    returned.  binascii.Error is raised if the input is incorrectly
-    padded or if there are non-alphabet characters present in the
-    input.
+    Argument s is a bytes-like object or ASCII string to decode.  The result
+    is returned as a bytes object.  A binascii.Error is raised if the input
+    is incorrectly padded.  Characters that are not in the URL-safe base-64
+    alphabet, and are not a plus '+' or slash '/', are discarded prior to the
+    padding check.
 
     The alphabet uses '-' instead of '+' and '_' instead of '/'.
     """
@@ -142,9 +141,7 @@
 _b32rev = None
 
 def b32encode(s):
-    """Encode a byte string using Base32.
-
-    s is the byte string to encode.  The encoded byte string is returned.
+    """Encode the bytes-like object s using Base32 and return a bytes object.
     """
     global _b32tab2
     # Delay the initialization of the table to not waste memory
@@ -182,11 +179,10 @@
     return bytes(encoded)
 
 def b32decode(s, casefold=False, map01=None):
-    """Decode a Base32 encoded byte string.
+    """Decode the Base32 encoded bytes-like object or ASCII string s.
 
-    s is the byte string to decode.  Optional casefold is a flag
-    specifying whether a lowercase alphabet is acceptable as input.
-    For security purposes, the default is False.
+    Optional casefold is a flag specifying whether a lowercase alphabet is
+    acceptable as input.  For security purposes, the default is False.
 
     RFC 3548 allows for optional mapping of the digit 0 (zero) to the
     letter O (oh), and for optional mapping of the digit 1 (one) to
@@ -196,7 +192,7 @@
     the letter O).  For security purposes the default is None, so that
     0 and 1 are not allowed in the input.
 
-    The decoded byte string is returned.  binascii.Error is raised if
+    The result is returned as a bytes object.  A binascii.Error is raised if
     the input is incorrectly padded or if there are non-alphabet
     characters present in the input.
     """
@@ -257,23 +253,20 @@
 # lowercase.  The RFC also recommends against accepting input case
 # insensitively.
 def b16encode(s):
-    """Encode a byte string using Base16.
-
-    s is the byte string to encode.  The encoded byte string is returned.
+    """Encode the bytes-like object s using Base16 and return a bytes object.
     """
     return binascii.hexlify(s).upper()
 
 
 def b16decode(s, casefold=False):
-    """Decode a Base16 encoded byte string.
+    """Decode the Base16 encoded bytes-like object or ASCII string s.
 
-    s is the byte string to decode.  Optional casefold is a flag
-    specifying whether a lowercase alphabet is acceptable as input.
-    For security purposes, the default is False.
+    Optional casefold is a flag specifying whether a lowercase alphabet is
+    acceptable as input.  For security purposes, the default is False.
 
-    The decoded byte string is returned.  binascii.Error is raised if
-    s were incorrectly padded or if there are non-alphabet characters
-    present in the string.
+    The result is returned as a bytes object.  A binascii.Error is raised if
+    s is incorrectly padded or if there are non-alphabet characters present
+    in the input.
     """
     s = _bytes_from_decode_data(s)
     if casefold:
@@ -316,19 +309,17 @@
     return b''.join(chunks)
 
 def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
-    """Encode a byte string using Ascii85.
-
-    b is the byte string to encode. The encoded byte string is returned.
+    """Encode bytes-like object b using Ascii85 and return a bytes object.
 
     foldspaces is an optional flag that uses the special short sequence 'y'
     instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
     feature is not supported by the "standard" Adobe encoding.
 
-    wrapcol controls whether the output should have newline ('\\n') characters
+    wrapcol controls whether the output should have newline (b'\\n') characters
     added to it. If this is non-zero, each output line will be at most this
     many characters long.
 
-    pad controls whether the input string is padded to a multiple of 4 before
+    pad controls whether the input is padded to a multiple of 4 before
     encoding. Note that the btoa implementation always pads.
 
     adobe controls whether the encoded byte sequence is framed with <~ and ~>,
@@ -359,9 +350,7 @@
     return result
 
 def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
-    """Decode an Ascii85 encoded byte string.
-
-    s is the byte string to decode.
+    """Decode the Ascii85 encoded bytes-like object or ASCII string b.
 
     foldspaces is a flag that specifies whether the 'y' short sequence should be
     accepted as shorthand for 4 consecutive spaces (ASCII 0x20). This feature is
@@ -373,6 +362,8 @@
     ignorechars should be a byte string containing characters to ignore from the
     input. This should only contain whitespace characters, and by default
     contains all whitespace characters in ASCII.
+
+    The result is returned as a bytes object.
     """
     b = _bytes_from_decode_data(b)
     if adobe:
@@ -432,10 +423,10 @@
 _b85dec = None
 
 def b85encode(b, pad=False):
-    """Encode an ASCII-encoded byte array in base85 format.
+    """Encode bytes-like object b in base85 format and return a bytes object.
 
-    If pad is true, the input is padded with "\\0" so its length is a multiple of
-    4 characters before encoding.
+    If pad is true, the input is padded with b'\\0' so its length is a multiple of
+    4 bytes before encoding.
     """
     global _b85chars, _b85chars2
     # Delay the initialization of tables to not waste memory
@@ -446,7 +437,10 @@
     return _85encode(b, _b85chars, _b85chars2, pad)
 
 def b85decode(b):
-    """Decode base85-encoded byte array"""
+    """Decode the base85-encoded bytes-like object or ASCII string b.
+
+    The result is returned as a bytes object.
+    """
     global _b85dec
     # Delay the initialization of tables to not waste memory
     # if the function is never called
@@ -531,7 +525,7 @@
 
 
 def encodebytes(s):
-    """Encode a bytestring into a bytestring containing multiple lines
+    """Encode a bytestring into a bytes object containing multiple lines
     of base-64 data."""
     _input_type_check(s)
     pieces = []
@@ -549,7 +543,7 @@
 
 
 def decodebytes(s):
-    """Decode a bytestring of base-64 data into a bytestring."""
+    """Decode a bytestring of base-64 data into a bytes object."""
     _input_type_check(s)
     return binascii.a2b_base64(s)
 
diff --git a/Lib/test/test_base64.py b/Lib/test/test_base64.py
--- a/Lib/test/test_base64.py
+++ b/Lib/test/test_base64.py
@@ -243,14 +243,26 @@
                  (b'@@', b''),
                  (b'!', b''),
                  (b'YWJj\nYWI=', b'abcab'))
+        funcs = (
+            base64.b64decode,
+            base64.standard_b64decode,
+            base64.urlsafe_b64decode,
+        )
         for bstr, res in tests:
-            self.assertEqual(base64.b64decode(bstr), res)
-            self.assertEqual(base64.b64decode(bstr.decode('ascii')), res)
+            for func in funcs:
+                with self.subTest(bstr=bstr, func=func):
+                    self.assertEqual(func(bstr), res)
+                    self.assertEqual(func(bstr.decode('ascii')), res)
             with self.assertRaises(binascii.Error):
                 base64.b64decode(bstr, validate=True)
             with self.assertRaises(binascii.Error):
                 base64.b64decode(bstr.decode('ascii'), validate=True)
 
+        # Normal alphabet characters not discarded when alternative given
+        res = b'\xFB\xEF\xBE\xFF\xFF\xFF'
+        self.assertEqual(base64.b64decode(b'++[[//]]', b'[]'), res)
+        self.assertEqual(base64.urlsafe_b64decode(b'++--//__'), res)
+
     def test_b32encode(self):
         eq = self.assertEqual
         eq(base64.b32encode(b''), b'')
@@ -360,6 +372,10 @@
            b'\x01\x02\xab\xcd\xef')
         eq(base64.b16decode(array('B', b"0102abcdef"), True),
            b'\x01\x02\xab\xcd\xef')
+        # Non-alphabet characters
+        self.assertRaises(binascii.Error, base64.b16decode, '0102AG')
+        # Incorrect "padding"
+        self.assertRaises(binascii.Error, base64.b16decode, '010')
 
     def test_a85encode(self):
         eq = self.assertEqual

-- 
Repository URL: https://hg.python.org/cpython