diff --git a/Doc/library/base64.rst b/Doc/library/base64.rst index 4876117f6403b2..41f0332a3dd39d 100644 --- a/Doc/library/base64.rst +++ b/Doc/library/base64.rst @@ -73,6 +73,7 @@ POST request. .. function:: b64decode(s, altchars=None, validate=False) + b64decode(s, altchars=None, validate=True, *, ignorechars) Decode the Base64 encoded :term:`bytes-like object` or ASCII string *s* and return the decoded :class:`bytes`. @@ -84,16 +85,24 @@ POST request. A :exc:`binascii.Error` exception is raised if *s* is incorrectly padded. - If *validate* is ``False`` (the default), characters that are neither + If *ignorechars* is specified, it should be a byte string containing + characters to ignore from the input, and *validate* is ``True`` by default. + Otherwise *validate* is ``False`` by default. + + If *validate* is false, characters that are neither in the normal base-64 alphabet nor the alternative alphabet are - discarded prior to the padding check. If *validate* is ``True``, - these non-alphabet characters in the input result in a - :exc:`binascii.Error`. + discarded prior to the padding check. + If *validate* is true, these non-alphabet characters in the input + result in a :exc:`binascii.Error` if they are not in *ignorechars*. For more information about the strict base64 check, see :func:`binascii.a2b_base64` May assert or raise a :exc:`ValueError` if the length of *altchars* is not 2. + .. versionchanged:: next + Added the *ignorechars* parameter. + + .. function:: standard_b64encode(s) Encode :term:`bytes-like object` *s* using the standard Base64 alphabet diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index eaf755711bc292..a53bcb45717708 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -49,10 +49,16 @@ The :mod:`binascii` module defines the following functions: .. function:: a2b_base64(string, /, *, strict_mode=False) + a2b_base64(string, /, *, strict_mode=True, ignorechars) Convert a block of base64 data back to binary and return the binary data. 
More than one line may be passed at a time. + If *ignorechars* is specified, it should be a byte string containing + characters to ignore from the input when *strict_mode* is true. + *strict_mode* is ``True`` by default, if *ignorechars* is specified, + ``False`` otherwise. + If *strict_mode* is true, only valid base64 data will be converted. Invalid base64 data will raise :exc:`binascii.Error`. @@ -66,6 +72,9 @@ The :mod:`binascii` module defines the following functions: .. versionchanged:: 3.11 Added the *strict_mode* parameter. + .. versionchanged:: next + Added the *ignorechars* parameter. + .. function:: b2a_base64(data, *, wrapcol=0, newline=True) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index b7a27d5db63875..0cd686e1af6b3a 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -444,6 +444,8 @@ base64 * Added the *wrapcol* parameter in :func:`~base64.b64encode`. (Contributed by Serhiy Storchaka in :gh:`143214`.) +* Added the *ignorechars* parameter in :func:`~base64.b64decode`. + (Contributed by Serhiy Storchaka in :gh:`144001`.) binascii -------- @@ -451,6 +453,9 @@ binascii * Added the *wrapcol* parameter in :func:`~binascii.b2a_base64`. (Contributed by Serhiy Storchaka in :gh:`143214`.) +* Added the *ignorechars* parameter in :func:`~binascii.a2b_base64`. + (Contributed by Serhiy Storchaka in :gh:`144001`.) + calendar -------- diff --git a/Lib/base64.py b/Lib/base64.py index e62ae6aff580fa..abf202357308f7 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -26,6 +26,8 @@ ] +_NOT_SPECIFIED = ['NOT SPECIFIED'] + bytes_types = (bytes, bytearray) # Types acceptable as binary data def _bytes_from_decode_data(s): @@ -62,7 +64,7 @@ def b64encode(s, altchars=None, *, wrapcol=0): return encoded -def b64decode(s, altchars=None, validate=False): +def b64decode(s, altchars=None, validate=_NOT_SPECIFIED, *, ignorechars=_NOT_SPECIFIED): """Decode the Base64 encoded bytes-like object or ASCII string s. 
Optional altchars must be a bytes-like object or ASCII string of length 2 @@ -72,10 +74,14 @@ def b64decode(s, altchars=None, validate=False): The result is returned as a bytes object. A binascii.Error is raised if s is incorrectly padded. - If validate is False (the default), characters that are neither in the - normal base-64 alphabet nor the alternative alphabet are discarded prior - to the padding check. If validate is True, these non-alphabet characters - in the input result in a binascii.Error. + If ignorechars is specified, it should be a byte string containing + characters to ignore from the input, and validate is True by default. + Otherwise validate is False by default. + + If validate is false, characters that are neither in the normal base-64 + alphabet nor the alternative alphabet are discarded prior to the + padding check. If validate is true, these non-alphabet characters in + the input result in a binascii.Error if they are not in ignorechars. For more information about the strict base64 check, see: https://docs.python.org/3.11/library/binascii.html#binascii.a2b_base64 @@ -85,7 +91,12 @@ def b64decode(s, altchars=None, validate=False): altchars = _bytes_from_decode_data(altchars) assert len(altchars) == 2, repr(altchars) s = s.translate(bytes.maketrans(altchars, b'+/')) - return binascii.a2b_base64(s, strict_mode=validate) + if validate is _NOT_SPECIFIED: + validate = ignorechars is not _NOT_SPECIFIED + if ignorechars is _NOT_SPECIFIED: + ignorechars = b'' + return binascii.a2b_base64(s, strict_mode=validate, + ignorechars=ignorechars) def standard_b64encode(s): diff --git a/Lib/test/test_base64.py b/Lib/test/test_base64.py index 120c5824a42a40..d1561c514a6a5f 100644 --- a/Lib/test/test_base64.py +++ b/Lib/test/test_base64.py @@ -298,22 +298,22 @@ def test_b64decode_padding_error(self): def test_b64decode_invalid_chars(self): # issue 1466065: Test some invalid characters. 
- tests = ((b'%3d==', b'\xdd'), - (b'$3d==', b'\xdd'), - (b'[==', b''), - (b'YW]3=', b'am'), - (b'3{d==', b'\xdd'), - (b'3d}==', b'\xdd'), - (b'@@', b''), - (b'!', b''), - (b"YWJj\n", b"abc"), - (b'YWJj\nYWI=', b'abcab')) + tests = ((b'%3d==', b'\xdd', b'%$'), + (b'$3d==', b'\xdd', b'%$'), + (b'[==', b'', None), + (b'YW]3=', b'am', b']'), + (b'3{d==', b'\xdd', b'{}'), + (b'3d}==', b'\xdd', b'{}'), + (b'@@', b'', b'@!'), + (b'!', b'', b'@!'), + (b"YWJj\n", b"abc", b'\n'), + (b'YWJj\nYWI=', b'abcab', b'\n')) funcs = ( base64.b64decode, base64.standard_b64decode, base64.urlsafe_b64decode, ) - for bstr, res in tests: + for bstr, res, ignorechars in tests: for func in funcs: with self.subTest(bstr=bstr, func=func): self.assertEqual(func(bstr), res) @@ -322,6 +322,21 @@ def test_b64decode_invalid_chars(self): base64.b64decode(bstr, validate=True) with self.assertRaises(binascii.Error): base64.b64decode(bstr.decode('ascii'), validate=True) + with self.assertRaises(binascii.Error): + base64.b64decode(bstr, ignorechars=b'') + if ignorechars is not None: + self.assertEqual( + base64.b64decode(bstr, ignorechars=ignorechars), + res) + + with self.assertRaises(TypeError): + base64.b64decode(b'', ignorechars=bytearray()) + with self.assertRaises(TypeError): + base64.b64decode(b'', ignorechars='') + with self.assertRaises(TypeError): + base64.b64decode(b'', ignorechars=[]) + with self.assertRaises(TypeError): + base64.b64decode(b'', ignorechars=None) # Normal alphabet characters not discarded when alternative given res = b'\xfb\xef\xff' diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 47e1e6ab035a17..3e9e692b75d2e8 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -145,16 +145,16 @@ def assertExcessPadding(data, non_strict_mode_expected_result: bytes): # Test excess data exceptions assertExcessData(b'ab==a', b'i') - assertExcessData(b'ab===', b'i') - assertExcessData(b'ab====', b'i') - assertExcessData(b'ab==:', b'i') + 
assertExcessPadding(b'ab===', b'i') + assertExcessPadding(b'ab====', b'i') + assertNonBase64Data(b'ab==:', b'i') assertExcessData(b'abc=a', b'i\xb7') - assertExcessData(b'abc=:', b'i\xb7') - assertExcessData(b'ab==\n', b'i') - assertExcessData(b'abc==', b'i\xb7') - assertExcessData(b'abc===', b'i\xb7') - assertExcessData(b'abc====', b'i\xb7') - assertExcessData(b'abc=====', b'i\xb7') + assertNonBase64Data(b'abc=:', b'i\xb7') + assertNonBase64Data(b'ab==\n', b'i') + assertExcessPadding(b'abc==', b'i\xb7') + assertExcessPadding(b'abc===', b'i\xb7') + assertExcessPadding(b'abc====', b'i\xb7') + assertExcessPadding(b'abc=====', b'i\xb7') # Test non-base64 data exceptions assertNonBase64Data(b'\nab==', b'i') @@ -170,12 +170,45 @@ def assertExcessPadding(data, non_strict_mode_expected_result: bytes): assertLeadingPadding(b'=====', b'') assertDiscontinuousPadding(b'ab=c=', b'i\xb7') assertDiscontinuousPadding(b'ab=ab==', b'i\xb6\x9b') + assertNonBase64Data(b'ab=:=', b'i') assertExcessPadding(b'abcd=', b'i\xb7\x1d') assertExcessPadding(b'abcd==', b'i\xb7\x1d') assertExcessPadding(b'abcd===', b'i\xb7\x1d') assertExcessPadding(b'abcd====', b'i\xb7\x1d') assertExcessPadding(b'abcd=====', b'i\xb7\x1d') + def test_base64_invalidchars(self): + def assertNonBase64Data(data, expected, ignorechars): + data = self.type2test(data) + assert_regex = r'(?i)Only base64 data' + self.assertEqual(binascii.a2b_base64(data), expected) + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base64(data, strict_mode=True) + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base64(data, ignorechars=b'') + self.assertEqual(binascii.a2b_base64(data, ignorechars=ignorechars), + expected) + self.assertEqual(binascii.a2b_base64(data, strict_mode=False, ignorechars=b''), + expected) + + assertNonBase64Data(b'\nab==', b'i', ignorechars=b'\n') + assertNonBase64Data(b'ab:(){:|:&};:==', b'i', ignorechars=b':;(){}|&') + assertNonBase64Data(b'a\nb==', b'i', 
ignorechars=b'\n') + assertNonBase64Data(b'a\x00b==', b'i', ignorechars=b'\x00') + assertNonBase64Data(b'ab==:', b'i', ignorechars=b':') + assertNonBase64Data(b'abc=:', b'i\xb7', ignorechars=b':') + assertNonBase64Data(b'ab==\n', b'i', ignorechars=b'\n') + assertNonBase64Data(b'ab=:=', b'i', ignorechars=b':') + + data = self.type2test(b'a\nb==') + with self.assertRaises(TypeError): + binascii.a2b_base64(data, ignorechars=bytearray()) + with self.assertRaises(TypeError): + binascii.a2b_base64(data, ignorechars='') + with self.assertRaises(TypeError): + binascii.a2b_base64(data, ignorechars=[]) + with self.assertRaises(TypeError): + binascii.a2b_base64(data, ignorechars=None) def test_base64errors(self): # Test base64 with invalid padding diff --git a/Misc/NEWS.d/next/Library/2026-01-19-10-26-59.gh-issue-144001.dGj8Nk.rst b/Misc/NEWS.d/next/Library/2026-01-19-10-26-59.gh-issue-144001.dGj8Nk.rst new file mode 100644 index 00000000000000..02d453f4d2ceee --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-01-19-10-26-59.gh-issue-144001.dGj8Nk.rst @@ -0,0 +1,2 @@ +Added the *ignorechars* parameter in :func:`binascii.a2b_base64` and +:func:`base64.b64decode`. diff --git a/Modules/binascii.c b/Modules/binascii.c index c569d3187f2e67..d3774cc6eea534 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -469,32 +469,46 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick) return PyBytesWriter_FinishWithPointer(writer, ascii_data); } + +static int +ignorechar(unsigned char c, PyBytesObject *ignorechars) +{ + return (ignorechars != NULL && + memchr(PyBytes_AS_STRING(ignorechars), c, + PyBytes_GET_SIZE(ignorechars))); +} + /*[clinic input] -@permit_long_docstring_body binascii.a2b_base64 data: ascii_buffer / * - strict_mode: bool = False + strict_mode: bool(c_default="-1", py_default="") = False + When set to true, bytes that are not part of the base64 standard are + not allowed. The same applies to excess data after padding (= / ==). 
+ Set to True by default if ignorechars is specified, False otherwise. + ignorechars: PyBytesObject = NULL + A byte string containing characters to ignore from the input when + strict_mode is true. Decode a line of base64 data. - - strict_mode - When set to True, bytes that are not part of the base64 standard are not allowed. - The same applies to excess data after padding (= / ==). [clinic start generated code]*/ static PyObject * -binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) -/*[clinic end generated code: output=5409557788d4f975 input=13c797187acc9c40]*/ +binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode, + PyBytesObject *ignorechars) +/*[clinic end generated code: output=b1868e0d886cd8cf input=e2d2e48c986e2afb]*/ { assert(data->len >= 0); const unsigned char *ascii_data = data->buf; size_t ascii_len = data->len; binascii_state *state = NULL; - char padding_started = 0; + + if (strict_mode == -1) { + strict_mode = (ignorechars != NULL); + } /* Allocate the buffer */ Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */ @@ -504,14 +518,6 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) } unsigned char *bin_data = PyBytesWriter_GetData(writer); - if (strict_mode && ascii_len > 0 && ascii_data[0] == '=') { - state = get_binascii_state(module); - if (state) { - PyErr_SetString(state->Error, "Leading padding not allowed"); - } - goto error_end; - } - size_t i = 0; /* Current position in input */ /* Fast path: use optimized decoder for complete quads. @@ -538,36 +544,44 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) ** the invalid ones. 
*/ if (this_ch == BASE64_PAD) { - padding_started = 1; + pads++; - if (strict_mode && quad_pos == 0) { - state = get_binascii_state(module); - if (state) { - PyErr_SetString(state->Error, "Excess padding not allowed"); + if (strict_mode) { + if (quad_pos == 0) { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, (i == 0) + ? "Leading padding not allowed" + : "Excess padding not allowed"); + } + goto error_end; } - goto error_end; - } - if (quad_pos >= 2 && quad_pos + ++pads >= 4) { - /* A pad sequence means we should not parse more input. - ** We've already interpreted the data from the quad at this point. - ** in strict mode, an error should raise if there's excess data after the padding. - */ - if (strict_mode && i + 1 < ascii_len) { + if (quad_pos == 1) { + /* Set an error below. */ + break; + } + if (quad_pos + pads > 4) { state = get_binascii_state(module); if (state) { - PyErr_SetString(state->Error, "Excess data after padding"); + PyErr_SetString(state->Error, "Excess padding not allowed"); } goto error_end; } - - goto done; + } + else { + if (quad_pos >= 2 && quad_pos + pads >= 4) { + /* A pad sequence means we should not parse more input. + ** We've already interpreted the data from the quad at this point. 
+ */ + goto done; + } + } continue; } - this_ch = table_a2b_base64[this_ch]; - if (this_ch >= 64) { - if (strict_mode) { + unsigned char v = table_a2b_base64[this_ch]; + if (v >= 64) { + if (strict_mode && !ignorechar(this_ch, ignorechars)) { state = get_binascii_state(module); if (state) { PyErr_SetString(state->Error, "Only base64 data is allowed"); @@ -578,10 +592,12 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) } // Characters that are not '=', in the middle of the padding, are not allowed - if (strict_mode && padding_started) { + if (strict_mode && pads) { state = get_binascii_state(module); if (state) { - PyErr_SetString(state->Error, "Discontinuous padding not allowed"); + PyErr_SetString(state->Error, (quad_pos + pads == 4) + ? "Excess data after padding" + : "Discontinuous padding not allowed"); } goto error_end; } @@ -590,44 +606,46 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) switch (quad_pos) { case 0: quad_pos = 1; - leftchar = this_ch; + leftchar = v; break; case 1: quad_pos = 2; - *bin_data++ = (leftchar << 2) | (this_ch >> 4); - leftchar = this_ch & 0x0f; + *bin_data++ = (leftchar << 2) | (v >> 4); + leftchar = v & 0x0f; break; case 2: quad_pos = 3; - *bin_data++ = (leftchar << 4) | (this_ch >> 2); - leftchar = this_ch & 0x03; + *bin_data++ = (leftchar << 4) | (v >> 2); + leftchar = v & 0x03; break; case 3: quad_pos = 0; - *bin_data++ = (leftchar << 6) | (this_ch); + *bin_data++ = (leftchar << 6) | (v); leftchar = 0; break; } } - if (quad_pos != 0) { + if (quad_pos == 1) { + /* There is exactly one extra valid, non-padding, base64 character. + ** This is an invalid length, as there is no possible input that + ** could be encoded into such a base64 string. 
+ */ state = get_binascii_state(module); - if (state == NULL) { - /* error already set, from get_binascii_state */ - assert(PyErr_Occurred()); - } else if (quad_pos == 1) { - /* - ** There is exactly one extra valid, non-padding, base64 character. - ** This is an invalid length, as there is no possible input that - ** could encoded into such a base64 string. - */ + if (state) { unsigned char *bin_data_start = PyBytesWriter_GetData(writer); PyErr_Format(state->Error, "Invalid base64-encoded string: " "number of data characters (%zd) cannot be 1 more " "than a multiple of 4", (bin_data - bin_data_start) / 3 * 4 + 1); - } else { + } + goto error_end; + } + + if (quad_pos != 0 && quad_pos + pads != 4) { + state = get_binascii_state(module); + if (state) { PyErr_SetString(state->Error, "Incorrect padding"); } goto error_end; diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 524f5fc93d0c21..20bfac116e505f 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -116,20 +116,26 @@ binascii_b2a_uu(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj } PyDoc_STRVAR(binascii_a2b_base64__doc__, -"a2b_base64($module, data, /, *, strict_mode=False)\n" +"a2b_base64($module, data, /, *, strict_mode=,\n" +" ignorechars=)\n" "--\n" "\n" "Decode a line of base64 data.\n" "\n" " strict_mode\n" -" When set to True, bytes that are not part of the base64 standard are not allowed.\n" -" The same applies to excess data after padding (= / ==)."); +" When set to true, bytes that are not part of the base64 standard are\n" +" not allowed. 
The same applies to excess data after padding (= / ==).\n" +" Set to True by default if ignorechars is specified, False otherwise.\n" +" ignorechars\n" +" A byte string containing characters to ignore from the input when\n" +" strict_mode is true."); #define BINASCII_A2B_BASE64_METHODDEF \ {"a2b_base64", _PyCFunction_CAST(binascii_a2b_base64), METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base64__doc__}, static PyObject * -binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode); +binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode, + PyBytesObject *ignorechars); static PyObject * binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) @@ -137,7 +143,7 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 1 + #define NUM_KEYWORDS 2 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD @@ -146,7 +152,7 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) .ob_hash = -1, - .ob_item = { &_Py_ID(strict_mode), }, + .ob_item = { &_Py_ID(strict_mode), &_Py_ID(ignorechars), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -155,17 +161,18 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"", "strict_mode", NULL}; + static const char * const _keywords[] = {"", "strict_mode", "ignorechars", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "a2b_base64", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 1; Py_buffer data = {NULL, NULL}; - int strict_mode = 0; + int strict_mode = -1; + PyBytesObject *ignorechars = NULL; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); @@ -178,12 +185,22 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P if (!noptargs) { goto skip_optional_kwonly; } - strict_mode = PyObject_IsTrue(args[1]); - if (strict_mode < 0) { + if (args[1]) { + strict_mode = PyObject_IsTrue(args[1]); + if (strict_mode < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + if (!PyBytes_Check(args[2])) { + _PyArg_BadArgument("a2b_base64", "argument 'ignorechars'", "bytes", args[2]); goto exit; } + ignorechars = (PyBytesObject *)args[2]; skip_optional_kwonly: - return_value = binascii_a2b_base64_impl(module, &data, strict_mode); + return_value = binascii_a2b_base64_impl(module, &data, strict_mode, ignorechars); exit: /* Cleanup for data */ @@ -823,4 +840,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=644ccdc8e0d56e65 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=8b16c7f676dfbc40 input=a9049054013a1b77]*/