From e61a5f365ba610d5907a0ae1bc72769bba34294b Mon Sep 17 00:00:00 2001 From: Andreas Eriksen Date: Sat, 28 Feb 2026 22:21:06 +0100 Subject: [PATCH] Set default read_size to 1 for backwards compatibility (#275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The buffered reads introduced in 5.8.0 could cause issues when code needs to access the stream position after decoding. This changes the default back to 1 (matching 5.7.1 behavior) while allowing users to opt-in to faster decoding by passing read_size=4096. Implementation details: - Use function pointer dispatch to eliminate runtime checks for read_size=1 - Skip buffer allocation entirely for unbuffered path - Add read_size parameter to load() and loads() for API completeness CVE: CVE-2026-26209 Upstream-Status: Backport [https://github.com/agronholm/cbor2/commit/e61a5f365ba610d5907a0ae1bc72769bba34294b] Signed-off-by: Hitendra Prajapati --- cbor2/_decoder.py | 33 ++++++++++++++++-- docs/usage.rst | 11 ++++++ source/decoder.c | 78 +++++++++++++++++++++++++++++-------------- source/decoder.h | 16 +++++++-- tests/test_decoder.py | 15 +++++++++ 5 files changed, 123 insertions(+), 30 deletions(-) diff --git a/cbor2/_decoder.py b/cbor2/_decoder.py index 4aeadcf..5a1f65b 100644 --- a/cbor2/_decoder.py +++ b/cbor2/_decoder.py @@ -72,6 +72,7 @@ class CBORDecoder: tag_hook: Callable[[CBORDecoder, CBORTag], Any] | None = None, object_hook: Callable[[CBORDecoder, dict[Any, Any]], Any] | None = None, str_errors: Literal["strict", "error", "replace"] = "strict", + read_size: int = 1, ): """ :param fp: @@ -90,6 +91,13 @@ class CBORDecoder: :param str_errors: determines how to handle unicode decoding errors (see the `Error Handlers`_ section in the standard library documentation for details) + :param read_size: + the minimum number of bytes to read at a time. 
+ Setting this to a higher value like 4096 improves performance, + but is likely to read past the end of the CBOR value, advancing the stream + position beyond the decoded data. This only matters if you need to reuse the + stream after decoding. + Ignored in the pure Python implementation, but included for API compatibility. .. _Error Handlers: https://docs.python.org/3/library/codecs.html#error-handlers @@ -813,6 +821,7 @@ def loads( tag_hook: Callable[[CBORDecoder, CBORTag], Any] | None = None, object_hook: Callable[[CBORDecoder, dict[Any, Any]], Any] | None = None, str_errors: Literal["strict", "error", "replace"] = "strict", + read_size: int = 1, ) -> Any: """ Deserialize an object from a bytestring. @@ -831,6 +840,10 @@ def loads( :param str_errors: determines how to handle unicode decoding errors (see the `Error Handlers`_ section in the standard library documentation for details) + :param read_size: + the minimum number of bytes to read at a time. + Setting this to a higher value like 4096 improves performance. + Ignored in the pure Python implementation, but included for API compatibility. :return: the deserialized object @@ -839,7 +852,11 @@ def loads( """ with BytesIO(s) as fp: return CBORDecoder( - fp, tag_hook=tag_hook, object_hook=object_hook, str_errors=str_errors + fp, + tag_hook=tag_hook, + object_hook=object_hook, + str_errors=str_errors, + read_size=read_size, ).decode() @@ -848,6 +865,7 @@ def load( tag_hook: Callable[[CBORDecoder, CBORTag], Any] | None = None, object_hook: Callable[[CBORDecoder, dict[Any, Any]], Any] | None = None, str_errors: Literal["strict", "error", "replace"] = "strict", + read_size: int = 1, ) -> Any: """ Deserialize an object from an open file. @@ -866,6 +884,13 @@ def load( :param str_errors: determines how to handle unicode decoding errors (see the `Error Handlers`_ section in the standard library documentation for details) + :param read_size: + the minimum number of bytes to read at a time. 
+ Setting this to a higher value like 4096 improves performance, + but is likely to read past the end of the CBOR value, advancing the stream + position beyond the decoded data. This only matters if you need to reuse the + stream after decoding. + Ignored in the pure Python implementation, but included for API compatibility. :return: the deserialized object @@ -873,5 +898,9 @@ def load( """ return CBORDecoder( - fp, tag_hook=tag_hook, object_hook=object_hook, str_errors=str_errors + fp, + tag_hook=tag_hook, + object_hook=object_hook, + str_errors=str_errors, + read_size=read_size, ).decode() diff --git a/docs/usage.rst b/docs/usage.rst index 797db59..6f53174 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -74,6 +74,17 @@ instead encodes a reference to the nth sufficiently long string already encoded. .. warning:: Support for string referencing is rare in other CBOR implementations, so think carefully whether you want to enable it. +Performance tuning +------------------ + +By default, the decoder only reads the exact number of bytes it needs. But this can negatively +impact the performance due to the potentially large number of individual read operations. +To make it faster, you can pass a different ``read_size`` parameter (say, 4096) to :func:`load`, +:func:`loads` or :class:`CBORDecoder`. + +.. warning:: If the input stream contains data other than the CBOR stream, that data (or parts of it) + may be lost. 
+ Tag support ----------- diff --git a/source/decoder.c b/source/decoder.c index 9cd1596..f8adc93 100644 --- a/source/decoder.c +++ b/source/decoder.c @@ -47,6 +47,10 @@ static int _CBORDecoder_set_tag_hook(CBORDecoderObject *, PyObject *, void *); static int _CBORDecoder_set_object_hook(CBORDecoderObject *, PyObject *, void *); static int _CBORDecoder_set_str_errors(CBORDecoderObject *, PyObject *, void *); +// Forward declarations for read dispatch functions +static int fp_read_unbuffered(CBORDecoderObject *, char *, Py_ssize_t); +static int fp_read_buffered(CBORDecoderObject *, char *, Py_ssize_t); + static PyObject * decode(CBORDecoderObject *, DecodeOptions); static PyObject * decode_bytestring(CBORDecoderObject *, uint8_t); static PyObject * decode_string(CBORDecoderObject *, uint8_t); @@ -155,6 +159,7 @@ CBORDecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->readahead_size = 0; self->read_pos = 0; self->read_len = 0; + self->fp_read = fp_read_unbuffered; // default, will be set properly in init } return (PyObject *) self; error: @@ -164,7 +169,7 @@ error: // CBORDecoder.__init__(self, fp=None, tag_hook=None, object_hook=None, -// str_errors='strict', read_size=4096) +// str_errors='strict', read_size=1) int CBORDecoder_init(CBORDecoderObject *self, PyObject *args, PyObject *kwargs) { @@ -233,7 +238,8 @@ _CBORDecoder_set_fp_with_read_size(CBORDecoderObject *self, PyObject *value, Py_ return -1; } - if (self->readahead == NULL || self->readahead_size != read_size) { + // Skip buffer allocation for read_size=1 (direct read path doesn't use buffer) + if (read_size > 1 && (self->readahead == NULL || self->readahead_size != read_size)) { new_buffer = (char *)PyMem_Malloc(read_size); if (!new_buffer) { Py_DECREF(read); @@ -254,8 +260,15 @@ _CBORDecoder_set_fp_with_read_size(CBORDecoderObject *self, PyObject *value, Py_ if (new_buffer) { PyMem_Free(self->readahead); self->readahead = new_buffer; - self->readahead_size = read_size; + } else if 
(read_size == 1 && self->readahead != NULL) { + // Free existing buffer when switching to direct read path (read_size=1) + PyMem_Free(self->readahead); + self->readahead = NULL; } + self->readahead_size = read_size; + + // Set read dispatch function - eliminates runtime check on every read + self->fp_read = (read_size == 1) ? fp_read_unbuffered : fp_read_buffered; return 0; } @@ -447,9 +460,25 @@ fp_read_bytes(CBORDecoderObject *self, char *buf, Py_ssize_t size) return bytes_read; } -// Read into caller's buffer using the readahead buffer +// Unbuffered read - used when read_size=1 (backwards compatible mode) +// This matches the 5.7.1 behavior with no runtime overhead +static int +fp_read_unbuffered(CBORDecoderObject *self, char *buf, Py_ssize_t size) +{ + Py_ssize_t bytes_read = fp_read_bytes(self, buf, size); + if (bytes_read == size) + return 0; + if (bytes_read >= 0) + PyErr_Format( + _CBOR2_CBORDecodeEOF, + "premature end of stream (expected to read %zd bytes, " + "got %zd instead)", size, bytes_read); + return -1; +} + +// Buffered read - used when read_size > 1 for improved performance static int -fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size) +fp_read_buffered(CBORDecoderObject *self, char *buf, Py_ssize_t size) { Py_ssize_t available, to_copy, remaining, total_copied; @@ -507,7 +536,7 @@ fp_read_object(CBORDecoderObject *self, const Py_ssize_t size) if (!ret) return NULL; - if (fp_read(self, PyBytes_AS_STRING(ret), size) == -1) { + if (self->fp_read(self, PyBytes_AS_STRING(ret), size) == -1) { Py_DECREF(ret); return NULL; } @@ -528,7 +557,7 @@ CBORDecoder_read(CBORDecoderObject *self, PyObject *length) return NULL; ret = PyBytes_FromStringAndSize(NULL, len); if (ret) { - if (fp_read(self, PyBytes_AS_STRING(ret), len) == -1) { + if (self->fp_read(self, PyBytes_AS_STRING(ret), len) == -1) { Py_DECREF(ret); ret = NULL; } @@ -576,19 +605,19 @@ decode_length(CBORDecoderObject *self, uint8_t subtype, if (subtype < 24) { *length = subtype; } 
else if (subtype == 24) { - if (fp_read(self, value.u8.buf, sizeof(uint8_t)) == -1) + if (self->fp_read(self, value.u8.buf, sizeof(uint8_t)) == -1) return -1; *length = value.u8.value; } else if (subtype == 25) { - if (fp_read(self, value.u16.buf, sizeof(uint16_t)) == -1) + if (self->fp_read(self, value.u16.buf, sizeof(uint16_t)) == -1) return -1; *length = be16toh(value.u16.value); } else if (subtype == 26) { - if (fp_read(self, value.u32.buf, sizeof(uint32_t)) == -1) + if (self->fp_read(self, value.u32.buf, sizeof(uint32_t)) == -1) return -1; *length = be32toh(value.u32.value); } else { - if (fp_read(self, value.u64.buf, sizeof(uint64_t)) == -1) + if (self->fp_read(self, value.u64.buf, sizeof(uint64_t)) == -1) return -1; *length = be64toh(value.u64.value); } @@ -752,7 +781,7 @@ decode_indefinite_bytestrings(CBORDecoderObject *self) list = PyList_New(0); if (list) { while (1) { - if (fp_read(self, &lead.byte, 1) == -1) + if (self->fp_read(self, &lead.byte, 1) == -1) break; if (lead.major == 2 && lead.subtype != 31) { ret = decode_bytestring(self, lead.subtype); @@ -959,7 +988,7 @@ decode_indefinite_strings(CBORDecoderObject *self) list = PyList_New(0); if (list) { while (1) { - if (fp_read(self, &lead.byte, 1) == -1) + if (self->fp_read(self, &lead.byte, 1) == -1) break; if (lead.major == 3 && lead.subtype != 31) { ret = decode_string(self, lead.subtype); @@ -2040,7 +2069,7 @@ CBORDecoder_decode_simple_value(CBORDecoderObject *self) PyObject *tag, *ret = NULL; uint8_t buf; - if (fp_read(self, (char*)&buf, sizeof(uint8_t)) == 0) { + if (self->fp_read(self, (char*)&buf, sizeof(uint8_t)) == 0) { tag = PyStructSequence_New(&CBORSimpleValueType); if (tag) { PyStructSequence_SET_ITEM(tag, 0, PyLong_FromLong(buf)); @@ -2066,7 +2095,7 @@ CBORDecoder_decode_float16(CBORDecoderObject *self) char buf[sizeof(uint16_t)]; } u; - if (fp_read(self, u.buf, sizeof(uint16_t)) == 0) + if (self->fp_read(self, u.buf, sizeof(uint16_t)) == 0) ret = 
PyFloat_FromDouble(unpack_float16(u.i)); set_shareable(self, ret); return ret; @@ -2084,7 +2113,7 @@ CBORDecoder_decode_float32(CBORDecoderObject *self) char buf[sizeof(float)]; } u; - if (fp_read(self, u.buf, sizeof(float)) == 0) { + if (self->fp_read(self, u.buf, sizeof(float)) == 0) { u.i = be32toh(u.i); ret = PyFloat_FromDouble(u.f); } @@ -2104,7 +2133,7 @@ CBORDecoder_decode_float64(CBORDecoderObject *self) char buf[sizeof(double)]; } u; - if (fp_read(self, u.buf, sizeof(double)) == 0) { + if (self->fp_read(self, u.buf, sizeof(double)) == 0) { u.i = be64toh(u.i); ret = PyFloat_FromDouble(u.f); } @@ -2133,7 +2162,7 @@ decode(CBORDecoderObject *self, DecodeOptions options) if (Py_EnterRecursiveCall(" in CBORDecoder.decode")) return NULL; - if (fp_read(self, &lead.byte, 1) == 0) { + if (self->fp_read(self, &lead.byte, 1) == 0) { switch (lead.major) { case 0: ret = decode_uint(self, lead.subtype); break; case 1: ret = decode_negint(self, lead.subtype); break; @@ -2387,13 +2416,12 @@ PyDoc_STRVAR(CBORDecoder__doc__, " :class:`dict` object. The return value is substituted for the dict\n" " in the deserialized output.\n" ":param read_size:\n" -" the size of the read buffer (default 4096). The decoder reads from\n" -" the stream in chunks of this size for performance. This means the\n" -" stream position may advance beyond the bytes actually decoded. For\n" -" large values (bytestrings, text strings), reads may be larger than\n" -" ``read_size``. Code that needs to read from the stream after\n" -" decoding should use :meth:`decode_from_bytes` instead, or set\n" -" ``read_size=1`` to disable buffering (at a performance cost).\n" +" the minimum number of bytes to read at a time.\n" +" Setting this to a higher value like 4096 improves performance,\n" +" but is likely to read past the end of the CBOR value, advancing the stream\n" +" position beyond the decoded data. 
This only matters if you need to reuse the\n" +" stream after decoding.\n" +" Ignored in the pure Python implementation, but included for API compatibility.\n" "\n" ".. _CBOR: https://cbor.io/\n" ); diff --git a/source/decoder.h b/source/decoder.h index a2f4bf1..3efff8b 100644 --- a/source/decoder.h +++ b/source/decoder.h @@ -3,10 +3,17 @@ #include #include -// Default readahead buffer size for streaming reads -#define CBOR2_DEFAULT_READ_SIZE 4096 +// Default readahead buffer size for streaming reads. +// Set to 1 for backwards compatibility (no buffering). +#define CBOR2_DEFAULT_READ_SIZE 1 -typedef struct { +// Forward declaration for function pointer typedef +struct CBORDecoderObject_; + +// Function pointer type for read dispatch (eliminates runtime check) +typedef int (*fp_read_fn)(struct CBORDecoderObject_ *, char *, Py_ssize_t); + +typedef struct CBORDecoderObject_ { PyObject_HEAD PyObject *read; // cached read() method of fp PyObject *tag_hook; @@ -23,6 +30,9 @@ typedef struct { Py_ssize_t readahead_size; // size of allocated buffer Py_ssize_t read_pos; // current position in buffer Py_ssize_t read_len; // valid bytes in buffer + + // Read dispatch - points to unbuffered or buffered implementation + fp_read_fn fp_read; } CBORDecoderObject; extern PyTypeObject CBORDecoderType; diff --git a/tests/test_decoder.py b/tests/test_decoder.py index 9bf5a10..c5d1a9c 100644 --- a/tests/test_decoder.py +++ b/tests/test_decoder.py @@ -123,6 +123,21 @@ def test_load(impl): assert impl.load(fp=stream) == 1 +def test_stream_position_after_decode(impl): + """Test that stream position is exactly at end of decoded CBOR value.""" + # CBOR: integer 1 (1 byte: 0x01) followed by extra data + cbor_data = b"\x01" + extra_data = b"extra" + with BytesIO(cbor_data + extra_data) as stream: + decoder = impl.CBORDecoder(stream) + result = decoder.decode() + assert result == 1 + # Stream position should be exactly at end of CBOR data + assert stream.tell() == len(cbor_data) + # Should be 
able to read the extra data + assert stream.read() == extra_data + + @pytest.mark.parametrize( "payload, expected", [ -- 2.50.1