From fb4ee1612a8a1ac0dbd8cf2f2f6f931a4e06d824 Mon Sep 17 00:00:00 2001 From: Andreas Eriksen Date: Mon, 29 Dec 2025 14:01:52 +0100 Subject: [PATCH] Added a read-ahead buffer to the C decoder (#268) CVE: CVE-2026-26209 Upstream-Status: Backport [https://github.com/agronholm/cbor2/commit/fb4ee1612a8a1ac0dbd8cf2f2f6f931a4e06d824] Signed-off-by: Hitendra Prajapati --- source/decoder.c | 225 +++++++++++++++++++++++++++++++++--------- source/decoder.h | 9 ++ tests/test_decoder.py | 85 +++++++++++++++- 3 files changed, 274 insertions(+), 45 deletions(-) diff --git a/source/decoder.c b/source/decoder.c index 4f7ee5d..9cd1596 100644 --- a/source/decoder.c +++ b/source/decoder.c @@ -42,6 +42,7 @@ enum DecodeOption { typedef uint8_t DecodeOptions; static int _CBORDecoder_set_fp(CBORDecoderObject *, PyObject *, void *); +static int _CBORDecoder_set_fp_with_read_size(CBORDecoderObject *, PyObject *, Py_ssize_t); static int _CBORDecoder_set_tag_hook(CBORDecoderObject *, PyObject *, void *); static int _CBORDecoder_set_object_hook(CBORDecoderObject *, PyObject *, void *); static int _CBORDecoder_set_str_errors(CBORDecoderObject *, PyObject *, void *); @@ -101,6 +102,13 @@ CBORDecoder_clear(CBORDecoderObject *self) Py_CLEAR(self->shareables); Py_CLEAR(self->stringref_namespace); Py_CLEAR(self->str_errors); + if (self->readahead) { + PyMem_Free(self->readahead); + self->readahead = NULL; + self->readahead_size = 0; + } + self->read_pos = 0; + self->read_len = 0; return 0; } @@ -143,6 +151,10 @@ CBORDecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->immutable = false; self->shared_index = -1; self->decode_depth = 0; + self->readahead = NULL; + self->readahead_size = 0; + self->read_pos = 0; + self->read_len = 0; } return (PyObject *) self; error: @@ -152,21 +164,27 @@ error: // CBORDecoder.__init__(self, fp=None, tag_hook=None, object_hook=None, -// str_errors='strict') +// str_errors='strict', read_size=4096) int CBORDecoder_init(CBORDecoderObject *self, PyObject *args, PyObject *kwargs) { static char *keywords[] = { - "fp", "tag_hook", "object_hook", "str_errors", NULL + "fp", "tag_hook", "object_hook", "str_errors", "read_size", NULL }; PyObject *fp = NULL, *tag_hook = NULL, *object_hook = NULL, *str_errors = NULL; + Py_ssize_t read_size = CBOR2_DEFAULT_READ_SIZE; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOO", keywords, - &fp, &tag_hook, &object_hook, &str_errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOn", keywords, + &fp, &tag_hook, &object_hook, &str_errors, &read_size)) return -1; - if (_CBORDecoder_set_fp(self, fp, NULL) == -1) + if (read_size < 1) { + PyErr_SetString(PyExc_ValueError, "read_size must be at least 1"); + return -1; + } + + if (_CBORDecoder_set_fp_with_read_size(self, fp, read_size) == -1) return -1; if (tag_hook && _CBORDecoder_set_tag_hook(self, tag_hook, NULL) == -1) return -1; @@ -197,11 +215,12 @@ _CBORDecoder_get_fp(CBORDecoderObject *self, void *closure) } -// CBORDecoder._set_fp(self, value) +// Internal: set fp with configurable read size static int -_CBORDecoder_set_fp(CBORDecoderObject *self, PyObject *value, void *closure) +_CBORDecoder_set_fp_with_read_size(CBORDecoderObject *self, PyObject *value, Py_ssize_t read_size) { PyObject *tmp, *read; + char *new_buffer = NULL; if (!value) { PyErr_SetString(PyExc_AttributeError, "cannot delete fp attribute"); @@ -214,13 +233,43 @@ _CBORDecoder_set_fp(CBORDecoderObject *self, PyObject *value, void *closure) return -1; } + if (self->readahead == NULL || self->readahead_size != read_size) { + new_buffer = (char *)PyMem_Malloc(read_size); + if (!new_buffer) { + Py_DECREF(read); + PyErr_NoMemory(); + return -1; + } + } + // See notes in encoder.c / _CBOREncoder_set_fp tmp = self->read; self->read = read; Py_DECREF(tmp); + + self->read_pos = 0; + self->read_len = 0; + + // Replace buffer (size changed or was NULL) + if (new_buffer) { + PyMem_Free(self->readahead); + self->readahead = new_buffer; + self->readahead_size = read_size; + } + return 0; } +// CBORDecoder._set_fp(self, value) - property setter uses default read size +static int +_CBORDecoder_set_fp(CBORDecoderObject *self, PyObject *value, void *closure) +{ + // Use existing readahead_size if already allocated, otherwise use default + Py_ssize_t read_size = (self->readahead_size > 0) ? + self->readahead_size : CBOR2_DEFAULT_READ_SIZE; + return _CBORDecoder_set_fp_with_read_size(self, value, read_size); +} + // CBORDecoder._get_tag_hook(self) static PyObject * @@ -376,45 +425,93 @@ raise_from(PyObject *new_exc_type, const char *message) { } } -static PyObject * -fp_read_object(CBORDecoderObject *self, const Py_ssize_t size) +// Read directly into caller's buffer (bypassing readahead buffer) +static Py_ssize_t +fp_read_bytes(CBORDecoderObject *self, char *buf, Py_ssize_t size) { - PyObject *ret = NULL; - PyObject *obj, *size_obj; - size_obj = PyLong_FromSsize_t(size); - if (size_obj) { - obj = PyObject_CallFunctionObjArgs(self->read, size_obj, NULL); - Py_DECREF(size_obj); - if (obj) { - assert(PyBytes_CheckExact(obj)); - if (PyBytes_GET_SIZE(obj) == (Py_ssize_t) size) { - ret = obj; + PyObject *size_obj = PyLong_FromSsize_t(size); + if (!size_obj) + return -1; + + PyObject *obj = PyObject_CallFunctionObjArgs(self->read, size_obj, NULL); + Py_DECREF(size_obj); + if (!obj) + return -1; + + assert(PyBytes_CheckExact(obj)); + Py_ssize_t bytes_read = PyBytes_GET_SIZE(obj); + if (bytes_read > 0) + memcpy(buf, PyBytes_AS_STRING(obj), bytes_read); + + Py_DECREF(obj); + return bytes_read; +} + +// Read into caller's buffer using the readahead buffer +static int +fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size) +{ + Py_ssize_t available, to_copy, remaining, total_copied; + + remaining = size; + total_copied = 0; + + while (remaining > 0) { + available = self->read_len - self->read_pos; + + if (available > 0) { + // Copy from buffer + to_copy = (available < remaining) ? available : remaining; + memcpy(buf + total_copied, self->readahead + self->read_pos, to_copy); + self->read_pos += to_copy; + total_copied += to_copy; + remaining -= to_copy; + } else { + Py_ssize_t bytes_read; + + if (remaining >= self->readahead_size) { + // Large remaining: read directly into destination, bypass buffer + bytes_read = fp_read_bytes(self, buf + total_copied, remaining); + if (bytes_read > 0) { + total_copied += bytes_read; + remaining -= bytes_read; + } } else { - PyErr_Format( - _CBOR2_CBORDecodeEOF, - "premature end of stream (expected to read %zd bytes, " - "got %zd instead)", size, PyBytes_GET_SIZE(obj)); - Py_DECREF(obj); + // Small remaining: refill buffer + self->read_pos = 0; + self->read_len = 0; + bytes_read = fp_read_bytes(self, self->readahead, self->readahead_size); + if (bytes_read > 0) + self->read_len = bytes_read; + } + + if (bytes_read <= 0) { + if (bytes_read == 0) + PyErr_Format( + _CBOR2_CBORDecodeEOF, + "premature end of stream (expected to read %zd bytes, " + "got %zd instead)", size, total_copied); + return -1; } } } - return ret; -} + return 0; +} -static int -fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size) +// Read and return as PyBytes object +static PyObject * +fp_read_object(CBORDecoderObject *self, const Py_ssize_t size) { - int ret = -1; - PyObject *obj = fp_read_object(self, size); - if (obj) { - char *data = PyBytes_AS_STRING(obj); - if (data) { - memcpy(buf, data, size); - ret = 0; - } - Py_DECREF(obj); + PyObject *ret = PyBytes_FromStringAndSize(NULL, size); + if (!ret) + return NULL; + + if (fp_read(self, PyBytes_AS_STRING(ret), size) == -1) { + Py_DECREF(ret); + return NULL; } + return ret; } @@ -2091,23 +2188,55 @@ static PyObject * CBORDecoder_decode_from_bytes(CBORDecoderObject *self, PyObject *data) { PyObject *save_read, *buf, *ret = NULL; + bool is_nested = (self->decode_depth > 0); + Py_ssize_t save_read_pos = 0, save_read_len = 0; + char *save_buffer = NULL; if (!_CBOR2_BytesIO && _CBOR2_init_BytesIO() == -1) return NULL; + buf = PyObject_CallFunctionObjArgs(_CBOR2_BytesIO, data, NULL); + if (!buf) + return NULL; + self->decode_depth++; save_read = self->read; - buf = PyObject_CallFunctionObjArgs(_CBOR2_BytesIO, data, NULL); - if (buf) { - self->read = PyObject_GetAttr(buf, _CBOR2_str_read); - if (self->read) { - ret = decode(self, DECODE_NORMAL); - Py_DECREF(self->read); + Py_INCREF(save_read); // Keep alive while we use a different read method + save_read_pos = self->read_pos; + save_read_len = self->read_len; + + // Save buffer pointer if nested + if (is_nested) { + save_buffer = self->readahead; + self->readahead = NULL; // Prevent setter from freeing saved buffer + } + + // Set up BytesIO decoder - setter handles buffer allocation + if (_CBORDecoder_set_fp_with_read_size(self, buf, self->readahead_size) == -1) { + if (is_nested) { + PyMem_Free(self->readahead); + self->readahead = save_buffer; } + Py_DECREF(save_read); Py_DECREF(buf); + self->decode_depth--; + return NULL; } - self->read = save_read; + + ret = decode(self, DECODE_NORMAL); + + Py_XDECREF(self->read); // Decrement BytesIO read method + self->read = save_read; // Restore saved read (already has correct refcount) + Py_DECREF(buf); self->decode_depth--; + + if (is_nested) { + PyMem_Free(self->readahead); + self->readahead = save_buffer; + } + self->read_pos = save_read_pos; + self->read_len = save_read_len; + assert(self->decode_depth >= 0); if (self->decode_depth == 0) { clear_shareable_state(self); @@ -2257,6 +2386,14 @@ PyDoc_STRVAR(CBORDecoder__doc__, " dictionary. This callback is invoked for each deserialized\n" " :class:`dict` object. The return value is substituted for the dict\n" " in the deserialized output.\n" +":param read_size:\n" +" the size of the read buffer (default 4096). The decoder reads from\n" +" the stream in chunks of this size for performance. This means the\n" +" stream position may advance beyond the bytes actually decoded. For\n" +" large values (bytestrings, text strings), reads may be larger than\n" +" ``read_size``. Code that needs to read from the stream after\n" +" decoding should use :meth:`decode_from_bytes` instead, or set\n" +" ``read_size=1`` to disable buffering (at a performance cost).\n" "\n" ".. _CBOR: https://cbor.io/\n" ); diff --git a/source/decoder.h b/source/decoder.h index a2f1bcb..a2f4bf1 100644 --- a/source/decoder.h +++ b/source/decoder.h @@ -3,6 +3,9 @@ #include #include +// Default readahead buffer size for streaming reads +#define CBOR2_DEFAULT_READ_SIZE 4096 + typedef struct { PyObject_HEAD PyObject *read; // cached read() method of fp @@ -14,6 +17,12 @@ typedef struct { bool immutable; Py_ssize_t shared_index; Py_ssize_t decode_depth; + + // Readahead buffer for streaming + char *readahead; // allocated buffer + Py_ssize_t readahead_size; // size of allocated buffer + Py_ssize_t read_pos; // current position in buffer + Py_ssize_t read_len; // valid bytes in buffer } CBORDecoderObject; extern PyTypeObject CBORDecoderType; diff --git a/tests/test_decoder.py b/tests/test_decoder.py index 3b4455a..9bf5a10 100644 --- a/tests/test_decoder.py +++ b/tests/test_decoder.py @@ -1043,4 +1043,87 @@ class TestDecoderReuse: result = impl.loads(data) assert result == ["hello", "hello"] - assert result[0] is result[1] # Same object reference \ No newline at end of file + assert result[0] is result[1] # Same object reference + + +def test_decode_from_bytes_in_hook_preserves_buffer(impl): + """Test that calling decode_from_bytes from a hook preserves stream buffer state. + This is a documented use case from docs/customizing.rst where hooks decode + embedded CBOR data. Before the fix, the stream's readahead buffer would be + corrupted, causing subsequent reads to fail or return wrong data. + """ + + def tag_hook(decoder, tag): + if tag.tag == 999: + # Decode embedded CBOR (documented pattern) + return decoder.decode_from_bytes(tag.value) + return tag + + # Test data: array with [tag(999, embedded_cbor), "after_hook", "final"] + # embedded_cbor encodes: [1, 2, 3] + data = unhexlify( + "83" # array(3) + "d903e7" # tag(999) + "44" # bytes(4) + "83010203" # embedded: array [1, 2, 3] + "6a" # text(10) + "61667465725f686f6f6b" # "after_hook" + "65" # text(5) + "66696e616c" # "final" + ) + + # Decode from stream (not bytes) to use readahead buffer + stream = BytesIO(data) + decoder = impl.CBORDecoder(stream, tag_hook=tag_hook) + result = decoder.decode() + + # Verify all values decoded correctly + assert result == [[1, 2, 3], "after_hook", "final"] + + # First element should be the decoded embedded CBOR + assert result[0] == [1, 2, 3] + # Second element should be "after_hook" (not corrupted) + assert result[1] == "after_hook" + # Third element should be "final" + assert result[2] == "final" + + +def test_decode_from_bytes_deeply_nested_in_hook(impl): + """Test deeply nested decode_from_bytes calls preserve buffer state. + This tests tag(999, tag(888, tag(777, [1,2,3]))) where each tag value + is embedded CBOR that triggers the hook recursively. + Before the fix, even a single level would corrupt the buffer. With multiple + levels, the buffer would be completely corrupted, mixing data from different + BytesIO objects and the original stream. + """ + + def tag_hook(decoder, tag): + if tag.tag in [999, 888, 777]: + # Recursively decode embedded CBOR + return decoder.decode_from_bytes(tag.value) + return tag + + # Test data: [tag(999, tag(888, tag(777, [1,2,3]))), "after", "final"] + # Each tag contains embedded CBOR + data = unhexlify( + "83" # array(3) + "d903e7" # tag(999) + "4c" # bytes(12) + "d9037848d903094483010203" # embedded: tag(888, tag(777, [1,2,3])) + "65" # text(5) + "6166746572" # "after" + "65" # text(5) + "66696e616c" # "final" + ) + + # Decode from stream to use readahead buffer + stream = BytesIO(data) + decoder = impl.CBORDecoder(stream, tag_hook=tag_hook) + result = decoder.decode() + + # With the fix: all three levels of nesting work correctly + # Without the fix: buffer corruption at each level, test fails + assert result == [[1, 2, 3], "after", "final"] + assert result[0] == [1, 2, 3] + assert result[1] == "after" + assert result[2] == "final" -- 2.50.1