From 1a2ea3d730b9aa3f7295e633dec8b87d0b59907f Mon Sep 17 00:00:00 2001 From: ekeith <55766816+evanmkeith@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:34:21 -0700 Subject: [PATCH 1/2] fix(invoke): encode request body as UTF-8 bytes to prevent Latin-1 corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `invoke()` passed the JSON body as a Python `str` to `requests.post(data=...)`, which leaves a `str` body un-encoded so that the underlying `http.client` layer encodes it as Latin-1 (ISO-8859-1) by default. Characters outside Latin-1 in the payload (em dashes, smart quotes, etc.) raised a `UnicodeEncodeError`, while non-ASCII characters inside Latin-1 (U+0080–U+00FF, e.g. `é`) were silently sent as single Latin-1 bytes instead of UTF-8 and so were corrupted before the request reached the API. - Encode `bt_dumps()` output to UTF-8 bytes before passing to `data=` - Add `Content-Type: application/json` header (was missing; only `Accept` was set) `bt_dumps` is kept — it handles Pydantic models, dataclasses, and NaN/Inf values that stdlib `json` cannot serialize. Other SDK paths (`logger.py`) already use `.encode("utf-8")` correctly; this brings `invoke()` in line. 
## Test plan - [ ] Unit test: assert `data=` arg to `requests.post` is `bytes`, contains correct UTF-8 encoding of em dash (`\xe2\x80\x94`), and `Content-Type` header is set - [ ] Manually run `invoke()` with Unicode input (`"result \u2014 excellent"`) and confirm no `UnicodeEncodeError` and payload reaches the API intact Fixes BT-4620 --- py/src/braintrust/functions/invoke.py | 7 +++-- py/src/braintrust/functions/test_invoke.py | 36 +++++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py index 85e471fc..7868356f 100644 --- a/py/src/braintrust/functions/invoke.py +++ b/py/src/braintrust/functions/invoke.py @@ -198,14 +198,17 @@ def invoke( if strict is not None: request["strict"] = strict - headers = {"Accept": "text/event-stream" if stream else "application/json"} + headers = { + "Accept": "text/event-stream" if stream else "application/json", + "Content-Type": "application/json", + } if project_id is not None: headers["x-bt-project-id"] = project_id if org_name is not None: headers["x-bt-org-name"] = org_name request_json = bt_dumps(request) - resp = proxy_conn().post("function/invoke", data=request_json, headers=headers, stream=stream) + resp = proxy_conn().post("function/invoke", data=request_json.encode("utf-8"), headers=headers, stream=stream) if resp.status_code == 500: raise BraintrustInvokeError(resp.text) diff --git a/py/src/braintrust/functions/test_invoke.py b/py/src/braintrust/functions/test_invoke.py index 264217e2..4b004949 100644 --- a/py/src/braintrust/functions/test_invoke.py +++ b/py/src/braintrust/functions/test_invoke.py @@ -83,7 +83,9 @@ def _invoke_with_messages(messages): kwargs = mock_conn.post.call_args.kwargs assert "data" in kwargs, "invoke must use data= (bt_dumps) not json= (json.dumps) (see issue 38)" assert "json" not in kwargs - return json.loads(kwargs["data"]) + data = kwargs["data"] + assert isinstance(data, bytes), "body 
must be bytes so requests does not re-encode as Latin-1" + return json.loads(data.decode("utf-8")) def test_invoke_serializes_openai_messages(): @@ -114,3 +116,35 @@ def test_invoke_serializes_google_messages(): msg = google_types.Content(role="model", parts=[google_types.Part(text="The answer is X.")]) parsed = _invoke_with_messages([msg]) assert isinstance(parsed, dict) and parsed + + +def test_invoke_encodes_body_as_utf8_bytes(): + """Regression test for BT-4620: non-Latin-1 Unicode must not be corrupted. + + When invoke() serializes the request body via bt_dumps() and passes it to + requests.post(data=...), the body must be UTF-8 encoded bytes — not a str. + Passing a str causes requests to re-encode with Latin-1, which raises + UnicodeEncodeError (or silently corrupts data) for characters outside U+007F. + """ + em_dash = "\u2014" # — (U+2014) is outside Latin-1; triggers the bug when body is str + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {} + mock_conn = MagicMock() + mock_conn.post.return_value = mock_resp + + with ( + patch("braintrust.functions.invoke.login"), + patch("braintrust.functions.invoke.proxy_conn", return_value=mock_conn), + ): + invoke(project_name="test-project", slug="test-fn", input={"text": f"result {em_dash} excellent"}) + + kwargs = mock_conn.post.call_args.kwargs + data = kwargs["data"] + + assert isinstance(data, bytes), "body must be bytes so requests does not Latin-1 encode it" + # Round-trip through UTF-8 decode + JSON parse to confirm the em dash survives intact. + # bt_dumps may use JSON \uXXXX escapes (stdlib) or raw UTF-8 bytes (orjson) — both are valid. 
+ parsed = json.loads(data.decode("utf-8")) + assert parsed["input"]["text"] == f"result {em_dash} excellent", "em dash must survive serialization" + assert kwargs.get("headers", {}).get("Content-Type") == "application/json" From 416407cd8e6fc4def6e511acb6972521204cab19 Mon Sep 17 00:00:00 2001 From: ekeith <55766816+evanmkeith@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:05:49 -0700 Subject: [PATCH 2/2] add vcr test --- ...est_invoke_encodes_body_as_utf8_bytes.yaml | 29 ++++++++++++ py/src/braintrust/functions/test_invoke.py | 44 +++++++++---------- 2 files changed, 50 insertions(+), 23 deletions(-) create mode 100644 py/src/braintrust/functions/cassettes/test_invoke_encodes_body_as_utf8_bytes.yaml diff --git a/py/src/braintrust/functions/cassettes/test_invoke_encodes_body_as_utf8_bytes.yaml b/py/src/braintrust/functions/cassettes/test_invoke_encodes_body_as_utf8_bytes.yaml new file mode 100644 index 00000000..64123a0a --- /dev/null +++ b/py/src/braintrust/functions/cassettes/test_invoke_encodes_body_as_utf8_bytes.yaml @@ -0,0 +1,29 @@ +interactions: +- request: + body: '{"api_version": 1, "input": {"text": "result \u2014 excellent"}, "metadata": + null, "parent": "", "project_name": "test-project", "slug": "test-fn", "stream": + false, "tags": null}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Type: + - application/json + User-Agent: + - python-requests/2.32.5 + method: POST + uri: https://proxy.braintrust.ai/function/invoke + response: + body: + string: '{"output": "result \u2014 excellent"}' + headers: + Content-Type: + - application/json; charset=utf-8 + status: + code: 200 + message: OK +version: 1 + diff --git a/py/src/braintrust/functions/test_invoke.py b/py/src/braintrust/functions/test_invoke.py index 4b004949..650f5c81 100644 --- a/py/src/braintrust/functions/test_invoke.py +++ b/py/src/braintrust/functions/test_invoke.py @@ -5,7 +5,7 @@ import pytest from 
braintrust.functions.invoke import init_function, invoke -from braintrust.logger import _internal_get_global_state, _internal_reset_global_state +from braintrust.logger import TEST_API_KEY, _internal_get_global_state, _internal_reset_global_state class TestInitFunction: @@ -118,33 +118,31 @@ def test_invoke_serializes_google_messages(): assert isinstance(parsed, dict) and parsed -def test_invoke_encodes_body_as_utf8_bytes(): +@pytest.mark.vcr +def test_invoke_encodes_body_as_utf8_bytes(monkeypatch): """Regression test for BT-4620: non-Latin-1 Unicode must not be corrupted. When invoke() serializes the request body via bt_dumps() and passes it to requests.post(data=...), the body must be UTF-8 encoded bytes — not a str. Passing a str causes requests to re-encode with Latin-1, which raises UnicodeEncodeError (or silently corrupts data) for characters outside U+007F. - """ - em_dash = "\u2014" # — (U+2014) is outside Latin-1; triggers the bug when body is str - mock_resp = MagicMock() - mock_resp.status_code = 200 - mock_resp.json.return_value = {} - mock_conn = MagicMock() - mock_conn.post.return_value = mock_resp - with ( - patch("braintrust.functions.invoke.login"), - patch("braintrust.functions.invoke.proxy_conn", return_value=mock_conn), - ): - invoke(project_name="test-project", slug="test-fn", input={"text": f"result {em_dash} excellent"}) - - kwargs = mock_conn.post.call_args.kwargs - data = kwargs["data"] + Uses TEST_API_KEY to skip the HTTP login entirely, so the cassette only needs + to capture the single POST to /function/invoke. BRAINTRUST_PROXY_URL is + cleared so the proxy URL is always the predictable test stub value + (https://proxy.braintrust.ai) regardless of the local environment. + """ + # Prevent local env overrides from changing the proxy URL used in the cassette. 
+ monkeypatch.delenv("BRAINTRUST_PROXY_URL", raising=False) + monkeypatch.delenv("BRAINTRUST_API_URL", raising=False) + _internal_reset_global_state() - assert isinstance(data, bytes), "body must be bytes so requests does not Latin-1 encode it" - # Round-trip through UTF-8 decode + JSON parse to confirm the em dash survives intact. - # bt_dumps may use JSON \uXXXX escapes (stdlib) or raw UTF-8 bytes (orjson) — both are valid. - parsed = json.loads(data.decode("utf-8")) - assert parsed["input"]["text"] == f"result {em_dash} excellent", "em dash must survive serialization" - assert kwargs.get("headers", {}).get("Content-Type") == "application/json" + em_dash = "\u2014" # — (U+2014) is outside Latin-1; triggers the bug when body is str + result = invoke( + project_name="test-project", + slug="test-fn", + input={"text": f"result {em_dash} excellent"}, + parent="", # skip span-parent lookup; no extra HTTP call needed + api_key=TEST_API_KEY, + ) + assert result["output"] == f"result {em_dash} excellent"